1 //===-- SIInstrInfo.cpp - SI Instruction Information  ---------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// \brief SI Implementation of TargetInstrInfo.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 #include "SIInstrInfo.h"
17 #include "AMDGPUTargetMachine.h"
18 #include "GCNHazardRecognizer.h"
19 #include "SIDefines.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "llvm/CodeGen/MachineFrameInfo.h"
22 #include "llvm/CodeGen/MachineInstrBuilder.h"
23 #include "llvm/CodeGen/MachineRegisterInfo.h"
24 #include "llvm/CodeGen/ScheduleDAG.h"
25 #include "llvm/IR/Function.h"
26 #include "llvm/CodeGen/RegisterScavenging.h"
27 #include "llvm/MC/MCInstrDesc.h"
28 #include "llvm/Support/Debug.h"
29 
30 using namespace llvm;
31 
/// Constructs the SI instruction info for subtarget \p st. The subtarget is
/// forwarded to the AMDGPUInstrInfo base class, and the \c RI member (used
/// throughout this file for register queries) is value-initialized.
SIInstrInfo::SIInstrInfo(const AMDGPUSubtarget &st)
    : AMDGPUInstrInfo(st), RI() {}
34 
35 //===----------------------------------------------------------------------===//
36 // TargetInstrInfo callbacks
37 //===----------------------------------------------------------------------===//
38 
39 static unsigned getNumOperandsNoGlue(SDNode *Node) {
40   unsigned N = Node->getNumOperands();
41   while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
42     --N;
43   return N;
44 }
45 
46 static SDValue findChainOperand(SDNode *Load) {
47   SDValue LastOp = Load->getOperand(getNumOperandsNoGlue(Load) - 1);
48   assert(LastOp.getValueType() == MVT::Other && "Chain missing from load node");
49   return LastOp;
50 }
51 
52 /// \brief Returns true if both nodes have the same value for the given
53 ///        operand \p Op, or if both nodes do not have this operand.
54 static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
55   unsigned Opc0 = N0->getMachineOpcode();
56   unsigned Opc1 = N1->getMachineOpcode();
57 
58   int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
59   int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
60 
61   if (Op0Idx == -1 && Op1Idx == -1)
62     return true;
63 
64 
65   if ((Op0Idx == -1 && Op1Idx != -1) ||
66       (Op1Idx == -1 && Op0Idx != -1))
67     return false;
68 
69   // getNamedOperandIdx returns the index for the MachineInstr's operands,
70   // which includes the result as the first operand. We are indexing into the
71   // MachineSDNode's operands, so we need to skip the result operand to get
72   // the real index.
73   --Op0Idx;
74   --Op1Idx;
75 
76   return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
77 }
78 
79 bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI,
80                                                     AliasAnalysis *AA) const {
81   // TODO: The generic check fails for VALU instructions that should be
82   // rematerializable due to implicit reads of exec. We really want all of the
83   // generic logic for this except for this.
84   switch (MI->getOpcode()) {
85   case AMDGPU::V_MOV_B32_e32:
86   case AMDGPU::V_MOV_B32_e64:
87   case AMDGPU::V_MOV_B64_PSEUDO:
88     return true;
89   default:
90     return false;
91   }
92 }
93 
/// Determines whether two selected load nodes read from the same base
/// pointer and, if so, reports their constant offsets.
///
/// Three families are handled: DS with DS, SMRD with SMRD, and any mix of
/// MUBUF/MTBUF. In each case the base operands and the chain operand must
/// match and both offsets must be constants.
///
/// \param Load0 first load node; must be a machine node to succeed.
/// \param Load1 second load node; must be a machine node to succeed.
/// \param [out] Offset0 zero-extended constant offset of \p Load0.
/// \param [out] Offset1 zero-extended constant offset of \p Load1.
/// \returns true if both offsets were written, false otherwise (in which
///          case the offsets are left untouched).
bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
                                          int64_t &Offset0,
                                          int64_t &Offset1) const {
  // Only already-selected machine nodes can be analyzed.
  if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
    return false;

  unsigned Opc0 = Load0->getMachineOpcode();
  unsigned Opc1 = Load1->getMachineOpcode();

  // Make sure both are actually loads.
  if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
    return false;

  if (isDS(Opc0) && isDS(Opc1)) {

    // FIXME: Handle the case where the two nodes have a different number of
    // (non-glue) operands.
    if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
      return false;

    // Check base reg.
    if (Load0->getOperand(1) != Load1->getOperand(1))
      return false;

    // Check chain.
    if (findChainOperand(Load0) != findChainOperand(Load1))
      return false;

    // Skip read2 / write2 variants for simplicity.
    // TODO: We should report true if the used offsets are adjacent (excluded
    // st64 versions).
    if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::data1) != -1 ||
        AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::data1) != -1)
      return false;

    // Operand 2 is read as the offset; cast<> asserts that it is a constant.
    Offset0 = cast<ConstantSDNode>(Load0->getOperand(2))->getZExtValue();
    Offset1 = cast<ConstantSDNode>(Load1->getOperand(2))->getZExtValue();
    return true;
  }

  if (isSMRD(Opc0) && isSMRD(Opc1)) {
    assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1));

    // Check base reg.
    if (Load0->getOperand(0) != Load1->getOperand(0))
      return false;

    // Operand 1 is read as the offset; give up unless both are constants.
    const ConstantSDNode *Load0Offset =
        dyn_cast<ConstantSDNode>(Load0->getOperand(1));
    const ConstantSDNode *Load1Offset =
        dyn_cast<ConstantSDNode>(Load1->getOperand(1));

    if (!Load0Offset || !Load1Offset)
      return false;

    // Check chain.
    if (findChainOperand(Load0) != findChainOperand(Load1))
      return false;

    Offset0 = Load0Offset->getZExtValue();
    Offset1 = Load1Offset->getZExtValue();
    return true;
  }

  // MUBUF and MTBUF can access the same addresses.
  if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {

    // MUBUF and MTBUF have vaddr at different indices, so the operands are
    // compared by name rather than by position.
    if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
        findChainOperand(Load0) != findChainOperand(Load1) ||
        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
      return false;

    int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
    int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);

    if (OffIdx0 == -1 || OffIdx1 == -1)
      return false;

    // getNamedOperandIdx returns the index for MachineInstrs.  Since they
    // include the output in the operand list, but SDNodes don't, we need to
    // subtract one from the index.
    --OffIdx0;
    --OffIdx1;

    SDValue Off0 = Load0->getOperand(OffIdx0);
    SDValue Off1 = Load1->getOperand(OffIdx1);

    // The offset might be a FrameIndexSDNode.
    if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
      return false;

    Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue();
    Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue();
    return true;
  }

  return false;
}
193 
194 static bool isStride64(unsigned Opc) {
195   switch (Opc) {
196   case AMDGPU::DS_READ2ST64_B32:
197   case AMDGPU::DS_READ2ST64_B64:
198   case AMDGPU::DS_WRITE2ST64_B32:
199   case AMDGPU::DS_WRITE2ST64_B64:
200     return true;
201   default:
202     return false;
203   }
204 }
205 
/// Decomposes the address of the memory instruction \p LdSt into a base
/// register and a constant offset.
///
/// DS, MUBUF/MTBUF, SMRD and FLAT instructions (as classified by isDS,
/// isMUBUF, isMTBUF, isSMRD and isFLAT) are handled; everything else is
/// rejected.
///
/// \param LdSt the memory instruction to inspect.
/// \param [out] BaseReg receives the address/base register on success.
/// \param [out] Offset receives the constant offset on success (always 0
///              for FLAT).
/// \param TRI not read by this implementation.
/// \returns true if \p BaseReg and \p Offset were written, false otherwise.
bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
                                        int64_t &Offset,
                                        const TargetRegisterInfo *TRI) const {
  unsigned Opc = LdSt->getOpcode();

  if (isDS(*LdSt)) {
    const MachineOperand *OffsetImm = getNamedOperand(*LdSt,
                                                      AMDGPU::OpName::offset);
    if (OffsetImm) {
      // Normal, single offset LDS instruction.
      const MachineOperand *AddrReg = getNamedOperand(*LdSt,
                                                      AMDGPU::OpName::addr);

      BaseReg = AddrReg->getReg();
      Offset = OffsetImm->getImm();
      return true;
    }

    // The 2 offset instructions use offset0 and offset1 instead. We can treat
    // these as a load with a single offset if the 2 offsets are consecutive. We
    // will use this for some partially aligned loads.
    // NOTE(review): both operands are dereferenced without a null check; this
    // assumes every DS opcode lacking 'offset' has offset0/offset1 - confirm.
    const MachineOperand *Offset0Imm = getNamedOperand(*LdSt,
                                                       AMDGPU::OpName::offset0);
    const MachineOperand *Offset1Imm = getNamedOperand(*LdSt,
                                                       AMDGPU::OpName::offset1);

    // The immediates are truncated to 8 bits here.
    uint8_t Offset0 = Offset0Imm->getImm();
    uint8_t Offset1 = Offset1Imm->getImm();

    if (Offset1 > Offset0 && Offset1 - Offset0 == 1) {
      // Each of these offsets is in element sized units, so we need to convert
      // to bytes of the individual reads.

      unsigned EltSize;
      if (LdSt->mayLoad())
        // Operand 0's register class covers both elements, hence the halving.
        EltSize = getOpRegClass(*LdSt, 0)->getSize() / 2;
      else {
        assert(LdSt->mayStore());
        int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
        EltSize = getOpRegClass(*LdSt, Data0Idx)->getSize();
      }

      // The ST64 variants scale the element size by 64.
      if (isStride64(Opc))
        EltSize *= 64;

      const MachineOperand *AddrReg = getNamedOperand(*LdSt,
                                                      AMDGPU::OpName::addr);
      BaseReg = AddrReg->getReg();
      Offset = EltSize * Offset0;
      return true;
    }

    return false;
  }

  if (isMUBUF(*LdSt) || isMTBUF(*LdSt)) {
    // Bail out for any opcode that has an soffset operand at all; the
    // operand's value is not examined.
    if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset) != -1)
      return false;

    const MachineOperand *AddrReg = getNamedOperand(*LdSt,
                                                    AMDGPU::OpName::vaddr);
    if (!AddrReg)
      return false;

    // NOTE(review): OffsetImm is dereferenced without a null check; assumes
    // these opcodes always carry an 'offset' operand - confirm.
    const MachineOperand *OffsetImm = getNamedOperand(*LdSt,
                                                      AMDGPU::OpName::offset);
    BaseReg = AddrReg->getReg();
    Offset = OffsetImm->getImm();
    return true;
  }

  if (isSMRD(*LdSt)) {
    // SMRD forms without an 'offset' operand are not handled.
    const MachineOperand *OffsetImm = getNamedOperand(*LdSt,
                                                      AMDGPU::OpName::offset);
    if (!OffsetImm)
      return false;

    const MachineOperand *SBaseReg = getNamedOperand(*LdSt,
                                                     AMDGPU::OpName::sbase);
    BaseReg = SBaseReg->getReg();
    Offset = OffsetImm->getImm();
    return true;
  }

  if (isFLAT(*LdSt)) {
    // FLAT is reported with its 'addr' register and a zero offset.
    const MachineOperand *AddrReg = getNamedOperand(*LdSt, AMDGPU::OpName::addr);
    BaseReg = AddrReg->getReg();
    Offset = 0;
    return true;
  }

  return false;
}
299 
300 bool SIInstrInfo::shouldClusterMemOps(MachineInstr *FirstLdSt,
301                                       MachineInstr *SecondLdSt,
302                                       unsigned NumLoads) const {
303 	const MachineOperand *FirstDst = nullptr;
304 	const MachineOperand *SecondDst = nullptr;
305 
306   if (isDS(*FirstLdSt) && isDS(*SecondLdSt)) {
307     FirstDst = getNamedOperand(*FirstLdSt, AMDGPU::OpName::vdst);
308     SecondDst = getNamedOperand(*SecondLdSt, AMDGPU::OpName::vdst);
309   }
310 
311   if (isSMRD(*FirstLdSt) && isSMRD(*SecondLdSt)) {
312     FirstDst = getNamedOperand(*FirstLdSt, AMDGPU::OpName::sdst);
313     SecondDst = getNamedOperand(*SecondLdSt, AMDGPU::OpName::sdst);
314   }
315 
316   if ((isMUBUF(*FirstLdSt) && isMUBUF(*SecondLdSt)) ||
317       (isMTBUF(*FirstLdSt) && isMTBUF(*SecondLdSt))) {
318     FirstDst = getNamedOperand(*FirstLdSt, AMDGPU::OpName::vdata);
319     SecondDst = getNamedOperand(*SecondLdSt, AMDGPU::OpName::vdata);
320   }
321 
322   if (!FirstDst || !SecondDst)
323     return false;
324 
325   // Try to limit clustering based on the total number of bytes loaded
326   // rather than the number of instructions.  This is done to help reduce
327   // register pressure.  The method used is somewhat inexact, though,
328   // because it assumes that all loads in the cluster will load the
329   // same number of bytes as FirstLdSt.
330 
331   // The unit of this value is bytes.
332   // FIXME: This needs finer tuning.
333   unsigned LoadClusterThreshold = 16;
334 
335   const MachineRegisterInfo &MRI =
336       FirstLdSt->getParent()->getParent()->getRegInfo();
337   const TargetRegisterClass *DstRC = MRI.getRegClass(FirstDst->getReg());
338 
339   return (NumLoads * DstRC->getSize()) <= LoadClusterThreshold;
340 }
341 
/// Emits the instruction(s) needed to copy physical register \p SrcReg into
/// \p DestReg, inserting them before \p MI in \p MBB.
///
/// 32- and 64-bit SGPR copies and 32-bit VGPR copies are a single move.
/// Wider registers are copied piecewise, one sub-register per move
/// (S_MOV_B64 for SGPR tuples, V_MOV_B32_e32 for VGPR tuples).
///
/// \param MBB block receiving the new instructions.
/// \param MI insertion point.
/// \param DL debug location attached to the new instructions.
/// \param DestReg destination physical register.
/// \param SrcReg source physical register.
/// \param KillSrc whether the source should be marked killed by the copy.
void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator MI,
                              const DebugLoc &DL, unsigned DestReg,
                              unsigned SrcReg, bool KillSrc) const {

  // If we are trying to copy to or from SCC, there is a bug somewhere else in
  // the backend.  While it may be theoretically possible to do this, it should
  // never be necessary.
  assert(DestReg != AMDGPU::SCC && SrcReg != AMDGPU::SCC);

  // Sub-register index tables used for the piecewise copies below. The plain
  // tables list 32-bit pieces, the *_64 tables list 64-bit pieces.
  static const int16_t Sub0_15[] = {
    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
    AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
    AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
    AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
  };

  static const int16_t Sub0_15_64[] = {
    AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
    AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
    AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
    AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
  };

  static const int16_t Sub0_7[] = {
    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
    AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
  };

  static const int16_t Sub0_7_64[] = {
    AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
    AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
  };

  static const int16_t Sub0_3[] = {
    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
  };

  static const int16_t Sub0_3_64[] = {
    AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
  };

  static const int16_t Sub0_2[] = {
    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2,
  };

  static const int16_t Sub0_1[] = {
    AMDGPU::sub0, AMDGPU::sub1,
  };

  // For the piecewise cases: the move opcode, the pieces to copy, and the
  // direction in which the pieces are visited.
  unsigned Opcode;
  ArrayRef<int16_t> SubIndices;
  bool Forward;

  if (AMDGPU::SReg_32RegClass.contains(DestReg)) {
    assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
            .addReg(SrcReg, getKillRegState(KillSrc));
    return;

  } else if (AMDGPU::SReg_64RegClass.contains(DestReg)) {
    if (DestReg == AMDGPU::VCC) {
      if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
        BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
          .addReg(SrcReg, getKillRegState(KillSrc));
      } else {
        // FIXME: Hack until VReg_1 removed.
        // No explicit destination is given to the compare below.
        // NOTE(review): presumably V_CMP_NE_I32_e32 defines VCC implicitly -
        // confirm against the instruction definition.
        assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
        BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_I32_e32))
          .addImm(0)
          .addReg(SrcReg, getKillRegState(KillSrc));
      }

      return;
    }

    assert(AMDGPU::SReg_64RegClass.contains(SrcReg));
    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
            .addReg(SrcReg, getKillRegState(KillSrc));
    return;

  } else if (AMDGPU::SReg_128RegClass.contains(DestReg)) {
    assert(AMDGPU::SReg_128RegClass.contains(SrcReg));
    Opcode = AMDGPU::S_MOV_B64;
    SubIndices = Sub0_3_64;

  } else if (AMDGPU::SReg_256RegClass.contains(DestReg)) {
    assert(AMDGPU::SReg_256RegClass.contains(SrcReg));
    Opcode = AMDGPU::S_MOV_B64;
    SubIndices = Sub0_7_64;

  } else if (AMDGPU::SReg_512RegClass.contains(DestReg)) {
    assert(AMDGPU::SReg_512RegClass.contains(SrcReg));
    Opcode = AMDGPU::S_MOV_B64;
    SubIndices = Sub0_15_64;

  } else if (AMDGPU::VGPR_32RegClass.contains(DestReg)) {
    assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
           AMDGPU::SReg_32RegClass.contains(SrcReg));
    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
            .addReg(SrcReg, getKillRegState(KillSrc));
    return;

  } else if (AMDGPU::VReg_64RegClass.contains(DestReg)) {
    assert(AMDGPU::VReg_64RegClass.contains(SrcReg) ||
           AMDGPU::SReg_64RegClass.contains(SrcReg));
    Opcode = AMDGPU::V_MOV_B32_e32;
    SubIndices = Sub0_1;

  } else if (AMDGPU::VReg_96RegClass.contains(DestReg)) {
    assert(AMDGPU::VReg_96RegClass.contains(SrcReg));
    Opcode = AMDGPU::V_MOV_B32_e32;
    SubIndices = Sub0_2;

  } else if (AMDGPU::VReg_128RegClass.contains(DestReg)) {
    assert(AMDGPU::VReg_128RegClass.contains(SrcReg) ||
           AMDGPU::SReg_128RegClass.contains(SrcReg));
    Opcode = AMDGPU::V_MOV_B32_e32;
    SubIndices = Sub0_3;

  } else if (AMDGPU::VReg_256RegClass.contains(DestReg)) {
    assert(AMDGPU::VReg_256RegClass.contains(SrcReg) ||
           AMDGPU::SReg_256RegClass.contains(SrcReg));
    Opcode = AMDGPU::V_MOV_B32_e32;
    SubIndices = Sub0_7;

  } else if (AMDGPU::VReg_512RegClass.contains(DestReg)) {
    assert(AMDGPU::VReg_512RegClass.contains(SrcReg) ||
           AMDGPU::SReg_512RegClass.contains(SrcReg));
    Opcode = AMDGPU::V_MOV_B32_e32;
    SubIndices = Sub0_15;

  } else {
    llvm_unreachable("Can't copy register!");
  }

  // Copy lowest piece first when the destination's hardware index is not
  // above the source's, otherwise highest piece first.
  // NOTE(review): presumably this avoids overwriting not-yet-copied source
  // pieces when the two tuples overlap - confirm.
  if (RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg))
    Forward = true;
  else
    Forward = false;

  for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
    unsigned SubIdx;
    if (Forward)
      SubIdx = SubIndices[Idx];
    else
      SubIdx = SubIndices[SubIndices.size() - Idx - 1];

    MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
      get(Opcode), RI.getSubReg(DestReg, SubIdx));

    Builder.addReg(RI.getSubReg(SrcReg, SubIdx));

    // The last move emitted carries an implicit use of the whole source
    // register, with the kill flag if requested.
    if (Idx == SubIndices.size() - 1)
      Builder.addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);

    // The first move emitted carries an implicit def of the whole
    // destination register.
    if (Idx == 0)
      Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
  }
}
502 
503 int SIInstrInfo::commuteOpcode(const MachineInstr &MI) const {
504   const unsigned Opcode = MI.getOpcode();
505 
506   int NewOpc;
507 
508   // Try to map original to commuted opcode
509   NewOpc = AMDGPU::getCommuteRev(Opcode);
510   if (NewOpc != -1)
511     // Check if the commuted (REV) opcode exists on the target.
512     return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
513 
514   // Try to map commuted to original opcode
515   NewOpc = AMDGPU::getCommuteOrig(Opcode);
516   if (NewOpc != -1)
517     // Check if the original (non-REV) opcode exists on the target.
518     return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
519 
520   return Opcode;
521 }
522 
523 unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
524 
525   if (DstRC->getSize() == 4) {
526     return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
527   } else if (DstRC->getSize() == 8 && RI.isSGPRClass(DstRC)) {
528     return AMDGPU::S_MOV_B64;
529   } else if (DstRC->getSize() == 8 && !RI.isSGPRClass(DstRC)) {
530     return  AMDGPU::V_MOV_B64_PSEUDO;
531   }
532   return AMDGPU::COPY;
533 }
534 
535 static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
536   switch (Size) {
537   case 4:
538     return AMDGPU::SI_SPILL_S32_SAVE;
539   case 8:
540     return AMDGPU::SI_SPILL_S64_SAVE;
541   case 16:
542     return AMDGPU::SI_SPILL_S128_SAVE;
543   case 32:
544     return AMDGPU::SI_SPILL_S256_SAVE;
545   case 64:
546     return AMDGPU::SI_SPILL_S512_SAVE;
547   default:
548     llvm_unreachable("unknown register size");
549   }
550 }
551 
552 static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
553   switch (Size) {
554   case 4:
555     return AMDGPU::SI_SPILL_V32_SAVE;
556   case 8:
557     return AMDGPU::SI_SPILL_V64_SAVE;
558   case 12:
559     return AMDGPU::SI_SPILL_V96_SAVE;
560   case 16:
561     return AMDGPU::SI_SPILL_V128_SAVE;
562   case 32:
563     return AMDGPU::SI_SPILL_V256_SAVE;
564   case 64:
565     return AMDGPU::SI_SPILL_V512_SAVE;
566   default:
567     llvm_unreachable("unknown register size");
568   }
569 }
570 
/// Emits a spill of \p SrcReg to stack slot \p FrameIndex before \p MI.
///
/// SGPR classes are spilled with an SI_SPILL_S*_SAVE pseudo. Other classes
/// are spilled with an SI_SPILL_V*_SAVE pseudo when VGPR spilling is enabled
/// for the function; otherwise an error is reported on the LLVMContext and a
/// KILL of \p SrcReg is emitted instead.
///
/// \param MBB block receiving the spill.
/// \param MI insertion point.
/// \param SrcReg register to spill.
/// \param isKill not read by this implementation.
/// \param FrameIndex stack slot to spill into.
/// \param RC register class of \p SrcReg; selects the spill opcode.
/// \param TRI not read by this implementation.
void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator MI,
                                      unsigned SrcReg, bool isKill,
                                      int FrameIndex,
                                      const TargetRegisterClass *RC,
                                      const TargetRegisterInfo *TRI) const {
  MachineFunction *MF = MBB.getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo *FrameInfo = MF->getFrameInfo();
  DebugLoc DL = MBB.findDebugLoc(MI);

  // Describe the store to the fixed stack slot so that the spill pseudo
  // carries a memory operand with the slot's size and alignment.
  unsigned Size = FrameInfo->getObjectSize(FrameIndex);
  unsigned Align = FrameInfo->getObjectAlignment(FrameIndex);
  MachinePointerInfo PtrInfo
    = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
  MachineMemOperand *MMO
    = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
                               Size, Align);

  if (RI.isSGPRClass(RC)) {
    MFI->setHasSpilledSGPRs();

    if (TargetRegisterInfo::isVirtualRegister(SrcReg) && RC->getSize() == 4) {
      // m0 may not be allowed for readlane.
      MachineRegisterInfo &MRI = MF->getRegInfo();
      MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass);
    }

    // We are only allowed to create one new instruction when spilling
    // registers, so we need to use pseudo instruction for spilling
    // SGPRs.
    unsigned Opcode = getSGPRSpillSaveOpcode(RC->getSize());
    BuildMI(MBB, MI, DL, get(Opcode))
      .addReg(SrcReg)            // src
      .addFrameIndex(FrameIndex) // frame_idx
      .addMemOperand(MMO);

    return;
  }

  // Not an SGPR class: spilling is only possible if the subtarget enables
  // VGPR spilling for this function.
  if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) {
    LLVMContext &Ctx = MF->getFunction()->getContext();
    Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to"
                  " spill register");
    // Emit a KILL of the source in place of the spill that cannot be made.
    BuildMI(MBB, MI, DL, get(AMDGPU::KILL))
      .addReg(SrcReg);

    return;
  }

  assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");

  unsigned Opcode = getVGPRSpillSaveOpcode(RC->getSize());
  MFI->setHasSpilledVGPRs();
  BuildMI(MBB, MI, DL, get(Opcode))
    .addReg(SrcReg)                   // src
    .addFrameIndex(FrameIndex)        // frame_idx
    .addReg(MFI->getScratchRSrcReg())       // scratch_rsrc
    .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset
    .addImm(0)                              // offset
    .addMemOperand(MMO);
}
633 
634 static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
635   switch (Size) {
636   case 4:
637     return AMDGPU::SI_SPILL_S32_RESTORE;
638   case 8:
639     return AMDGPU::SI_SPILL_S64_RESTORE;
640   case 16:
641     return AMDGPU::SI_SPILL_S128_RESTORE;
642   case 32:
643     return AMDGPU::SI_SPILL_S256_RESTORE;
644   case 64:
645     return AMDGPU::SI_SPILL_S512_RESTORE;
646   default:
647     llvm_unreachable("unknown register size");
648   }
649 }
650 
651 static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
652   switch (Size) {
653   case 4:
654     return AMDGPU::SI_SPILL_V32_RESTORE;
655   case 8:
656     return AMDGPU::SI_SPILL_V64_RESTORE;
657   case 12:
658     return AMDGPU::SI_SPILL_V96_RESTORE;
659   case 16:
660     return AMDGPU::SI_SPILL_V128_RESTORE;
661   case 32:
662     return AMDGPU::SI_SPILL_V256_RESTORE;
663   case 64:
664     return AMDGPU::SI_SPILL_V512_RESTORE;
665   default:
666     llvm_unreachable("unknown register size");
667   }
668 }
669 
/// Emits a reload of \p DestReg from stack slot \p FrameIndex before \p MI.
///
/// SGPR classes are restored with an SI_SPILL_S*_RESTORE pseudo. Other
/// classes are restored with an SI_SPILL_V*_RESTORE pseudo when VGPR spilling
/// is enabled for the function; otherwise an error is reported on the
/// LLVMContext and \p DestReg is defined with IMPLICIT_DEF instead.
///
/// \param MBB block receiving the reload.
/// \param MI insertion point.
/// \param DestReg register to reload into.
/// \param FrameIndex stack slot to reload from.
/// \param RC register class of \p DestReg; selects the restore opcode.
/// \param TRI not read by this implementation.
void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
                                       MachineBasicBlock::iterator MI,
                                       unsigned DestReg, int FrameIndex,
                                       const TargetRegisterClass *RC,
                                       const TargetRegisterInfo *TRI) const {
  MachineFunction *MF = MBB.getParent();
  const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo *FrameInfo = MF->getFrameInfo();
  DebugLoc DL = MBB.findDebugLoc(MI);
  unsigned Align = FrameInfo->getObjectAlignment(FrameIndex);
  unsigned Size = FrameInfo->getObjectSize(FrameIndex);

  // Describe the load from the fixed stack slot so that the restore pseudo
  // carries a memory operand with the slot's size and alignment.
  MachinePointerInfo PtrInfo
    = MachinePointerInfo::getFixedStack(*MF, FrameIndex);

  MachineMemOperand *MMO = MF->getMachineMemOperand(
    PtrInfo, MachineMemOperand::MOLoad, Size, Align);

  if (RI.isSGPRClass(RC)) {
    // FIXME: Maybe this should not include a memoperand because it will be
    // lowered to non-memory instructions.
    unsigned Opcode = getSGPRSpillRestoreOpcode(RC->getSize());

    if (TargetRegisterInfo::isVirtualRegister(DestReg) && RC->getSize() == 4) {
      // m0 may not be allowed for readlane.
      MachineRegisterInfo &MRI = MF->getRegInfo();
      MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass);
    }

    BuildMI(MBB, MI, DL, get(Opcode), DestReg)
      .addFrameIndex(FrameIndex) // frame_idx
      .addMemOperand(MMO);

    return;
  }

  // Not an SGPR class: reloading is only possible if the subtarget enables
  // VGPR spilling for this function.
  if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) {
    LLVMContext &Ctx = MF->getFunction()->getContext();
    Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to"
                  " restore register");
    // Define the destination with IMPLICIT_DEF in place of the reload that
    // cannot be made.
    BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg);

    return;
  }

  assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");

  unsigned Opcode = getVGPRSpillRestoreOpcode(RC->getSize());
  BuildMI(MBB, MI, DL, get(Opcode), DestReg)
    .addFrameIndex(FrameIndex)        // frame_idx
    .addReg(MFI->getScratchRSrcReg())       // scratch_rsrc
    .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset
    .addImm(0)                              // offset
    .addMemOperand(MMO);
}
725 
/// Emits, before \p MI, an add that writes an LDS spill address into
/// \p TmpReg: a constant LDS offset plus the per-function TID register.
///
/// The first time this is called for a function (the function info reports
/// no calculated TID yet), code computing the TID register is also inserted
/// at the start of the entry block and the register is recorded with
/// setTIDReg().
///
/// \param MBB block receiving the address computation.
/// \param MI insertion point within \p MBB.
/// \param RS scavenger used to obtain two temporary SGPRs in the entry block
///        (only used on the multi-wave, non-shader path).
/// \param TmpReg register that receives the computed address.
/// \param FrameOffset Offset in bytes of the FrameIndex being spilled
/// \param Size not read by this function.
/// \returns \p TmpReg, or AMDGPU::NoRegister if no unused VGPR_32 could be
///          found to hold the TID.
unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB,
                                               MachineBasicBlock::iterator MI,
                                               RegScavenger *RS, unsigned TmpReg,
                                               unsigned FrameOffset,
                                               unsigned Size) const {
  MachineFunction *MF = MBB.getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  // Note: this local shadows the ST member used elsewhere in this file.
  const AMDGPUSubtarget &ST = MF->getSubtarget<AMDGPUSubtarget>();
  const SIRegisterInfo *TRI =
      static_cast<const SIRegisterInfo*>(ST.getRegisterInfo());
  DebugLoc DL = MBB.findDebugLoc(MI);
  unsigned WorkGroupSize = MFI->getMaximumWorkGroupSize(*MF);
  unsigned WavefrontSize = ST.getWavefrontSize();

  unsigned TIDReg = MFI->getTIDReg();
  if (!MFI->hasCalculatedTID()) {
    // The TID computation goes in front of the first instruction of the
    // function's entry block.
    MachineBasicBlock &Entry = MBB.getParent()->front();
    MachineBasicBlock::iterator Insert = Entry.front();
    // Shadows the outer DL: entry-block code uses the debug location of the
    // instruction it is inserted before.
    DebugLoc DL = Insert->getDebugLoc();

    TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass);
    if (TIDReg == AMDGPU::NoRegister)
      return TIDReg;


    if (!AMDGPU::isShader(MF->getFunction()->getCallingConv()) &&
        WorkGroupSize > WavefrontSize) {

      // NOTE(review): these variables are named TIDIG* but are initialized
      // from the WORKGROUP_ID_* preloaded values - confirm which is intended.
      unsigned TIDIGXReg
        = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_X);
      unsigned TIDIGYReg
        = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Y);
      unsigned TIDIGZReg
        = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Z);
      unsigned InputPtrReg =
          TRI->getPreloadedValue(*MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
      // Make sure the preloaded registers are live into the entry block.
      for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) {
        if (!Entry.isLiveIn(Reg))
          Entry.addLiveIn(Reg);
      }

      RS->enterBasicBlock(Entry);
      // FIXME: Can we scavenge an SReg_64 and access the subregs?
      unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
      unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
      // STmp0 = value loaded at the NGROUPS_Z kernel input offset.
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0)
              .addReg(InputPtrReg)
              .addImm(SI::KernelInputOffsets::NGROUPS_Z);
      // STmp1 = value loaded at the NGROUPS_Y kernel input offset.
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1)
              .addReg(InputPtrReg)
              .addImm(SI::KernelInputOffsets::NGROUPS_Y);

      // STmp1 = STmp1 * STmp0, i.e. the product of the two loaded values.
      BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1)
              .addReg(STmp1)
              .addReg(STmp0);
      // TIDReg = STmp1 * TIDIGXReg
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg)
              .addReg(STmp1)
              .addReg(TIDIGXReg);
      // TIDReg = STmp0 * TIDIGYReg + TIDReg
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg)
              .addReg(STmp0)
              .addReg(TIDIGYReg)
              .addReg(TIDReg);
      // TIDReg = TIDReg + TIDIGZReg
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_ADD_I32_e32), TIDReg)
              .addReg(TIDReg)
              .addReg(TIDIGZReg);
    } else {
      // Get the wave id
      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
              TIDReg)
              .addImm(-1)
              .addImm(0);

      BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
              TIDReg)
              .addImm(-1)
              .addReg(TIDReg);
    }

    // TIDReg = TIDReg << 2
    BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32),
            TIDReg)
            .addImm(2)
            .addReg(TIDReg);
    MFI->setTIDReg(TIDReg);
  }

  // Add FrameIndex to LDS offset
  unsigned LDSOffset = MFI->LDSSize + (FrameOffset * WorkGroupSize);
  BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), TmpReg)
          .addImm(LDSOffset)
          .addReg(TIDReg);

  return TmpReg;
}
824 
825 void SIInstrInfo::insertWaitStates(MachineBasicBlock &MBB,
826                                    MachineBasicBlock::iterator MI,
827                                    int Count) const {
828   DebugLoc DL = MBB.findDebugLoc(MI);
829   while (Count > 0) {
830     int Arg;
831     if (Count >= 8)
832       Arg = 7;
833     else
834       Arg = Count - 1;
835     Count -= 8;
836     BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP))
837             .addImm(Arg);
838   }
839 }
840 
841 void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
842                              MachineBasicBlock::iterator MI) const {
843   insertWaitStates(MBB, MI, 1);
844 }
845 
846 unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) const {
847   switch (MI.getOpcode()) {
848   default: return 1; // FIXME: Do wait states equal cycles?
849 
850   case AMDGPU::S_NOP:
851     return MI.getOperand(0).getImm() + 1;
852   }
853 }
854 
/// Expand the SI pseudo instruction at \p MI into real machine instructions.
///
/// Opcodes that are not handled here are forwarded to
/// AMDGPUInstrInfo::expandPostRAPseudo and its result is returned. For every
/// opcode handled here, the replacement instructions are inserted before
/// \p MI, the pseudo itself is erased, and true is returned.
bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MBB.findDebugLoc(MI);
  switch (MI->getOpcode()) {
  default: return AMDGPUInstrInfo::expandPostRAPseudo(MI);

  case AMDGPU::SGPR_USE:
    // This is just a placeholder for register allocation.
    MI->eraseFromParent();
    break;

  case AMDGPU::V_MOV_B64_PSEUDO: {
    // Split the 64-bit move into two V_MOV_B32_e32, one per 32-bit half
    // (sub0 = low, sub1 = high) of the destination.
    unsigned Dst = MI->getOperand(0).getReg();
    unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
    unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);

    const MachineOperand &SrcOp = MI->getOperand(1);
    // FIXME: Will this work for 64-bit floating point immediates?
    assert(!SrcOp.isFPImm());
    if (SrcOp.isImm()) {
      // Immediate source: move the low and high 32 bits separately. Each
      // half-move also carries an implicit def of the full 64-bit register.
      APInt Imm(64, SrcOp.getImm());
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
        .addImm(Imm.getLoBits(32).getZExtValue())
        .addReg(Dst, RegState::Implicit | RegState::Define);
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
        .addImm(Imm.getHiBits(32).getZExtValue())
        .addReg(Dst, RegState::Implicit | RegState::Define);
    } else {
      // Register source: copy the matching sub-register for each half.
      assert(SrcOp.isReg());
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
        .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
        .addReg(Dst, RegState::Implicit | RegState::Define);
      BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
        .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
        .addReg(Dst, RegState::Implicit | RegState::Define);
    }
    MI->eraseFromParent();
    break;
  }

  case AMDGPU::V_CNDMASK_B64_PSEUDO: {
    // Split the 64-bit conditional select into two V_CNDMASK_B32_e64, one per
    // 32-bit half, both reading the same condition register.
    unsigned Dst = MI->getOperand(0).getReg();
    unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
    unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
    unsigned Src0 = MI->getOperand(1).getReg();
    unsigned Src1 = MI->getOperand(2).getReg();
    const MachineOperand &SrcCond = MI->getOperand(3);

    BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
      .addReg(RI.getSubReg(Src0, AMDGPU::sub0))
      .addReg(RI.getSubReg(Src1, AMDGPU::sub0))
      .addReg(SrcCond.getReg())
      .addReg(Dst, RegState::Implicit | RegState::Define);
    // Only the second (last) use of the condition inherits the kill flag of
    // the original operand.
    BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
      .addReg(RI.getSubReg(Src0, AMDGPU::sub1))
      .addReg(RI.getSubReg(Src1, AMDGPU::sub1))
      .addReg(SrcCond.getReg(), getKillRegState(SrcCond.isKill()))
      .addReg(Dst, RegState::Implicit | RegState::Define);
    MI->eraseFromParent();
    break;
  }

  case AMDGPU::SI_CONSTDATA_PTR: {
    // Materialize an address as S_GETPC_B64 followed by a 64-bit add of
    // operand 1 (S_ADD_U32 on the low half, S_ADDC_U32 with 0 on the high
    // half).
    const SIRegisterInfo *TRI =
        static_cast<const SIRegisterInfo *>(ST.getRegisterInfo());
    MachineFunction &MF = *MBB.getParent();
    unsigned Reg = MI->getOperand(0).getReg();
    unsigned RegLo = TRI->getSubReg(Reg, AMDGPU::sub0);
    unsigned RegHi = TRI->getSubReg(Reg, AMDGPU::sub1);

    // Create a bundle so these instructions won't be re-ordered by the
    // post-RA scheduler.
    MIBundleBuilder Bundler(MBB, MI);
    Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));

    // Add 32-bit offset from this instruction to the start of the
    // constant data.
    Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo)
                           .addReg(RegLo)
                           .addOperand(MI->getOperand(1)));
    Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
                           .addReg(RegHi)
                           .addImm(0));

    llvm::finalizeBundle(MBB, Bundler.begin());

    MI->eraseFromParent();
    break;
  }
  }
  return true;
}
947 
948 /// Commutes the operands in the given instruction.
949 /// The commutable operands are specified by their indices OpIdx0 and OpIdx1.
950 ///
951 /// Do not call this method for a non-commutable instruction or for
952 /// non-commutable pair of operand indices OpIdx0 and OpIdx1.
953 /// Even though the instruction is commutable, the method may still
954 /// fail to commute the operands, null pointer is returned in such cases.
955 MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr *MI,
956                                                   bool NewMI,
957                                                   unsigned OpIdx0,
958                                                   unsigned OpIdx1) const {
959   int CommutedOpcode = commuteOpcode(*MI);
960   if (CommutedOpcode == -1)
961     return nullptr;
962 
963   int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
964                                            AMDGPU::OpName::src0);
965   MachineOperand &Src0 = MI->getOperand(Src0Idx);
966   if (!Src0.isReg())
967     return nullptr;
968 
969   int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
970                                            AMDGPU::OpName::src1);
971 
972   if ((OpIdx0 != static_cast<unsigned>(Src0Idx) ||
973        OpIdx1 != static_cast<unsigned>(Src1Idx)) &&
974       (OpIdx0 != static_cast<unsigned>(Src1Idx) ||
975        OpIdx1 != static_cast<unsigned>(Src0Idx)))
976     return nullptr;
977 
978   MachineOperand &Src1 = MI->getOperand(Src1Idx);
979 
980 
981   if (isVOP2(*MI) || isVOPC(*MI)) {
982     const MCInstrDesc &InstrDesc = MI->getDesc();
983     // For VOP2 and VOPC instructions, any operand type is valid to use for
984     // src0.  Make sure we can use the src0 as src1.
985     //
986     // We could be stricter here and only allow commuting if there is a reason
987     // to do so. i.e. if both operands are VGPRs there is no real benefit,
988     // although MachineCSE attempts to find matches by commuting.
989     const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
990     if (!isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0))
991       return nullptr;
992   }
993 
994   if (!Src1.isReg()) {
995     // Allow commuting instructions with Imm operands.
996     if (NewMI || !Src1.isImm() ||
997         (!isVOP2(*MI) && !isVOP3(*MI))) {
998       return nullptr;
999     }
1000     // Be sure to copy the source modifiers to the right place.
1001     if (MachineOperand *Src0Mods
1002           = getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) {
1003       MachineOperand *Src1Mods
1004         = getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers);
1005 
1006       int Src0ModsVal = Src0Mods->getImm();
1007       if (!Src1Mods && Src0ModsVal != 0)
1008         return nullptr;
1009 
1010       // XXX - This assert might be a lie. It might be useful to have a neg
1011       // modifier with 0.0.
1012       int Src1ModsVal = Src1Mods->getImm();
1013       assert((Src1ModsVal == 0) && "Not expecting modifiers with immediates");
1014 
1015       Src1Mods->setImm(Src0ModsVal);
1016       Src0Mods->setImm(Src1ModsVal);
1017     }
1018 
1019     unsigned Reg = Src0.getReg();
1020     unsigned SubReg = Src0.getSubReg();
1021     if (Src1.isImm())
1022       Src0.ChangeToImmediate(Src1.getImm());
1023     else
1024       llvm_unreachable("Should only have immediates");
1025 
1026     Src1.ChangeToRegister(Reg, false);
1027     Src1.setSubReg(SubReg);
1028   } else {
1029     MI = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx0, OpIdx1);
1030   }
1031 
1032   if (MI)
1033     MI->setDesc(get(CommutedOpcode));
1034 
1035   return MI;
1036 }
1037 
1038 // This needs to be implemented because the source modifiers may be inserted
1039 // between the true commutable operands, and the base
1040 // TargetInstrInfo::commuteInstruction uses it.
1041 bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI,
1042                                         unsigned &SrcOpIdx0,
1043                                         unsigned &SrcOpIdx1) const {
1044   const MCInstrDesc &MCID = MI->getDesc();
1045   if (!MCID.isCommutable())
1046     return false;
1047 
1048   unsigned Opc = MI->getOpcode();
1049   int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
1050   if (Src0Idx == -1)
1051     return false;
1052 
1053   // FIXME: Workaround TargetInstrInfo::commuteInstruction asserting on
1054   // immediate. Also, immediate src0 operand is not handled in
1055   // SIInstrInfo::commuteInstruction();
1056   if (!MI->getOperand(Src0Idx).isReg())
1057     return false;
1058 
1059   int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
1060   if (Src1Idx == -1)
1061     return false;
1062 
1063   MachineOperand &Src1 = MI->getOperand(Src1Idx);
1064   if (Src1.isImm()) {
1065     // SIInstrInfo::commuteInstruction() does support commuting the immediate
1066     // operand src1 in 2 and 3 operand instructions.
1067     if (!isVOP2(MI->getOpcode()) && !isVOP3(MI->getOpcode()))
1068       return false;
1069   } else if (Src1.isReg()) {
1070     // If any source modifiers are set, the generic instruction commuting won't
1071     // understand how to copy the source modifiers.
1072     if (hasModifiersSet(*MI, AMDGPU::OpName::src0_modifiers) ||
1073         hasModifiersSet(*MI, AMDGPU::OpName::src1_modifiers))
1074       return false;
1075   } else
1076     return false;
1077 
1078   return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
1079 }
1080 
1081 unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
1082   switch (Cond) {
1083   case SIInstrInfo::SCC_TRUE:
1084     return AMDGPU::S_CBRANCH_SCC1;
1085   case SIInstrInfo::SCC_FALSE:
1086     return AMDGPU::S_CBRANCH_SCC0;
1087   case SIInstrInfo::VCCNZ:
1088     return AMDGPU::S_CBRANCH_VCCNZ;
1089   case SIInstrInfo::VCCZ:
1090     return AMDGPU::S_CBRANCH_VCCZ;
1091   case SIInstrInfo::EXECNZ:
1092     return AMDGPU::S_CBRANCH_EXECNZ;
1093   case SIInstrInfo::EXECZ:
1094     return AMDGPU::S_CBRANCH_EXECZ;
1095   default:
1096     llvm_unreachable("invalid branch predicate");
1097   }
1098 }
1099 
1100 SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
1101   switch (Opcode) {
1102   case AMDGPU::S_CBRANCH_SCC0:
1103     return SCC_FALSE;
1104   case AMDGPU::S_CBRANCH_SCC1:
1105     return SCC_TRUE;
1106   case AMDGPU::S_CBRANCH_VCCNZ:
1107     return VCCNZ;
1108   case AMDGPU::S_CBRANCH_VCCZ:
1109     return VCCZ;
1110   case AMDGPU::S_CBRANCH_EXECNZ:
1111     return EXECNZ;
1112   case AMDGPU::S_CBRANCH_EXECZ:
1113     return EXECZ;
1114   default:
1115     return INVALID_BR;
1116   }
1117 }
1118 
1119 bool SIInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
1120                                 MachineBasicBlock *&TBB,
1121                                 MachineBasicBlock *&FBB,
1122                                 SmallVectorImpl<MachineOperand> &Cond,
1123                                 bool AllowModify) const {
1124   MachineBasicBlock::iterator I = MBB.getFirstTerminator();
1125 
1126   if (I == MBB.end())
1127     return false;
1128 
1129   if (I->getOpcode() == AMDGPU::S_BRANCH) {
1130     // Unconditional Branch
1131     TBB = I->getOperand(0).getMBB();
1132     return false;
1133   }
1134 
1135   BranchPredicate Pred = getBranchPredicate(I->getOpcode());
1136   if (Pred == INVALID_BR)
1137     return true;
1138 
1139   MachineBasicBlock *CondBB = I->getOperand(0).getMBB();
1140   Cond.push_back(MachineOperand::CreateImm(Pred));
1141 
1142   ++I;
1143 
1144   if (I == MBB.end()) {
1145     // Conditional branch followed by fall-through.
1146     TBB = CondBB;
1147     return false;
1148   }
1149 
1150   if (I->getOpcode() == AMDGPU::S_BRANCH) {
1151     TBB = CondBB;
1152     FBB = I->getOperand(0).getMBB();
1153     return false;
1154   }
1155 
1156   return true;
1157 }
1158 
1159 unsigned SIInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
1160   MachineBasicBlock::iterator I = MBB.getFirstTerminator();
1161 
1162   unsigned Count = 0;
1163   while (I != MBB.end()) {
1164     MachineBasicBlock::iterator Next = std::next(I);
1165     I->eraseFromParent();
1166     ++Count;
1167     I = Next;
1168   }
1169 
1170   return Count;
1171 }
1172 
1173 unsigned SIInstrInfo::InsertBranch(MachineBasicBlock &MBB,
1174                                    MachineBasicBlock *TBB,
1175                                    MachineBasicBlock *FBB,
1176                                    ArrayRef<MachineOperand> Cond,
1177                                    const DebugLoc &DL) const {
1178 
1179   if (!FBB && Cond.empty()) {
1180     BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
1181       .addMBB(TBB);
1182     return 1;
1183   }
1184 
1185   assert(TBB && Cond[0].isImm());
1186 
1187   unsigned Opcode
1188     = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
1189 
1190   if (!FBB) {
1191     BuildMI(&MBB, DL, get(Opcode))
1192       .addMBB(TBB);
1193     return 1;
1194   }
1195 
1196   assert(TBB && FBB);
1197 
1198   BuildMI(&MBB, DL, get(Opcode))
1199     .addMBB(TBB);
1200   BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
1201     .addMBB(FBB);
1202 
1203   return 2;
1204 }
1205 
1206 bool SIInstrInfo::ReverseBranchCondition(
1207   SmallVectorImpl<MachineOperand> &Cond) const {
1208   assert(Cond.size() == 1);
1209   Cond[0].setImm(-Cond[0].getImm());
1210   return false;
1211 }
1212 
1213 static void removeModOperands(MachineInstr &MI) {
1214   unsigned Opc = MI.getOpcode();
1215   int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc,
1216                                               AMDGPU::OpName::src0_modifiers);
1217   int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc,
1218                                               AMDGPU::OpName::src1_modifiers);
1219   int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc,
1220                                               AMDGPU::OpName::src2_modifiers);
1221 
1222   MI.RemoveOperand(Src2ModIdx);
1223   MI.RemoveOperand(Src1ModIdx);
1224   MI.RemoveOperand(Src0ModIdx);
1225 }
1226 
// TODO: Maybe this should be removed, and everything custom folded in
// SIFoldOperands instead?
//
// Try to fold the immediate defined by DefMI (its operand 1) into UseMI, which
// uses it through Reg.
//
// Only V_MAD_F32 / V_MAC_F32_e64 users without source modifiers are handled,
// and only when Reg has exactly one non-debug use and the immediate is not an
// inline constant:
//  - Reg used as src0 -> UseMI is rewritten in place to V_MADMK_F32.
//  - Reg used as src2 -> UseMI is rewritten in place to V_MADAK_F32.
// Returns true if UseMI was rewritten, false if nothing was changed.
//
// NOTE(review): operand 1 of DefMI is read with getImm() below without an
// isImm() check -- presumably callers only pass move-immediate defs; confirm.
bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
                                unsigned Reg, MachineRegisterInfo *MRI) const {
  if (!MRI->hasOneNonDBGUse(Reg))
    return false;

  unsigned Opc = UseMI->getOpcode();
  if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64) {
    // Don't fold if we are using source modifiers. The new VOP2 instructions
    // don't have them.
    if (hasModifiersSet(*UseMI, AMDGPU::OpName::src0_modifiers) ||
        hasModifiersSet(*UseMI, AMDGPU::OpName::src1_modifiers) ||
        hasModifiersSet(*UseMI, AMDGPU::OpName::src2_modifiers)) {
      return false;
    }

    const MachineOperand &ImmOp = DefMI->getOperand(1);

    // If this is a free constant, there's no reason to do this.
    // TODO: We could fold this here instead of letting SIFoldOperands do it
    // later.
    if (isInlineConstant(ImmOp, 4))
      return false;

    MachineOperand *Src0 = getNamedOperand(*UseMI, AMDGPU::OpName::src0);
    MachineOperand *Src1 = getNamedOperand(*UseMI, AMDGPU::OpName::src1);
    MachineOperand *Src2 = getNamedOperand(*UseMI, AMDGPU::OpName::src2);

    // Multiplied part is the constant: Use v_madmk_f32
    // We should only expect these to be on src0 due to canonicalizations.
    if (Src0->isReg() && Src0->getReg() == Reg) {
      // Both remaining sources must be registers that are not in an SGPR
      // class.
      if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
        return false;

      if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
        return false;

      // We need to swap operands 0 and 1 since madmk constant is at operand 1.

      const int64_t Imm = DefMI->getOperand(1).getImm();

      // FIXME: This would be a lot easier if we could return a new instruction
      // instead of having to modify in place.

      // Remove these first since they are at the end.
      UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
                                                      AMDGPU::OpName::omod));
      UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
                                                      AMDGPU::OpName::clamp));

      // Move the old src1 register (with its sub-register and kill flag) into
      // the src0 slot; the src1 slot receives the immediate below.
      unsigned Src1Reg = Src1->getReg();
      unsigned Src1SubReg = Src1->getSubReg();
      Src0->setReg(Src1Reg);
      Src0->setSubReg(Src1SubReg);
      Src0->setIsKill(Src1->isKill());

      // For the MAC form, src2 must be untied before the descriptor changes.
      if (Opc == AMDGPU::V_MAC_F32_e64) {
        UseMI->untieRegOperand(
          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
      }

      Src1->ChangeToImmediate(Imm);

      removeModOperands(*UseMI);
      UseMI->setDesc(get(AMDGPU::V_MADMK_F32));

      // NOTE(review): Reg's single non-debug use (checked on entry) was
      // rewritten above, so it is unclear whether this query can still be
      // true here; a "no uses left" check may have been intended -- confirm.
      bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
      if (DeleteDef)
        DefMI->eraseFromParent();

      return true;
    }

    // Added part is the constant: Use v_madak_f32
    if (Src2->isReg() && Src2->getReg() == Reg) {
      // Not allowed to use constant bus for another operand.
      // We can however allow an inline immediate as src0.
      if (!Src0->isImm() &&
          (Src0->isReg() && RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))
        return false;

      if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
        return false;

      const int64_t Imm = DefMI->getOperand(1).getImm();

      // FIXME: This would be a lot easier if we could return a new instruction
      // instead of having to modify in place.

      // Remove these first since they are at the end.
      UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
                                                      AMDGPU::OpName::omod));
      UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
                                                      AMDGPU::OpName::clamp));

      // For the MAC form, src2 must be untied before it becomes an immediate.
      if (Opc == AMDGPU::V_MAC_F32_e64) {
        UseMI->untieRegOperand(
          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
      }

      // ChangingToImmediate adds Src2 back to the instruction.
      Src2->ChangeToImmediate(Imm);

      // These come before src2.
      removeModOperands(*UseMI);
      UseMI->setDesc(get(AMDGPU::V_MADAK_F32));

      // NOTE(review): same question as in the madmk path -- the only use of
      // Reg has just been turned into an immediate; confirm this condition.
      bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
      if (DeleteDef)
        DefMI->eraseFromParent();

      return true;
    }
  }

  return false;
}
1345 
/// Return true when the access [OffsetA, OffsetA + WidthA) and the access
/// [OffsetB, OffsetB + WidthB) do not overlap, i.e. the lower access ends at
/// or before the higher one starts.
///
/// When both offsets are equal, A is treated as the lower access.
static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
                                int WidthB, int OffsetB) {
  // A is the lower access (ties go to A): it must end by the start of B.
  if (OffsetA <= OffsetB)
    return OffsetA + WidthA <= OffsetB;

  // Otherwise B is strictly lower and must end by the start of A.
  return OffsetB + WidthB <= OffsetA;
}
1353 
1354 bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr *MIa,
1355                                                MachineInstr *MIb) const {
1356   unsigned BaseReg0, BaseReg1;
1357   int64_t Offset0, Offset1;
1358 
1359   if (getMemOpBaseRegImmOfs(MIa, BaseReg0, Offset0, &RI) &&
1360       getMemOpBaseRegImmOfs(MIb, BaseReg1, Offset1, &RI)) {
1361 
1362     if (!MIa->hasOneMemOperand() || !MIb->hasOneMemOperand()) {
1363       // FIXME: Handle ds_read2 / ds_write2.
1364       return false;
1365     }
1366     unsigned Width0 = (*MIa->memoperands_begin())->getSize();
1367     unsigned Width1 = (*MIb->memoperands_begin())->getSize();
1368     if (BaseReg0 == BaseReg1 &&
1369         offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) {
1370       return true;
1371     }
1372   }
1373 
1374   return false;
1375 }
1376 
1377 bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa,
1378                                                   MachineInstr *MIb,
1379                                                   AliasAnalysis *AA) const {
1380   assert(MIa && (MIa->mayLoad() || MIa->mayStore()) &&
1381          "MIa must load from or modify a memory location");
1382   assert(MIb && (MIb->mayLoad() || MIb->mayStore()) &&
1383          "MIb must load from or modify a memory location");
1384 
1385   if (MIa->hasUnmodeledSideEffects() || MIb->hasUnmodeledSideEffects())
1386     return false;
1387 
1388   // XXX - Can we relax this between address spaces?
1389   if (MIa->hasOrderedMemoryRef() || MIb->hasOrderedMemoryRef())
1390     return false;
1391 
1392   // TODO: Should we check the address space from the MachineMemOperand? That
1393   // would allow us to distinguish objects we know don't alias based on the
1394   // underlying address space, even if it was lowered to a different one,
1395   // e.g. private accesses lowered to use MUBUF instructions on a scratch
1396   // buffer.
1397   if (isDS(*MIa)) {
1398     if (isDS(*MIb))
1399       return checkInstOffsetsDoNotOverlap(MIa, MIb);
1400 
1401     return !isFLAT(*MIb);
1402   }
1403 
1404   if (isMUBUF(*MIa) || isMTBUF(*MIa)) {
1405     if (isMUBUF(*MIb) || isMTBUF(*MIb))
1406       return checkInstOffsetsDoNotOverlap(MIa, MIb);
1407 
1408     return !isFLAT(*MIb) && !isSMRD(*MIb);
1409   }
1410 
1411   if (isSMRD(*MIa)) {
1412     if (isSMRD(*MIb))
1413       return checkInstOffsetsDoNotOverlap(MIa, MIb);
1414 
1415     return !isFLAT(*MIb) && !isMUBUF(*MIa) && !isMTBUF(*MIa);
1416   }
1417 
1418   if (isFLAT(*MIa)) {
1419     if (isFLAT(*MIb))
1420       return checkInstOffsetsDoNotOverlap(MIa, MIb);
1421 
1422     return false;
1423   }
1424 
1425   return false;
1426 }
1427 
1428 MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
1429                                                 MachineBasicBlock::iterator &MI,
1430                                                 LiveVariables *LV) const {
1431 
1432   switch (MI->getOpcode()) {
1433     default: return nullptr;
1434     case AMDGPU::V_MAC_F32_e64: break;
1435     case AMDGPU::V_MAC_F32_e32: {
1436       const MachineOperand *Src0 = getNamedOperand(*MI, AMDGPU::OpName::src0);
1437       if (Src0->isImm() && !isInlineConstant(*Src0, 4))
1438         return nullptr;
1439       break;
1440     }
1441   }
1442 
1443   const MachineOperand *Dst = getNamedOperand(*MI, AMDGPU::OpName::vdst);
1444   const MachineOperand *Src0 = getNamedOperand(*MI, AMDGPU::OpName::src0);
1445   const MachineOperand *Src1 = getNamedOperand(*MI, AMDGPU::OpName::src1);
1446   const MachineOperand *Src2 = getNamedOperand(*MI, AMDGPU::OpName::src2);
1447 
1448   return BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_MAD_F32))
1449                  .addOperand(*Dst)
1450                  .addImm(0) // Src0 mods
1451                  .addOperand(*Src0)
1452                  .addImm(0) // Src1 mods
1453                  .addOperand(*Src1)
1454                  .addImm(0) // Src mods
1455                  .addOperand(*Src2)
1456                  .addImm(0)  // clamp
1457                  .addImm(0); // omod
1458 }
1459 
1460 bool SIInstrInfo::isSchedulingBoundary(const MachineInstr *MI,
1461                                        const MachineBasicBlock *MBB,
1462                                        const MachineFunction &MF) const {
1463   // Target-independent instructions do not have an implicit-use of EXEC, even
1464   // when they operate on VGPRs. Treating EXEC modifications as scheduling
1465   // boundaries prevents incorrect movements of such instructions.
1466   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
1467   if (MI->modifiesRegister(AMDGPU::EXEC, TRI))
1468     return true;
1469 
1470   return AMDGPUInstrInfo::isSchedulingBoundary(MI, MBB, MF);
1471 }
1472 
1473 bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
1474   int64_t SVal = Imm.getSExtValue();
1475   if (SVal >= -16 && SVal <= 64)
1476     return true;
1477 
1478   if (Imm.getBitWidth() == 64) {
1479     uint64_t Val = Imm.getZExtValue();
1480     return (DoubleToBits(0.0) == Val) ||
1481            (DoubleToBits(1.0) == Val) ||
1482            (DoubleToBits(-1.0) == Val) ||
1483            (DoubleToBits(0.5) == Val) ||
1484            (DoubleToBits(-0.5) == Val) ||
1485            (DoubleToBits(2.0) == Val) ||
1486            (DoubleToBits(-2.0) == Val) ||
1487            (DoubleToBits(4.0) == Val) ||
1488            (DoubleToBits(-4.0) == Val);
1489   }
1490 
1491   // The actual type of the operand does not seem to matter as long
1492   // as the bits match one of the inline immediate values.  For example:
1493   //
1494   // -nan has the hexadecimal encoding of 0xfffffffe which is -2 in decimal,
1495   // so it is a legal inline immediate.
1496   //
1497   // 1065353216 has the hexadecimal encoding 0x3f800000 which is 1.0f in
1498   // floating-point, so it is a legal inline immediate.
1499   uint32_t Val = Imm.getZExtValue();
1500 
1501   return (FloatToBits(0.0f) == Val) ||
1502          (FloatToBits(1.0f) == Val) ||
1503          (FloatToBits(-1.0f) == Val) ||
1504          (FloatToBits(0.5f) == Val) ||
1505          (FloatToBits(-0.5f) == Val) ||
1506          (FloatToBits(2.0f) == Val) ||
1507          (FloatToBits(-2.0f) == Val) ||
1508          (FloatToBits(4.0f) == Val) ||
1509          (FloatToBits(-4.0f) == Val);
1510 }
1511 
1512 bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
1513                                    unsigned OpSize) const {
1514   if (MO.isImm()) {
1515     // MachineOperand provides no way to tell the true operand size, since it
1516     // only records a 64-bit value. We need to know the size to determine if a
1517     // 32-bit floating point immediate bit pattern is legal for an integer
1518     // immediate. It would be for any 32-bit integer operand, but would not be
1519     // for a 64-bit one.
1520 
1521     unsigned BitSize = 8 * OpSize;
1522     return isInlineConstant(APInt(BitSize, MO.getImm(), true));
1523   }
1524 
1525   return false;
1526 }
1527 
1528 bool SIInstrInfo::isLiteralConstant(const MachineOperand &MO,
1529                                     unsigned OpSize) const {
1530   return MO.isImm() && !isInlineConstant(MO, OpSize);
1531 }
1532 
1533 static bool compareMachineOp(const MachineOperand &Op0,
1534                              const MachineOperand &Op1) {
1535   if (Op0.getType() != Op1.getType())
1536     return false;
1537 
1538   switch (Op0.getType()) {
1539   case MachineOperand::MO_Register:
1540     return Op0.getReg() == Op1.getReg();
1541   case MachineOperand::MO_Immediate:
1542     return Op0.getImm() == Op1.getImm();
1543   default:
1544     llvm_unreachable("Didn't expect to be comparing these operand types");
1545   }
1546 }
1547 
1548 bool SIInstrInfo::isImmOperandLegal(const MachineInstr *MI, unsigned OpNo,
1549                                  const MachineOperand &MO) const {
1550   const MCOperandInfo &OpInfo = get(MI->getOpcode()).OpInfo[OpNo];
1551 
1552   assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
1553 
1554   if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
1555     return true;
1556 
1557   if (OpInfo.RegClass < 0)
1558     return false;
1559 
1560   unsigned OpSize = RI.getRegClass(OpInfo.RegClass)->getSize();
1561   if (isLiteralConstant(MO, OpSize))
1562     return RI.opCanUseLiteralConstant(OpInfo.OperandType);
1563 
1564   return RI.opCanUseInlineConstant(OpInfo.OperandType);
1565 }
1566 
1567 bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
1568   int Op32 = AMDGPU::getVOPe32(Opcode);
1569   if (Op32 == -1)
1570     return false;
1571 
1572   return pseudoToMCOpcode(Op32) != -1;
1573 }
1574 
1575 bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
1576   // The src0_modifier operand is present on all instructions
1577   // that have modifiers.
1578 
1579   return AMDGPU::getNamedOperandIdx(Opcode,
1580                                     AMDGPU::OpName::src0_modifiers) != -1;
1581 }
1582 
1583 bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
1584                                   unsigned OpName) const {
1585   const MachineOperand *Mods = getNamedOperand(MI, OpName);
1586   return Mods && Mods->getImm();
1587 }
1588 
1589 bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
1590                                   const MachineOperand &MO,
1591                                   unsigned OpSize) const {
1592   // Literal constants use the constant bus.
1593   if (isLiteralConstant(MO, OpSize))
1594     return true;
1595 
1596   if (!MO.isReg() || !MO.isUse())
1597     return false;
1598 
1599   if (TargetRegisterInfo::isVirtualRegister(MO.getReg()))
1600     return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));
1601 
1602   // FLAT_SCR is just an SGPR pair.
1603   if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCR))
1604     return true;
1605 
1606   // EXEC register uses the constant bus.
1607   if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC)
1608     return true;
1609 
1610   // SGPRs use the constant bus
1611   return (MO.getReg() == AMDGPU::VCC || MO.getReg() == AMDGPU::M0 ||
1612           (!MO.isImplicit() &&
1613            (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) ||
1614             AMDGPU::SGPR_64RegClass.contains(MO.getReg()))));
1615 }
1616 
1617 static unsigned findImplicitSGPRRead(const MachineInstr &MI) {
1618   for (const MachineOperand &MO : MI.implicit_operands()) {
1619     // We only care about reads.
1620     if (MO.isDef())
1621       continue;
1622 
1623     switch (MO.getReg()) {
1624     case AMDGPU::VCC:
1625     case AMDGPU::M0:
1626     case AMDGPU::FLAT_SCR:
1627       return MO.getReg();
1628 
1629     default:
1630       break;
1631     }
1632   }
1633 
1634   return AMDGPU::NoRegister;
1635 }
1636 
1637 bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
1638                                     StringRef &ErrInfo) const {
1639   uint16_t Opcode = MI->getOpcode();
1640   const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
1641   int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
1642   int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
1643   int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
1644 
1645   // Make sure the number of operands is correct.
1646   const MCInstrDesc &Desc = get(Opcode);
1647   if (!Desc.isVariadic() &&
1648       Desc.getNumOperands() != MI->getNumExplicitOperands()) {
1649      ErrInfo = "Instruction has wrong number of operands.";
1650      return false;
1651   }
1652 
1653   // Make sure the register classes are correct.
1654   for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
1655     if (MI->getOperand(i).isFPImm()) {
1656       ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
1657                 "all fp values to integers.";
1658       return false;
1659     }
1660 
1661     int RegClass = Desc.OpInfo[i].RegClass;
1662 
1663     switch (Desc.OpInfo[i].OperandType) {
1664     case MCOI::OPERAND_REGISTER:
1665       if (MI->getOperand(i).isImm()) {
1666         ErrInfo = "Illegal immediate value for operand.";
1667         return false;
1668       }
1669       break;
1670     case AMDGPU::OPERAND_REG_IMM32:
1671       break;
1672     case AMDGPU::OPERAND_REG_INLINE_C:
1673       if (isLiteralConstant(MI->getOperand(i),
1674                             RI.getRegClass(RegClass)->getSize())) {
1675         ErrInfo = "Illegal immediate value for operand.";
1676         return false;
1677       }
1678       break;
1679     case MCOI::OPERAND_IMMEDIATE:
1680       // Check if this operand is an immediate.
1681       // FrameIndex operands will be replaced by immediates, so they are
1682       // allowed.
1683       if (!MI->getOperand(i).isImm() && !MI->getOperand(i).isFI()) {
1684         ErrInfo = "Expected immediate, but got non-immediate";
1685         return false;
1686       }
1687       // Fall-through
1688     default:
1689       continue;
1690     }
1691 
1692     if (!MI->getOperand(i).isReg())
1693       continue;
1694 
1695     if (RegClass != -1) {
1696       unsigned Reg = MI->getOperand(i).getReg();
1697       if (TargetRegisterInfo::isVirtualRegister(Reg))
1698         continue;
1699 
1700       const TargetRegisterClass *RC = RI.getRegClass(RegClass);
1701       if (!RC->contains(Reg)) {
1702         ErrInfo = "Operand has incorrect register class.";
1703         return false;
1704       }
1705     }
1706   }
1707 
1708 
1709   // Verify VOP*
1710   if (isVOP1(*MI) || isVOP2(*MI) || isVOP3(*MI) || isVOPC(*MI)) {
1711     // Only look at the true operands. Only a real operand can use the constant
1712     // bus, and we don't want to check pseudo-operands like the source modifier
1713     // flags.
1714     const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };
1715 
1716     unsigned ConstantBusCount = 0;
1717     unsigned SGPRUsed = findImplicitSGPRRead(*MI);
1718     if (SGPRUsed != AMDGPU::NoRegister)
1719       ++ConstantBusCount;
1720 
1721     for (int OpIdx : OpIndices) {
1722       if (OpIdx == -1)
1723         break;
1724       const MachineOperand &MO = MI->getOperand(OpIdx);
1725       if (usesConstantBus(MRI, MO, getOpSize(Opcode, OpIdx))) {
1726         if (MO.isReg()) {
1727           if (MO.getReg() != SGPRUsed)
1728             ++ConstantBusCount;
1729           SGPRUsed = MO.getReg();
1730         } else {
1731           ++ConstantBusCount;
1732         }
1733       }
1734     }
1735     if (ConstantBusCount > 1) {
1736       ErrInfo = "VOP* instruction uses the constant bus more than once";
1737       return false;
1738     }
1739   }
1740 
1741   // Verify misc. restrictions on specific instructions.
1742   if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 ||
1743       Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) {
1744     const MachineOperand &Src0 = MI->getOperand(Src0Idx);
1745     const MachineOperand &Src1 = MI->getOperand(Src1Idx);
1746     const MachineOperand &Src2 = MI->getOperand(Src2Idx);
1747     if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
1748       if (!compareMachineOp(Src0, Src1) &&
1749           !compareMachineOp(Src0, Src2)) {
1750         ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
1751         return false;
1752       }
1753     }
1754   }
1755 
1756   // Make sure we aren't losing exec uses in the td files. This mostly requires
1757   // being careful when using let Uses to try to add other use registers.
1758   if (!isGenericOpcode(Opcode) && !isSALU(Opcode) && !isSMRD(Opcode)) {
1759     if (!MI->hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
1760       ErrInfo = "VALU instruction does not implicitly read exec mask";
1761       return false;
1762     }
1763   }
1764 
1765   return true;
1766 }
1767 
1768 unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
1769   switch (MI.getOpcode()) {
1770   default: return AMDGPU::INSTRUCTION_LIST_END;
1771   case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
1772   case AMDGPU::COPY: return AMDGPU::COPY;
1773   case AMDGPU::PHI: return AMDGPU::PHI;
1774   case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
1775   case AMDGPU::S_MOV_B32:
1776     return MI.getOperand(1).isReg() ?
1777            AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
1778   case AMDGPU::S_ADD_I32:
1779   case AMDGPU::S_ADD_U32: return AMDGPU::V_ADD_I32_e32;
1780   case AMDGPU::S_ADDC_U32: return AMDGPU::V_ADDC_U32_e32;
1781   case AMDGPU::S_SUB_I32:
1782   case AMDGPU::S_SUB_U32: return AMDGPU::V_SUB_I32_e32;
1783   case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
1784   case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32;
1785   case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e32;
1786   case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e32;
1787   case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e32;
1788   case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e32;
1789   case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e32;
1790   case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e32;
1791   case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e32;
1792   case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
1793   case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64;
1794   case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
1795   case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64;
1796   case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
1797   case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64;
1798   case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32;
1799   case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32;
1800   case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32;
1801   case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32;
1802   case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
1803   case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
1804   case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
1805   case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
1806   case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32;
1807   case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32;
1808   case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32;
1809   case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32;
1810   case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32;
1811   case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32;
1812   case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32;
1813   case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32;
1814   case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32;
1815   case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32;
1816   case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32;
1817   case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32;
1818   case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
1819   case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
1820   case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
1821   case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
1822   case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
1823   case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
1824   }
1825 }
1826 
1827 bool SIInstrInfo::isSALUOpSupportedOnVALU(const MachineInstr &MI) const {
1828   return getVALUOp(MI) != AMDGPU::INSTRUCTION_LIST_END;
1829 }
1830 
1831 const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
1832                                                       unsigned OpNo) const {
1833   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
1834   const MCInstrDesc &Desc = get(MI.getOpcode());
1835   if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
1836       Desc.OpInfo[OpNo].RegClass == -1) {
1837     unsigned Reg = MI.getOperand(OpNo).getReg();
1838 
1839     if (TargetRegisterInfo::isVirtualRegister(Reg))
1840       return MRI.getRegClass(Reg);
1841     return RI.getPhysRegClass(Reg);
1842   }
1843 
1844   unsigned RCID = Desc.OpInfo[OpNo].RegClass;
1845   return RI.getRegClass(RCID);
1846 }
1847 
1848 bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const {
1849   switch (MI.getOpcode()) {
1850   case AMDGPU::COPY:
1851   case AMDGPU::REG_SEQUENCE:
1852   case AMDGPU::PHI:
1853   case AMDGPU::INSERT_SUBREG:
1854     return RI.hasVGPRs(getOpRegClass(MI, 0));
1855   default:
1856     return RI.hasVGPRs(getOpRegClass(MI, OpNo));
1857   }
1858 }
1859 
1860 void SIInstrInfo::legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const {
1861   MachineBasicBlock::iterator I = MI;
1862   MachineBasicBlock *MBB = MI->getParent();
1863   MachineOperand &MO = MI->getOperand(OpIdx);
1864   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1865   unsigned RCID = get(MI->getOpcode()).OpInfo[OpIdx].RegClass;
1866   const TargetRegisterClass *RC = RI.getRegClass(RCID);
1867   unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1868   if (MO.isReg())
1869     Opcode = AMDGPU::COPY;
1870   else if (RI.isSGPRClass(RC))
1871     Opcode = AMDGPU::S_MOV_B32;
1872 
1873 
1874   const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
1875   if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC))
1876     VRC = &AMDGPU::VReg_64RegClass;
1877   else
1878     VRC = &AMDGPU::VGPR_32RegClass;
1879 
1880   unsigned Reg = MRI.createVirtualRegister(VRC);
1881   DebugLoc DL = MBB->findDebugLoc(I);
1882   BuildMI(*MI->getParent(), I, DL, get(Opcode), Reg)
1883     .addOperand(MO);
1884   MO.ChangeToRegister(Reg, false);
1885 }
1886 
1887 unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI,
1888                                          MachineRegisterInfo &MRI,
1889                                          MachineOperand &SuperReg,
1890                                          const TargetRegisterClass *SuperRC,
1891                                          unsigned SubIdx,
1892                                          const TargetRegisterClass *SubRC)
1893                                          const {
1894   MachineBasicBlock *MBB = MI->getParent();
1895   DebugLoc DL = MI->getDebugLoc();
1896   unsigned SubReg = MRI.createVirtualRegister(SubRC);
1897 
1898   if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) {
1899     BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
1900       .addReg(SuperReg.getReg(), 0, SubIdx);
1901     return SubReg;
1902   }
1903 
1904   // Just in case the super register is itself a sub-register, copy it to a new
1905   // value so we don't need to worry about merging its subreg index with the
1906   // SubIdx passed to this function. The register coalescer should be able to
1907   // eliminate this extra copy.
1908   unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC);
1909 
1910   BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg)
1911     .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg());
1912 
1913   BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
1914     .addReg(NewSuperReg, 0, SubIdx);
1915 
1916   return SubReg;
1917 }
1918 
1919 MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
1920   MachineBasicBlock::iterator MII,
1921   MachineRegisterInfo &MRI,
1922   MachineOperand &Op,
1923   const TargetRegisterClass *SuperRC,
1924   unsigned SubIdx,
1925   const TargetRegisterClass *SubRC) const {
1926   if (Op.isImm()) {
1927     // XXX - Is there a better way to do this?
1928     if (SubIdx == AMDGPU::sub0)
1929       return MachineOperand::CreateImm(Op.getImm() & 0xFFFFFFFF);
1930     if (SubIdx == AMDGPU::sub1)
1931       return MachineOperand::CreateImm(Op.getImm() >> 32);
1932 
1933     llvm_unreachable("Unhandled register index for immediate");
1934   }
1935 
1936   unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
1937                                        SubIdx, SubRC);
1938   return MachineOperand::CreateReg(SubReg, false);
1939 }
1940 
1941 // Change the order of operands from (0, 1, 2) to (0, 2, 1)
1942 void SIInstrInfo::swapOperands(MachineBasicBlock::iterator Inst) const {
1943   assert(Inst->getNumExplicitOperands() == 3);
1944   MachineOperand Op1 = Inst->getOperand(1);
1945   Inst->RemoveOperand(1);
1946   Inst->addOperand(Op1);
1947 }
1948 
1949 bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
1950                                     const MCOperandInfo &OpInfo,
1951                                     const MachineOperand &MO) const {
1952   if (!MO.isReg())
1953     return false;
1954 
1955   unsigned Reg = MO.getReg();
1956   const TargetRegisterClass *RC =
1957     TargetRegisterInfo::isVirtualRegister(Reg) ?
1958     MRI.getRegClass(Reg) :
1959     RI.getPhysRegClass(Reg);
1960 
1961   const SIRegisterInfo *TRI =
1962       static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
1963   RC = TRI->getSubRegClass(RC, MO.getSubReg());
1964 
1965   // In order to be legal, the common sub-class must be equal to the
1966   // class of the current operand.  For example:
1967   //
1968   // v_mov_b32 s0 ; Operand defined as vsrc_32
1969   //              ; RI.getCommonSubClass(s0,vsrc_32) = sgpr ; LEGAL
1970   //
1971   // s_sendmsg 0, s0 ; Operand defined as m0reg
1972   //                 ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL
1973 
1974   return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC;
1975 }
1976 
1977 bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
1978                                      const MCOperandInfo &OpInfo,
1979                                      const MachineOperand &MO) const {
1980   if (MO.isReg())
1981     return isLegalRegOperand(MRI, OpInfo, MO);
1982 
1983   // Handle non-register types that are treated like immediates.
1984   assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
1985   return true;
1986 }
1987 
1988 bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx,
1989                                  const MachineOperand *MO) const {
1990   const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
1991   const MCInstrDesc &InstDesc = MI->getDesc();
1992   const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx];
1993   const TargetRegisterClass *DefinedRC =
1994       OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
1995   if (!MO)
1996     MO = &MI->getOperand(OpIdx);
1997 
1998   if (isVALU(*MI) &&
1999       usesConstantBus(MRI, *MO, DefinedRC->getSize())) {
2000 
2001     RegSubRegPair SGPRUsed;
2002     if (MO->isReg())
2003       SGPRUsed = RegSubRegPair(MO->getReg(), MO->getSubReg());
2004 
2005     for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
2006       if (i == OpIdx)
2007         continue;
2008       const MachineOperand &Op = MI->getOperand(i);
2009       if (Op.isReg() &&
2010           (Op.getReg() != SGPRUsed.Reg || Op.getSubReg() != SGPRUsed.SubReg) &&
2011           usesConstantBus(MRI, Op, getOpSize(*MI, i))) {
2012         return false;
2013       }
2014     }
2015   }
2016 
2017   if (MO->isReg()) {
2018     assert(DefinedRC);
2019     return isLegalRegOperand(MRI, OpInfo, *MO);
2020   }
2021 
2022 
2023   // Handle non-register types that are treated like immediates.
2024   assert(MO->isImm() || MO->isTargetIndex() || MO->isFI());
2025 
2026   if (!DefinedRC) {
2027     // This operand expects an immediate.
2028     return true;
2029   }
2030 
2031   return isImmOperandLegal(MI, OpIdx, *MO);
2032 }
2033 
2034 void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
2035                                        MachineInstr *MI) const {
2036   unsigned Opc = MI->getOpcode();
2037   const MCInstrDesc &InstrDesc = get(Opc);
2038 
2039   int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
2040   MachineOperand &Src1 = MI->getOperand(Src1Idx);
2041 
2042   // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
2043   // we need to only have one constant bus use.
2044   //
2045   // Note we do not need to worry about literal constants here. They are
2046   // disabled for the operand type for instructions because they will always
2047   // violate the one constant bus use rule.
2048   bool HasImplicitSGPR = findImplicitSGPRRead(*MI) != AMDGPU::NoRegister;
2049   if (HasImplicitSGPR) {
2050     int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2051     MachineOperand &Src0 = MI->getOperand(Src0Idx);
2052 
2053     if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg()))
2054       legalizeOpWithMove(MI, Src0Idx);
2055   }
2056 
2057   // VOP2 src0 instructions support all operand types, so we don't need to check
2058   // their legality. If src1 is already legal, we don't need to do anything.
2059   if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1))
2060     return;
2061 
2062   // We do not use commuteInstruction here because it is too aggressive and will
2063   // commute if it is possible. We only want to commute here if it improves
2064   // legality. This can be called a fairly large number of times so don't waste
2065   // compile time pointlessly swapping and checking legality again.
2066   if (HasImplicitSGPR || !MI->isCommutable()) {
2067     legalizeOpWithMove(MI, Src1Idx);
2068     return;
2069   }
2070 
2071   int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2072   MachineOperand &Src0 = MI->getOperand(Src0Idx);
2073 
2074   // If src0 can be used as src1, commuting will make the operands legal.
2075   // Otherwise we have to give up and insert a move.
2076   //
2077   // TODO: Other immediate-like operand kinds could be commuted if there was a
2078   // MachineOperand::ChangeTo* for them.
2079   if ((!Src1.isImm() && !Src1.isReg()) ||
2080       !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) {
2081     legalizeOpWithMove(MI, Src1Idx);
2082     return;
2083   }
2084 
2085   int CommutedOpc = commuteOpcode(*MI);
2086   if (CommutedOpc == -1) {
2087     legalizeOpWithMove(MI, Src1Idx);
2088     return;
2089   }
2090 
2091   MI->setDesc(get(CommutedOpc));
2092 
2093   unsigned Src0Reg = Src0.getReg();
2094   unsigned Src0SubReg = Src0.getSubReg();
2095   bool Src0Kill = Src0.isKill();
2096 
2097   if (Src1.isImm())
2098     Src0.ChangeToImmediate(Src1.getImm());
2099   else if (Src1.isReg()) {
2100     Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
2101     Src0.setSubReg(Src1.getSubReg());
2102   } else
2103     llvm_unreachable("Should only have register or immediate operands");
2104 
2105   Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
2106   Src1.setSubReg(Src0SubReg);
2107 }
2108 
2109 // Legalize VOP3 operands. Because all operand types are supported for any
2110 // operand, and since literal constants are not allowed and should never be
2111 // seen, we only need to worry about inserting copies if we use multiple SGPR
2112 // operands.
2113 void SIInstrInfo::legalizeOperandsVOP3(
2114   MachineRegisterInfo &MRI,
2115   MachineInstr *MI) const {
2116   unsigned Opc = MI->getOpcode();
2117 
2118   int VOP3Idx[3] = {
2119     AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
2120     AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
2121     AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
2122   };
2123 
2124   // Find the one SGPR operand we are allowed to use.
2125   unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx);
2126 
2127   for (unsigned i = 0; i < 3; ++i) {
2128     int Idx = VOP3Idx[i];
2129     if (Idx == -1)
2130       break;
2131     MachineOperand &MO = MI->getOperand(Idx);
2132 
2133     // We should never see a VOP3 instruction with an illegal immediate operand.
2134     if (!MO.isReg())
2135       continue;
2136 
2137     if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
2138       continue; // VGPRs are legal
2139 
2140     if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) {
2141       SGPRReg = MO.getReg();
2142       // We can use one SGPR in each VOP3 instruction.
2143       continue;
2144     }
2145 
2146     // If we make it this far, then the operand is not legal and we must
2147     // legalize it.
2148     legalizeOpWithMove(MI, Idx);
2149   }
2150 }
2151 
2152 unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr *UseMI,
2153                                  MachineRegisterInfo &MRI) const {
2154   const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
2155   const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
2156   unsigned DstReg = MRI.createVirtualRegister(SRC);
2157   unsigned SubRegs = VRC->getSize() / 4;
2158 
2159   SmallVector<unsigned, 8> SRegs;
2160   for (unsigned i = 0; i < SubRegs; ++i) {
2161     unsigned SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
2162     BuildMI(*UseMI->getParent(), UseMI, UseMI->getDebugLoc(),
2163             get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
2164             .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
2165     SRegs.push_back(SGPR);
2166   }
2167 
2168   MachineInstrBuilder MIB = BuildMI(*UseMI->getParent(), UseMI,
2169                                     UseMI->getDebugLoc(),
2170                                     get(AMDGPU::REG_SEQUENCE), DstReg);
2171   for (unsigned i = 0; i < SubRegs; ++i) {
2172     MIB.addReg(SRegs[i]);
2173     MIB.addImm(RI.getSubRegFromChannel(i));
2174   }
2175   return DstReg;
2176 }
2177 
2178 void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
2179                                        MachineInstr *MI) const {
2180 
2181   // If the pointer is store in VGPRs, then we need to move them to
2182   // SGPRs using v_readfirstlane.  This is safe because we only select
2183   // loads with uniform pointers to SMRD instruction so we know the
2184   // pointer value is uniform.
2185   MachineOperand *SBase = getNamedOperand(*MI, AMDGPU::OpName::sbase);
2186   if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
2187       unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
2188       SBase->setReg(SGPR);
2189   }
2190 }
2191 
/// \brief Rewrite the operands of \p MI so that they are legal for its
///        opcode.
///
/// VOP2/VOPC, VOP3 and SMRD instructions are forwarded to their dedicated
/// helpers. PHI, REG_SEQUENCE, INSERT_SUBREG and MIMG instructions, as well as
/// instructions with an srsrc operand, are handled here by inserting copies,
/// readlaneVGPRToSGPR() sequences, or (for srsrc) by moving the pointer out of
/// the resource descriptor into vaddr.
void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
  MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();

  // Legalize VOP2
  if (isVOP2(*MI) || isVOPC(*MI)) {
    legalizeOperandsVOP2(MRI, MI);
    return;
  }

  // Legalize VOP3
  if (isVOP3(*MI)) {
    legalizeOperandsVOP3(MRI, MI);
    return;
  }

  // Legalize SMRD
  if (isSMRD(*MI)) {
    legalizeOperandsSMRD(MRI, MI);
    return;
  }

  // Legalize REG_SEQUENCE and PHI
  // The register class of the operands must be the same type as the register
  // class of the output.
  if (MI->getOpcode() == AMDGPU::PHI) {
    const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
    // Incoming values sit at the odd operand indexes; each is followed by its
    // predecessor block. Remember one VGPR class and one SGPR class seen.
    for (unsigned i = 1, e = MI->getNumOperands(); i != e; i+=2) {
      if (!MI->getOperand(i).isReg() ||
          !TargetRegisterInfo::isVirtualRegister(MI->getOperand(i).getReg()))
        continue;
      const TargetRegisterClass *OpRC =
              MRI.getRegClass(MI->getOperand(i).getReg());
      if (RI.hasVGPRs(OpRC)) {
        VRC = OpRC;
      } else {
        SRC = OpRC;
      }
    }

    // If any of the operands are VGPR registers, then they all must be,
    // otherwise we will create illegal VGPR->SGPR copies when legalizing
    // them.
    if (VRC || !RI.isSGPRClass(getOpRegClass(*MI, 0))) {
      if (!VRC) {
        assert(SRC);
        VRC = RI.getEquivalentVGPRClass(SRC);
      }
      RC = VRC;
    } else {
      RC = SRC;
    }

    // Update all the operands so they have the same type.
    for (unsigned I = 1, E = MI->getNumOperands(); I != E; I += 2) {
      MachineOperand &Op = MI->getOperand(I);
      if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
        continue;
      unsigned DstReg = MRI.createVirtualRegister(RC);

      // MI is a PHI instruction, so the copy has to be placed in the
      // predecessor block the value comes from, ahead of its terminators.
      MachineBasicBlock *InsertBB = MI->getOperand(I + 1).getMBB();
      MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();

      BuildMI(*InsertBB, Insert, MI->getDebugLoc(), get(AMDGPU::COPY), DstReg)
        .addOperand(Op);
      Op.setReg(DstReg);
    }
  }

  // REG_SEQUENCE doesn't really require operand legalization, but if one has a
  // VGPR dest type and SGPR sources, insert copies so all operands are
  // VGPRs. This seems to help operand folding / the register coalescer.
  if (MI->getOpcode() == AMDGPU::REG_SEQUENCE) {
    MachineBasicBlock *MBB = MI->getParent();
    const TargetRegisterClass *DstRC = getOpRegClass(*MI, 0);
    if (RI.hasVGPRs(DstRC)) {
      // Update all the operands so they are VGPR register classes. These may
      // not be the same register class because REG_SEQUENCE supports mixing
      // subregister index types e.g. sub0_sub1 + sub2 + sub3
      for (unsigned I = 1, E = MI->getNumOperands(); I != E; I += 2) {
        MachineOperand &Op = MI->getOperand(I);
        if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
          continue;

        // Operands whose class already is its own VGPR equivalent need no
        // copy.
        const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
        const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
        if (VRC == OpRC)
          continue;

        unsigned DstReg = MRI.createVirtualRegister(VRC);

        BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::COPY), DstReg)
          .addOperand(Op);

        Op.setReg(DstReg);
        Op.setIsKill();
      }
    }

    return;
  }

  // Legalize INSERT_SUBREG
  // src0 must have the same register class as dst
  if (MI->getOpcode() == AMDGPU::INSERT_SUBREG) {
    unsigned Dst = MI->getOperand(0).getReg();
    unsigned Src0 = MI->getOperand(1).getReg();
    const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
    const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
    if (DstRC != Src0RC) {
      MachineBasicBlock &MBB = *MI->getParent();
      unsigned NewSrc0 = MRI.createVirtualRegister(DstRC);
      BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::COPY), NewSrc0)
              .addReg(Src0);
      MI->getOperand(1).setReg(NewSrc0);
    }
    return;
  }

  // Legalize MIMG
  // srsrc and ssamp are moved to SGPRs when they are not in an SGPR class.
  if (isMIMG(*MI)) {
    MachineOperand *SRsrc = getNamedOperand(*MI, AMDGPU::OpName::srsrc);
    if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) {
      unsigned SGPR = readlaneVGPRToSGPR(SRsrc->getReg(), MI, MRI);
      SRsrc->setReg(SGPR);
    }

    MachineOperand *SSamp = getNamedOperand(*MI, AMDGPU::OpName::ssamp);
    if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) {
      unsigned SGPR = readlaneVGPRToSGPR(SSamp->getReg(), MI, MRI);
      SSamp->setReg(SGPR);
    }
    return;
  }

  // Legalize MUBUF* instructions
  // FIXME: If we start using the non-addr64 instructions for compute, we
  // may need to legalize them here.
  int SRsrcIdx =
      AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
  if (SRsrcIdx != -1) {
    // We have an MUBUF instruction
    MachineOperand *SRsrc = &MI->getOperand(SRsrcIdx);
    unsigned SRsrcRC = get(MI->getOpcode()).OpInfo[SRsrcIdx].RegClass;
    if (RI.getCommonSubClass(MRI.getRegClass(SRsrc->getReg()),
                                             RI.getRegClass(SRsrcRC))) {
      // The operands are legal.
      // FIXME: We may need to legalize operands besides srsrc.
      return;
    }

    MachineBasicBlock &MBB = *MI->getParent();

    // Extract the ptr from the resource descriptor.
    unsigned SRsrcPtr = buildExtractSubReg(MI, MRI, *SRsrc,
      &AMDGPU::VReg_128RegClass, AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);

    // Create an empty resource descriptor
    unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
    uint64_t RsrcDataFormat = getDefaultRsrcDataFormat();

    // Zero64 = 0
    BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B64),
            Zero64)
            .addImm(0);

    // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
    BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
            SRsrcFormatLo)
            .addImm(RsrcDataFormat & 0xFFFFFFFF);

    // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
    BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
            SRsrcFormatHi)
            .addImm(RsrcDataFormat >> 32);

    // NewSRsrc = {Zero64, SRsrcFormat}
    BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewSRsrc)
      .addReg(Zero64)
      .addImm(AMDGPU::sub0_sub1)
      .addReg(SRsrcFormatLo)
      .addImm(AMDGPU::sub2)
      .addReg(SRsrcFormatHi)
      .addImm(AMDGPU::sub3);

    MachineOperand *VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr);
    unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
    if (VAddr) {
      // This is already an ADDR64 instruction so we need to add the pointer
      // extracted from the resource descriptor to the current value of VAddr.
      unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
      unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

      // NewVaddrLo = SRsrcPtr:sub0 + VAddr:sub0
      DebugLoc DL = MI->getDebugLoc();
      BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo)
        .addReg(SRsrcPtr, 0, AMDGPU::sub0)
        .addReg(VAddr->getReg(), 0, AMDGPU::sub0);

      // NewVaddrHi = SRsrcPtr:sub1 + VAddr:sub1
      BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi)
        .addReg(SRsrcPtr, 0, AMDGPU::sub1)
        .addReg(VAddr->getReg(), 0, AMDGPU::sub1);

      // NewVaddr = {NewVaddrHi, NewVaddrLo}
      BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
        .addReg(NewVAddrLo)
        .addImm(AMDGPU::sub0)
        .addReg(NewVAddrHi)
        .addImm(AMDGPU::sub1);
    } else {
      // This instruction is the _OFFSET variant, so we need to convert it to
      // ADDR64.
      assert(MBB.getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration()
             < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
             "FIXME: Need to emit flat atomics here");

      MachineOperand *VData = getNamedOperand(*MI, AMDGPU::OpName::vdata);
      MachineOperand *Offset = getNamedOperand(*MI, AMDGPU::OpName::offset);
      MachineOperand *SOffset = getNamedOperand(*MI, AMDGPU::OpName::soffset);
      unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI->getOpcode());

      // Atomics with return have an additional tied operand and are
      // missing some of the special bits.
      MachineOperand *VDataIn = getNamedOperand(*MI, AMDGPU::OpName::vdata_in);
      MachineInstr *Addr64;

      if (!VDataIn) {
        // Regular buffer load / store.
        MachineInstrBuilder MIB
          = BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode))
          .addOperand(*VData)
          .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
          // This will be replaced later
          // with the new value of vaddr.
          .addOperand(*SRsrc)
          .addOperand(*SOffset)
          .addOperand(*Offset);

        // Atomics do not have this operand.
        if (const MachineOperand *GLC
            = getNamedOperand(*MI, AMDGPU::OpName::glc)) {
          MIB.addImm(GLC->getImm());
        }

        MIB.addImm(getNamedImmOperand(*MI, AMDGPU::OpName::slc));

        // tfe is only copied over when the original instruction has it.
        if (const MachineOperand *TFE
            = getNamedOperand(*MI, AMDGPU::OpName::tfe)) {
          MIB.addImm(TFE->getImm());
        }

        MIB.setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
        Addr64 = MIB;
      } else {
        // Atomics with return.
        Addr64 = BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode))
          .addOperand(*VData)
          .addOperand(*VDataIn)
          .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
          // This will be replaced later
          // with the new value of vaddr.
          .addOperand(*SRsrc)
          .addOperand(*SOffset)
          .addOperand(*Offset)
          .addImm(getNamedImmOperand(*MI, AMDGPU::OpName::slc))
          .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
      }

      // The original instruction is unlinked from its block (not erased) and
      // the rest of this function operates on its ADDR64 replacement.
      MI->removeFromParent();
      MI = Addr64;

      // NewVaddr = {NewVaddrHi, NewVaddrLo}
      BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
        .addReg(SRsrcPtr, 0, AMDGPU::sub0)
        .addImm(AMDGPU::sub0)
        .addReg(SRsrcPtr, 0, AMDGPU::sub1)
        .addImm(AMDGPU::sub1);

      // Re-fetch the operands: they now have to refer to the new instruction.
      VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr);
      SRsrc = getNamedOperand(*MI, AMDGPU::OpName::srsrc);
    }

    // Update the instruction to use NewVaddr
    VAddr->setReg(NewVAddr);
    // Update the instruction to use NewSRsrc
    SRsrc->setReg(NewSRsrc);
  }
}
2484 
2485 void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
2486   SmallVector<MachineInstr *, 128> Worklist;
2487   Worklist.push_back(&TopInst);
2488 
2489   while (!Worklist.empty()) {
2490     MachineInstr *Inst = Worklist.pop_back_val();
2491     MachineBasicBlock *MBB = Inst->getParent();
2492     MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
2493 
2494     unsigned Opcode = Inst->getOpcode();
2495     unsigned NewOpcode = getVALUOp(*Inst);
2496 
2497     // Handle some special cases
2498     switch (Opcode) {
2499     default:
2500       break;
2501     case AMDGPU::S_AND_B64:
2502       splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64);
2503       Inst->eraseFromParent();
2504       continue;
2505 
2506     case AMDGPU::S_OR_B64:
2507       splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_OR_B32_e64);
2508       Inst->eraseFromParent();
2509       continue;
2510 
2511     case AMDGPU::S_XOR_B64:
2512       splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64);
2513       Inst->eraseFromParent();
2514       continue;
2515 
2516     case AMDGPU::S_NOT_B64:
2517       splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::V_NOT_B32_e32);
2518       Inst->eraseFromParent();
2519       continue;
2520 
2521     case AMDGPU::S_BCNT1_I32_B64:
2522       splitScalar64BitBCNT(Worklist, Inst);
2523       Inst->eraseFromParent();
2524       continue;
2525 
2526     case AMDGPU::S_BFE_I64: {
2527       splitScalar64BitBFE(Worklist, Inst);
2528       Inst->eraseFromParent();
2529       continue;
2530     }
2531 
2532     case AMDGPU::S_LSHL_B32:
2533       if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
2534         NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
2535         swapOperands(Inst);
2536       }
2537       break;
2538     case AMDGPU::S_ASHR_I32:
2539       if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
2540         NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
2541         swapOperands(Inst);
2542       }
2543       break;
2544     case AMDGPU::S_LSHR_B32:
2545       if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
2546         NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
2547         swapOperands(Inst);
2548       }
2549       break;
2550     case AMDGPU::S_LSHL_B64:
2551       if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
2552         NewOpcode = AMDGPU::V_LSHLREV_B64;
2553         swapOperands(Inst);
2554       }
2555       break;
2556     case AMDGPU::S_ASHR_I64:
2557       if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
2558         NewOpcode = AMDGPU::V_ASHRREV_I64;
2559         swapOperands(Inst);
2560       }
2561       break;
2562     case AMDGPU::S_LSHR_B64:
2563       if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
2564         NewOpcode = AMDGPU::V_LSHRREV_B64;
2565         swapOperands(Inst);
2566       }
2567       break;
2568 
2569     case AMDGPU::S_ABS_I32:
2570       lowerScalarAbs(Worklist, Inst);
2571       Inst->eraseFromParent();
2572       continue;
2573 
2574     case AMDGPU::S_CBRANCH_SCC0:
2575     case AMDGPU::S_CBRANCH_SCC1:
2576       // Clear unused bits of vcc
2577       BuildMI(*MBB, Inst, Inst->getDebugLoc(), get(AMDGPU::S_AND_B64), AMDGPU::VCC)
2578               .addReg(AMDGPU::EXEC)
2579               .addReg(AMDGPU::VCC);
2580       break;
2581 
2582     case AMDGPU::S_BFE_U64:
2583     case AMDGPU::S_BFM_B64:
2584       llvm_unreachable("Moving this op to VALU not implemented");
2585     }
2586 
2587     if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
2588       // We cannot move this instruction to the VALU, so we should try to
2589       // legalize its operands instead.
2590       legalizeOperands(Inst);
2591       continue;
2592     }
2593 
2594     // Use the new VALU Opcode.
2595     const MCInstrDesc &NewDesc = get(NewOpcode);
2596     Inst->setDesc(NewDesc);
2597 
2598     // Remove any references to SCC. Vector instructions can't read from it, and
2599     // We're just about to add the implicit use / defs of VCC, and we don't want
2600     // both.
2601     for (unsigned i = Inst->getNumOperands() - 1; i > 0; --i) {
2602       MachineOperand &Op = Inst->getOperand(i);
2603       if (Op.isReg() && Op.getReg() == AMDGPU::SCC) {
2604         Inst->RemoveOperand(i);
2605         addSCCDefUsersToVALUWorklist(Inst, Worklist);
2606       }
2607     }
2608 
2609     if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
2610       // We are converting these to a BFE, so we need to add the missing
2611       // operands for the size and offset.
2612       unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
2613       Inst->addOperand(MachineOperand::CreateImm(0));
2614       Inst->addOperand(MachineOperand::CreateImm(Size));
2615 
2616     } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
2617       // The VALU version adds the second operand to the result, so insert an
2618       // extra 0 operand.
2619       Inst->addOperand(MachineOperand::CreateImm(0));
2620     }
2621 
2622     Inst->addImplicitDefUseOperands(*Inst->getParent()->getParent());
2623 
2624     if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
2625       const MachineOperand &OffsetWidthOp = Inst->getOperand(2);
2626       // If we need to move this to VGPRs, we need to unpack the second operand
2627       // back into the 2 separate ones for bit offset and width.
2628       assert(OffsetWidthOp.isImm() &&
2629              "Scalar BFE is only implemented for constant width and offset");
2630       uint32_t Imm = OffsetWidthOp.getImm();
2631 
2632       uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
2633       uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
2634       Inst->RemoveOperand(2); // Remove old immediate.
2635       Inst->addOperand(MachineOperand::CreateImm(Offset));
2636       Inst->addOperand(MachineOperand::CreateImm(BitWidth));
2637     }
2638 
2639     bool HasDst = Inst->getOperand(0).isReg() && Inst->getOperand(0).isDef();
2640     unsigned NewDstReg = AMDGPU::NoRegister;
2641     if (HasDst) {
2642       // Update the destination register class.
2643       const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*Inst);
2644       if (!NewDstRC)
2645         continue;
2646 
2647       unsigned DstReg = Inst->getOperand(0).getReg();
2648       NewDstReg = MRI.createVirtualRegister(NewDstRC);
2649       MRI.replaceRegWith(DstReg, NewDstReg);
2650     }
2651 
2652     // Legalize the operands
2653     legalizeOperands(Inst);
2654 
2655     if (HasDst)
2656      addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
2657   }
2658 }
2659 
2660 //===----------------------------------------------------------------------===//
2661 // Indirect addressing callbacks
2662 //===----------------------------------------------------------------------===//
2663 
2664 const TargetRegisterClass *SIInstrInfo::getIndirectAddrRegClass() const {
2665   return &AMDGPU::VGPR_32RegClass;
2666 }
2667 
2668 void SIInstrInfo::lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist,
2669                                  MachineInstr *Inst) const {
2670   MachineBasicBlock &MBB = *Inst->getParent();
2671   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
2672   MachineBasicBlock::iterator MII = Inst;
2673   DebugLoc DL = Inst->getDebugLoc();
2674 
2675   MachineOperand &Dest = Inst->getOperand(0);
2676   MachineOperand &Src = Inst->getOperand(1);
2677   unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2678   unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2679 
2680   BuildMI(MBB, MII, DL, get(AMDGPU::V_SUB_I32_e32), TmpReg)
2681     .addImm(0)
2682     .addReg(Src.getReg());
2683 
2684   BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
2685     .addReg(Src.getReg())
2686     .addReg(TmpReg);
2687 
2688   MRI.replaceRegWith(Dest.getReg(), ResultReg);
2689   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
2690 }
2691 
2692 void SIInstrInfo::splitScalar64BitUnaryOp(
2693   SmallVectorImpl<MachineInstr *> &Worklist,
2694   MachineInstr *Inst,
2695   unsigned Opcode) const {
2696   MachineBasicBlock &MBB = *Inst->getParent();
2697   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
2698 
2699   MachineOperand &Dest = Inst->getOperand(0);
2700   MachineOperand &Src0 = Inst->getOperand(1);
2701   DebugLoc DL = Inst->getDebugLoc();
2702 
2703   MachineBasicBlock::iterator MII = Inst;
2704 
2705   const MCInstrDesc &InstDesc = get(Opcode);
2706   const TargetRegisterClass *Src0RC = Src0.isReg() ?
2707     MRI.getRegClass(Src0.getReg()) :
2708     &AMDGPU::SGPR_32RegClass;
2709 
2710   const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
2711 
2712   MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
2713                                                        AMDGPU::sub0, Src0SubRC);
2714 
2715   const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
2716   const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
2717   const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
2718 
2719   unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
2720   BuildMI(MBB, MII, DL, InstDesc, DestSub0)
2721     .addOperand(SrcReg0Sub0);
2722 
2723   MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
2724                                                        AMDGPU::sub1, Src0SubRC);
2725 
2726   unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
2727   BuildMI(MBB, MII, DL, InstDesc, DestSub1)
2728     .addOperand(SrcReg0Sub1);
2729 
2730   unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
2731   BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
2732     .addReg(DestSub0)
2733     .addImm(AMDGPU::sub0)
2734     .addReg(DestSub1)
2735     .addImm(AMDGPU::sub1);
2736 
2737   MRI.replaceRegWith(Dest.getReg(), FullDestReg);
2738 
2739   // We don't need to legalizeOperands here because for a single operand, src0
2740   // will support any kind of input.
2741 
2742   // Move all users of this moved value.
2743   addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
2744 }
2745 
2746 void SIInstrInfo::splitScalar64BitBinaryOp(
2747   SmallVectorImpl<MachineInstr *> &Worklist,
2748   MachineInstr *Inst,
2749   unsigned Opcode) const {
2750   MachineBasicBlock &MBB = *Inst->getParent();
2751   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
2752 
2753   MachineOperand &Dest = Inst->getOperand(0);
2754   MachineOperand &Src0 = Inst->getOperand(1);
2755   MachineOperand &Src1 = Inst->getOperand(2);
2756   DebugLoc DL = Inst->getDebugLoc();
2757 
2758   MachineBasicBlock::iterator MII = Inst;
2759 
2760   const MCInstrDesc &InstDesc = get(Opcode);
2761   const TargetRegisterClass *Src0RC = Src0.isReg() ?
2762     MRI.getRegClass(Src0.getReg()) :
2763     &AMDGPU::SGPR_32RegClass;
2764 
2765   const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
2766   const TargetRegisterClass *Src1RC = Src1.isReg() ?
2767     MRI.getRegClass(Src1.getReg()) :
2768     &AMDGPU::SGPR_32RegClass;
2769 
2770   const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);
2771 
2772   MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
2773                                                        AMDGPU::sub0, Src0SubRC);
2774   MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
2775                                                        AMDGPU::sub0, Src1SubRC);
2776 
2777   const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
2778   const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
2779   const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
2780 
2781   unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
2782   MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub0)
2783     .addOperand(SrcReg0Sub0)
2784     .addOperand(SrcReg1Sub0);
2785 
2786   MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
2787                                                        AMDGPU::sub1, Src0SubRC);
2788   MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
2789                                                        AMDGPU::sub1, Src1SubRC);
2790 
2791   unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
2792   MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1)
2793     .addOperand(SrcReg0Sub1)
2794     .addOperand(SrcReg1Sub1);
2795 
2796   unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
2797   BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
2798     .addReg(DestSub0)
2799     .addImm(AMDGPU::sub0)
2800     .addReg(DestSub1)
2801     .addImm(AMDGPU::sub1);
2802 
2803   MRI.replaceRegWith(Dest.getReg(), FullDestReg);
2804 
2805   // Try to legalize the operands in case we need to swap the order to keep it
2806   // valid.
2807   legalizeOperands(LoHalf);
2808   legalizeOperands(HiHalf);
2809 
2810   // Move all users of this moved vlaue.
2811   addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
2812 }
2813 
2814 void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist,
2815                                        MachineInstr *Inst) const {
2816   MachineBasicBlock &MBB = *Inst->getParent();
2817   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
2818 
2819   MachineBasicBlock::iterator MII = Inst;
2820   DebugLoc DL = Inst->getDebugLoc();
2821 
2822   MachineOperand &Dest = Inst->getOperand(0);
2823   MachineOperand &Src = Inst->getOperand(1);
2824 
2825   const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
2826   const TargetRegisterClass *SrcRC = Src.isReg() ?
2827     MRI.getRegClass(Src.getReg()) :
2828     &AMDGPU::SGPR_32RegClass;
2829 
2830   unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2831   unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2832 
2833   const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0);
2834 
2835   MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
2836                                                       AMDGPU::sub0, SrcSubRC);
2837   MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
2838                                                       AMDGPU::sub1, SrcSubRC);
2839 
2840   BuildMI(MBB, MII, DL, InstDesc, MidReg)
2841     .addOperand(SrcRegSub0)
2842     .addImm(0);
2843 
2844   BuildMI(MBB, MII, DL, InstDesc, ResultReg)
2845     .addOperand(SrcRegSub1)
2846     .addReg(MidReg);
2847 
2848   MRI.replaceRegWith(Dest.getReg(), ResultReg);
2849 
2850   // We don't need to legalize operands here. src0 for etiher instruction can be
2851   // an SGPR, and the second input is unused or determined here.
2852   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
2853 }
2854 
2855 void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
2856                                       MachineInstr *Inst) const {
2857   MachineBasicBlock &MBB = *Inst->getParent();
2858   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
2859   MachineBasicBlock::iterator MII = Inst;
2860   DebugLoc DL = Inst->getDebugLoc();
2861 
2862   MachineOperand &Dest = Inst->getOperand(0);
2863   uint32_t Imm = Inst->getOperand(2).getImm();
2864   uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
2865   uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
2866 
2867   (void) Offset;
2868 
2869   // Only sext_inreg cases handled.
2870   assert(Inst->getOpcode() == AMDGPU::S_BFE_I64 &&
2871          BitWidth <= 32 &&
2872          Offset == 0 &&
2873          "Not implemented");
2874 
2875   if (BitWidth < 32) {
2876     unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2877     unsigned MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2878     unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
2879 
2880     BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo)
2881       .addReg(Inst->getOperand(1).getReg(), 0, AMDGPU::sub0)
2882       .addImm(0)
2883       .addImm(BitWidth);
2884 
2885     BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
2886       .addImm(31)
2887       .addReg(MidRegLo);
2888 
2889     BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
2890       .addReg(MidRegLo)
2891       .addImm(AMDGPU::sub0)
2892       .addReg(MidRegHi)
2893       .addImm(AMDGPU::sub1);
2894 
2895     MRI.replaceRegWith(Dest.getReg(), ResultReg);
2896     addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
2897     return;
2898   }
2899 
2900   MachineOperand &Src = Inst->getOperand(1);
2901   unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2902   unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
2903 
2904   BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
2905     .addImm(31)
2906     .addReg(Src.getReg(), 0, AMDGPU::sub0);
2907 
2908   BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
2909     .addReg(Src.getReg(), 0, AMDGPU::sub0)
2910     .addImm(AMDGPU::sub0)
2911     .addReg(TmpReg)
2912     .addImm(AMDGPU::sub1);
2913 
2914   MRI.replaceRegWith(Dest.getReg(), ResultReg);
2915   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
2916 }
2917 
2918 void SIInstrInfo::addUsersToMoveToVALUWorklist(
2919   unsigned DstReg,
2920   MachineRegisterInfo &MRI,
2921   SmallVectorImpl<MachineInstr *> &Worklist) const {
2922   for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
2923          E = MRI.use_end(); I != E; ++I) {
2924     MachineInstr &UseMI = *I->getParent();
2925     if (!canReadVGPR(UseMI, I.getOperandNo())) {
2926       Worklist.push_back(&UseMI);
2927     }
2928   }
2929 }
2930 
2931 void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineInstr *SCCDefInst,
2932                               SmallVectorImpl<MachineInstr *> &Worklist) const {
2933   // This assumes that all the users of SCC are in the same block
2934   // as the SCC def.
2935   for (MachineBasicBlock::iterator I = SCCDefInst,
2936        E = SCCDefInst->getParent()->end(); I != E; ++I) {
2937 
2938     // Exit if we find another SCC def.
2939     if (I->findRegisterDefOperandIdx(AMDGPU::SCC) != -1)
2940       return;
2941 
2942     if (I->findRegisterUseOperandIdx(AMDGPU::SCC) != -1)
2943       Worklist.push_back(I);
2944   }
2945 }
2946 
2947 const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
2948   const MachineInstr &Inst) const {
2949   const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
2950 
2951   switch (Inst.getOpcode()) {
2952   // For target instructions, getOpRegClass just returns the virtual register
2953   // class associated with the operand, so we need to find an equivalent VGPR
2954   // register class in order to move the instruction to the VALU.
2955   case AMDGPU::COPY:
2956   case AMDGPU::PHI:
2957   case AMDGPU::REG_SEQUENCE:
2958   case AMDGPU::INSERT_SUBREG:
2959     if (RI.hasVGPRs(NewDstRC))
2960       return nullptr;
2961 
2962     NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
2963     if (!NewDstRC)
2964       return nullptr;
2965     return NewDstRC;
2966   default:
2967     return NewDstRC;
2968   }
2969 }
2970 
2971 // Find the one SGPR operand we are allowed to use.
2972 unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI,
2973                                    int OpIndices[3]) const {
2974   const MCInstrDesc &Desc = MI->getDesc();
2975 
2976   // Find the one SGPR operand we are allowed to use.
2977   //
2978   // First we need to consider the instruction's operand requirements before
2979   // legalizing. Some operands are required to be SGPRs, such as implicit uses
2980   // of VCC, but we are still bound by the constant bus requirement to only use
2981   // one.
2982   //
2983   // If the operand's class is an SGPR, we can never move it.
2984 
2985   unsigned SGPRReg = findImplicitSGPRRead(*MI);
2986   if (SGPRReg != AMDGPU::NoRegister)
2987     return SGPRReg;
2988 
2989   unsigned UsedSGPRs[3] = { AMDGPU::NoRegister };
2990   const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
2991 
2992   for (unsigned i = 0; i < 3; ++i) {
2993     int Idx = OpIndices[i];
2994     if (Idx == -1)
2995       break;
2996 
2997     const MachineOperand &MO = MI->getOperand(Idx);
2998     if (!MO.isReg())
2999       continue;
3000 
3001     // Is this operand statically required to be an SGPR based on the operand
3002     // constraints?
3003     const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass);
3004     bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
3005     if (IsRequiredSGPR)
3006       return MO.getReg();
3007 
3008     // If this could be a VGPR or an SGPR, Check the dynamic register class.
3009     unsigned Reg = MO.getReg();
3010     const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
3011     if (RI.isSGPRClass(RegRC))
3012       UsedSGPRs[i] = Reg;
3013   }
3014 
3015   // We don't have a required SGPR operand, so we have a bit more freedom in
3016   // selecting operands to move.
3017 
3018   // Try to select the most used SGPR. If an SGPR is equal to one of the
3019   // others, we choose that.
3020   //
3021   // e.g.
3022   // V_FMA_F32 v0, s0, s0, s0 -> No moves
3023   // V_FMA_F32 v0, s0, s1, s0 -> Move s1
3024 
3025   // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
3026   // prefer those.
3027 
3028   if (UsedSGPRs[0] != AMDGPU::NoRegister) {
3029     if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
3030       SGPRReg = UsedSGPRs[0];
3031   }
3032 
3033   if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) {
3034     if (UsedSGPRs[1] == UsedSGPRs[2])
3035       SGPRReg = UsedSGPRs[1];
3036   }
3037 
3038   return SGPRReg;
3039 }
3040 
3041 void SIInstrInfo::reserveIndirectRegisters(BitVector &Reserved,
3042                                             const MachineFunction &MF) const {
3043   int End = getIndirectIndexEnd(MF);
3044   int Begin = getIndirectIndexBegin(MF);
3045 
3046   if (End == -1)
3047     return;
3048 
3049 
3050   for (int Index = Begin; Index <= End; ++Index)
3051     Reserved.set(AMDGPU::VGPR_32RegClass.getRegister(Index));
3052 
3053   for (int Index = std::max(0, Begin - 1); Index <= End; ++Index)
3054     Reserved.set(AMDGPU::VReg_64RegClass.getRegister(Index));
3055 
3056   for (int Index = std::max(0, Begin - 2); Index <= End; ++Index)
3057     Reserved.set(AMDGPU::VReg_96RegClass.getRegister(Index));
3058 
3059   for (int Index = std::max(0, Begin - 3); Index <= End; ++Index)
3060     Reserved.set(AMDGPU::VReg_128RegClass.getRegister(Index));
3061 
3062   for (int Index = std::max(0, Begin - 7); Index <= End; ++Index)
3063     Reserved.set(AMDGPU::VReg_256RegClass.getRegister(Index));
3064 
3065   for (int Index = std::max(0, Begin - 15); Index <= End; ++Index)
3066     Reserved.set(AMDGPU::VReg_512RegClass.getRegister(Index));
3067 }
3068 
3069 MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
3070                                              unsigned OperandName) const {
3071   int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
3072   if (Idx == -1)
3073     return nullptr;
3074 
3075   return &MI.getOperand(Idx);
3076 }
3077 
3078 uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
3079   uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
3080   if (ST.isAmdHsaOS()) {
3081     RsrcDataFormat |= (1ULL << 56);
3082 
3083     if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
3084       // Set MTYPE = 2
3085       RsrcDataFormat |= (2ULL << 59);
3086   }
3087 
3088   return RsrcDataFormat;
3089 }
3090 
3091 uint64_t SIInstrInfo::getScratchRsrcWords23() const {
3092   uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
3093                     AMDGPU::RSRC_TID_ENABLE |
3094                     0xffffffff; // Size;
3095 
3096   uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1;
3097 
3098   Rsrc23 |= (EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT) |
3099             // IndexStride = 64
3100             (UINT64_C(3) << AMDGPU::RSRC_INDEX_STRIDE_SHIFT);
3101 
3102   // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
3103   // Clear them unless we want a huge stride.
3104   if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
3105     Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
3106 
3107   return Rsrc23;
3108 }
3109 
3110 bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr *MI) const {
3111   unsigned Opc = MI->getOpcode();
3112 
3113   return isSMRD(Opc);
3114 }
3115 
3116 bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr *MI) const {
3117   unsigned Opc = MI->getOpcode();
3118 
3119   return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc);
3120 }
3121 
3122 unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
3123   unsigned Opc = MI.getOpcode();
3124   const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);
3125   unsigned DescSize = Desc.getSize();
3126 
3127   // If we have a definitive size, we can use it. Otherwise we need to inspect
3128   // the operands to know the size.
3129   if (DescSize == 8 || DescSize == 4)
3130     return DescSize;
3131 
3132   assert(DescSize == 0);
3133 
3134   // 4-byte instructions may have a 32-bit literal encoded after them. Check
3135   // operands that coud ever be literals.
3136   if (isVALU(MI) || isSALU(MI)) {
3137     int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
3138     if (Src0Idx == -1)
3139       return 4; // No operands.
3140 
3141     if (isLiteralConstant(MI.getOperand(Src0Idx), getOpSize(MI, Src0Idx)))
3142       return 8;
3143 
3144     int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
3145     if (Src1Idx == -1)
3146       return 4;
3147 
3148     if (isLiteralConstant(MI.getOperand(Src1Idx), getOpSize(MI, Src1Idx)))
3149       return 8;
3150 
3151     return 4;
3152   }
3153 
3154   switch (Opc) {
3155   case TargetOpcode::IMPLICIT_DEF:
3156   case TargetOpcode::KILL:
3157   case TargetOpcode::DBG_VALUE:
3158   case TargetOpcode::BUNDLE:
3159   case TargetOpcode::EH_LABEL:
3160     return 0;
3161   case TargetOpcode::INLINEASM: {
3162     const MachineFunction *MF = MI.getParent()->getParent();
3163     const char *AsmStr = MI.getOperand(0).getSymbolName();
3164     return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
3165   }
3166   default:
3167     llvm_unreachable("unable to find instruction size");
3168   }
3169 }
3170 
3171 ArrayRef<std::pair<int, const char *>>
3172 SIInstrInfo::getSerializableTargetIndices() const {
3173   static const std::pair<int, const char *> TargetIndices[] = {
3174       {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
3175       {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
3176       {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
3177       {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
3178       {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
3179   return makeArrayRef(TargetIndices);
3180 }
3181 
3182 /// This is used by the post-RA scheduler (SchedulePostRAList.cpp).  The
3183 /// post-RA version of misched uses CreateTargetMIHazardRecognizer.
3184 ScheduleHazardRecognizer *
3185 SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
3186                                             const ScheduleDAG *DAG) const {
3187   return new GCNHazardRecognizer(DAG->MF);
3188 }
3189 
3190 /// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
3191 /// pass.
3192 ScheduleHazardRecognizer *
3193 SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const {
3194   return new GCNHazardRecognizer(MF);
3195 }
3196