1 //===-- SIInstrInfo.cpp - SI Instruction Information  ---------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// \brief SI Implementation of TargetInstrInfo.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 #include "SIInstrInfo.h"
17 #include "AMDGPUTargetMachine.h"
18 #include "SIDefines.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "llvm/CodeGen/MachineFrameInfo.h"
21 #include "llvm/CodeGen/MachineInstrBuilder.h"
22 #include "llvm/CodeGen/MachineRegisterInfo.h"
23 #include "llvm/IR/Function.h"
24 #include "llvm/CodeGen/RegisterScavenging.h"
25 #include "llvm/MC/MCInstrDesc.h"
26 #include "llvm/Support/Debug.h"
27 
28 using namespace llvm;
29 
30 SIInstrInfo::SIInstrInfo(const AMDGPUSubtarget &st)
31     : AMDGPUInstrInfo(st), RI() {}
32 
33 //===----------------------------------------------------------------------===//
34 // TargetInstrInfo callbacks
35 //===----------------------------------------------------------------------===//
36 
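// Returns the number of operands of \p Node, not counting any trailing glue
// operands.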
37 static unsigned getNumOperandsNoGlue(SDNode *Node) {
38   unsigned N = Node->getNumOperands();
39   while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
40     --N;
41   return N;
42 }
43 
44 static SDValue findChainOperand(SDNode *Load) {
45   SDValue LastOp = Load->getOperand(getNumOperandsNoGlue(Load) - 1);
46   assert(LastOp.getValueType() == MVT::Other && "Chain missing from load node");
47   return LastOp;
48 }
49 
/// \brief Returns true if both nodes have the same value for the given
///        operand \p OpName, or if neither node has this operand.
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1, unsigned OpName) {
53   unsigned Opc0 = N0->getMachineOpcode();
54   unsigned Opc1 = N1->getMachineOpcode();
55 
56   int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
57   int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
58 
59   if (Op0Idx == -1 && Op1Idx == -1)
60     return true;
61 
63   if ((Op0Idx == -1 && Op1Idx != -1) ||
64       (Op1Idx == -1 && Op0Idx != -1))
65     return false;
66 
67   // getNamedOperandIdx returns the index for the MachineInstr's operands,
68   // which includes the result as the first operand. We are indexing into the
69   // MachineSDNode's operands, so we need to skip the result operand to get
70   // the real index.
71   --Op0Idx;
72   --Op1Idx;
73 
74   return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
75 }
76 
77 bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI,
78                                                     AliasAnalysis *AA) const {
  // TODO: The generic check fails for VALU instructions that should be
  // rematerializable; it rejects them because of their implicit reads of exec.
  // We really want all of the generic logic except for that exec check.
82   switch (MI->getOpcode()) {
83   case AMDGPU::V_MOV_B32_e32:
84   case AMDGPU::V_MOV_B32_e64:
85   case AMDGPU::V_MOV_B64_PSEUDO:
86     return true;
87   default:
88     return false;
89   }
90 }
91 
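// TargetInstrInfo hook (used for load clustering during scheduling): returns
// true if the two selected load nodes read from the same base pointer, and if
// so reports their immediate offsets in \p Offset0 and \p Offset1.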
92 bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
93                                           int64_t &Offset0,
94                                           int64_t &Offset1) const {
95   if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
96     return false;
97 
98   unsigned Opc0 = Load0->getMachineOpcode();
99   unsigned Opc1 = Load1->getMachineOpcode();
100 
101   // Make sure both are actually loads.
102   if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
103     return false;
104 
105   if (isDS(Opc0) && isDS(Opc1)) {
106 
107     // FIXME: Handle this case:
108     if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
109       return false;
110 
111     // Check base reg.
112     if (Load0->getOperand(1) != Load1->getOperand(1))
113       return false;
114 
115     // Check chain.
116     if (findChainOperand(Load0) != findChainOperand(Load1))
117       return false;
118 
119     // Skip read2 / write2 variants for simplicity.
    // TODO: We should report true if the used offsets are adjacent (excluding
    // the st64 versions).
122     if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::data1) != -1 ||
123         AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::data1) != -1)
124       return false;
125 
126     Offset0 = cast<ConstantSDNode>(Load0->getOperand(2))->getZExtValue();
127     Offset1 = cast<ConstantSDNode>(Load1->getOperand(2))->getZExtValue();
128     return true;
129   }
130 
131   if (isSMRD(Opc0) && isSMRD(Opc1)) {
132     assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1));
133 
134     // Check base reg.
135     if (Load0->getOperand(0) != Load1->getOperand(0))
136       return false;
137 
138     const ConstantSDNode *Load0Offset =
139         dyn_cast<ConstantSDNode>(Load0->getOperand(1));
140     const ConstantSDNode *Load1Offset =
141         dyn_cast<ConstantSDNode>(Load1->getOperand(1));
142 
143     if (!Load0Offset || !Load1Offset)
144       return false;
145 
146     // Check chain.
147     if (findChainOperand(Load0) != findChainOperand(Load1))
148       return false;
149 
150     Offset0 = Load0Offset->getZExtValue();
151     Offset1 = Load1Offset->getZExtValue();
152     return true;
153   }
154 
155   // MUBUF and MTBUF can access the same addresses.
156   if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
157 
158     // MUBUF and MTBUF have vaddr at different indices.
159     if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
160         findChainOperand(Load0) != findChainOperand(Load1) ||
161         !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
162         !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
163       return false;
164 
165     int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
166     int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
167 
168     if (OffIdx0 == -1 || OffIdx1 == -1)
169       return false;
170 
    // getNamedOperandIdx returns the index for MachineInstrs. Since they
    // include the output in the operand list, but SDNodes don't, we need to
    // subtract one from the index.
174     --OffIdx0;
175     --OffIdx1;
176 
177     SDValue Off0 = Load0->getOperand(OffIdx0);
178     SDValue Off1 = Load1->getOperand(OffIdx1);
179 
180     // The offset might be a FrameIndexSDNode.
181     if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
182       return false;
183 
184     Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue();
185     Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue();
186     return true;
187   }
188 
189   return false;
190 }
191 
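// The ST64 read2/write2 variants scale their offsets by 64 elements instead
// of one, which is accounted for below when computing the byte offset.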
192 static bool isStride64(unsigned Opc) {
193   switch (Opc) {
194   case AMDGPU::DS_READ2ST64_B32:
195   case AMDGPU::DS_READ2ST64_B64:
196   case AMDGPU::DS_WRITE2ST64_B32:
197   case AMDGPU::DS_WRITE2ST64_B64:
198     return true;
199   default:
200     return false;
201   }
202 }
203 
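// Try to decompose a load/store into a single base register plus a constant
// byte offset; used, among other things, for load/store clustering in the
// machine scheduler.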
204 bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
205                                         int64_t &Offset,
206                                         const TargetRegisterInfo *TRI) const {
207   unsigned Opc = LdSt->getOpcode();
208 
209   if (isDS(*LdSt)) {
210     const MachineOperand *OffsetImm = getNamedOperand(*LdSt,
211                                                       AMDGPU::OpName::offset);
212     if (OffsetImm) {
213       // Normal, single offset LDS instruction.
214       const MachineOperand *AddrReg = getNamedOperand(*LdSt,
215                                                       AMDGPU::OpName::addr);
216 
217       BaseReg = AddrReg->getReg();
218       Offset = OffsetImm->getImm();
219       return true;
220     }
221 
    // The two-offset instructions use offset0 and offset1 instead. We can
    // treat these as a load with a single offset if the two offsets are
    // consecutive. We will use this for some partially aligned loads.
225     const MachineOperand *Offset0Imm = getNamedOperand(*LdSt,
226                                                        AMDGPU::OpName::offset0);
227     // DS_PERMUTE does not have Offset0Imm (and Offset1Imm).
228     if (!Offset0Imm)
229       return false;
230 
231     const MachineOperand *Offset1Imm = getNamedOperand(*LdSt,
232                                                        AMDGPU::OpName::offset1);
233 
234     uint8_t Offset0 = Offset0Imm->getImm();
235     uint8_t Offset1 = Offset1Imm->getImm();
236 
237     if (Offset1 > Offset0 && Offset1 - Offset0 == 1) {
      // Each of these offsets is in element-sized units, so we need to convert
      // them to bytes based on the size of the individual reads.
240 
241       unsigned EltSize;
242       if (LdSt->mayLoad())
243         EltSize = getOpRegClass(*LdSt, 0)->getSize() / 2;
244       else {
245         assert(LdSt->mayStore());
246         int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
247         EltSize = getOpRegClass(*LdSt, Data0Idx)->getSize();
248       }
249 
250       if (isStride64(Opc))
251         EltSize *= 64;
252 
253       const MachineOperand *AddrReg = getNamedOperand(*LdSt,
254                                                       AMDGPU::OpName::addr);
255       BaseReg = AddrReg->getReg();
256       Offset = EltSize * Offset0;
257       return true;
258     }
259 
260     return false;
261   }
262 
263   if (isMUBUF(*LdSt) || isMTBUF(*LdSt)) {
264     if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset) != -1)
265       return false;
266 
267     const MachineOperand *AddrReg = getNamedOperand(*LdSt,
268                                                     AMDGPU::OpName::vaddr);
269     if (!AddrReg)
270       return false;
271 
272     const MachineOperand *OffsetImm = getNamedOperand(*LdSt,
273                                                       AMDGPU::OpName::offset);
274     BaseReg = AddrReg->getReg();
275     Offset = OffsetImm->getImm();
276     return true;
277   }
278 
279   if (isSMRD(*LdSt)) {
280     const MachineOperand *OffsetImm = getNamedOperand(*LdSt,
281                                                       AMDGPU::OpName::offset);
282     if (!OffsetImm)
283       return false;
284 
285     const MachineOperand *SBaseReg = getNamedOperand(*LdSt,
286                                                      AMDGPU::OpName::sbase);
287     BaseReg = SBaseReg->getReg();
288     Offset = OffsetImm->getImm();
289     return true;
290   }
291 
292   return false;
293 }
294 
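// Scheduler hook: decide whether \p SecondLdSt should be clustered with the
// load cluster started by \p FirstLdSt, based on the destination register
// width rather than just the instruction count.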
295 bool SIInstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt,
296                                      MachineInstr *SecondLdSt,
297                                      unsigned NumLoads) const {
  const MachineOperand *FirstDst = nullptr;
  const MachineOperand *SecondDst = nullptr;
300 
301   if (isDS(*FirstLdSt) && isDS(*SecondLdSt)) {
302     FirstDst = getNamedOperand(*FirstLdSt, AMDGPU::OpName::vdst);
303     SecondDst = getNamedOperand(*SecondLdSt, AMDGPU::OpName::vdst);
304   }
305 
  if (isSMRD(*FirstLdSt) && isSMRD(*SecondLdSt)) {
307     FirstDst = getNamedOperand(*FirstLdSt, AMDGPU::OpName::sdst);
308     SecondDst = getNamedOperand(*SecondLdSt, AMDGPU::OpName::sdst);
309   }
310 
311   if ((isMUBUF(*FirstLdSt) && isMUBUF(*SecondLdSt)) ||
312       (isMTBUF(*FirstLdSt) && isMTBUF(*SecondLdSt))) {
313     FirstDst = getNamedOperand(*FirstLdSt, AMDGPU::OpName::vdata);
314     SecondDst = getNamedOperand(*SecondLdSt, AMDGPU::OpName::vdata);
315   }
316 
317   if (!FirstDst || !SecondDst)
318     return false;
319 
320   // Try to limit clustering based on the total number of bytes loaded
321   // rather than the number of instructions.  This is done to help reduce
322   // register pressure.  The method used is somewhat inexact, though,
323   // because it assumes that all loads in the cluster will load the
324   // same number of bytes as FirstLdSt.
325 
326   // The unit of this value is bytes.
327   // FIXME: This needs finer tuning.
328   unsigned LoadClusterThreshold = 16;
329 
330   const MachineRegisterInfo &MRI =
331       FirstLdSt->getParent()->getParent()->getRegInfo();
332   const TargetRegisterClass *DstRC = MRI.getRegClass(FirstDst->getReg());
333 
334   return (NumLoads * DstRC->getSize()) <= LoadClusterThreshold;
335 }
336 
337 void
338 SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
339                          MachineBasicBlock::iterator MI, DebugLoc DL,
340                          unsigned DestReg, unsigned SrcReg,
341                          bool KillSrc) const {
342 
343   // If we are trying to copy to or from SCC, there is a bug somewhere else in
344   // the backend.  While it may be theoretically possible to do this, it should
345   // never be necessary.
346   assert(DestReg != AMDGPU::SCC && SrcReg != AMDGPU::SCC);
347 
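  // Sub-register index sequences used to split wide register copies into a
  // series of 32-bit moves (or 64-bit moves for SGPR tuples).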
348   static const int16_t Sub0_15[] = {
349     AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
350     AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
351     AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
352     AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
353   };
354 
355   static const int16_t Sub0_15_64[] = {
356     AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
357     AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
358     AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
359     AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
360   };
361 
362   static const int16_t Sub0_7[] = {
363     AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
364     AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
365   };
366 
367   static const int16_t Sub0_7_64[] = {
368     AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
369     AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
370   };
371 
372   static const int16_t Sub0_3[] = {
373     AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
374   };
375 
376   static const int16_t Sub0_3_64[] = {
377     AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
378   };
379 
380   static const int16_t Sub0_2[] = {
381     AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2,
382   };
383 
384   static const int16_t Sub0_1[] = {
385     AMDGPU::sub0, AMDGPU::sub1,
386   };
387 
388   unsigned Opcode;
389   ArrayRef<int16_t> SubIndices;
390   bool Forward;
391 
392   if (AMDGPU::SReg_32RegClass.contains(DestReg)) {
393     assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
394     BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
395             .addReg(SrcReg, getKillRegState(KillSrc));
396     return;
397 
398   } else if (AMDGPU::SReg_64RegClass.contains(DestReg)) {
399     if (DestReg == AMDGPU::VCC) {
400       if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
401         BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
402           .addReg(SrcReg, getKillRegState(KillSrc));
403       } else {
404         // FIXME: Hack until VReg_1 removed.
405         assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
406         BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_I32_e32))
407           .addImm(0)
408           .addReg(SrcReg, getKillRegState(KillSrc));
409       }
410 
411       return;
412     }
413 
414     assert(AMDGPU::SReg_64RegClass.contains(SrcReg));
415     BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
416             .addReg(SrcReg, getKillRegState(KillSrc));
417     return;
418 
419   } else if (AMDGPU::SReg_128RegClass.contains(DestReg)) {
420     assert(AMDGPU::SReg_128RegClass.contains(SrcReg));
421     Opcode = AMDGPU::S_MOV_B64;
422     SubIndices = Sub0_3_64;
423 
424   } else if (AMDGPU::SReg_256RegClass.contains(DestReg)) {
425     assert(AMDGPU::SReg_256RegClass.contains(SrcReg));
426     Opcode = AMDGPU::S_MOV_B64;
427     SubIndices = Sub0_7_64;
428 
429   } else if (AMDGPU::SReg_512RegClass.contains(DestReg)) {
430     assert(AMDGPU::SReg_512RegClass.contains(SrcReg));
431     Opcode = AMDGPU::S_MOV_B64;
432     SubIndices = Sub0_15_64;
433 
434   } else if (AMDGPU::VGPR_32RegClass.contains(DestReg)) {
435     assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
436            AMDGPU::SReg_32RegClass.contains(SrcReg));
437     BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
438             .addReg(SrcReg, getKillRegState(KillSrc));
439     return;
440 
441   } else if (AMDGPU::VReg_64RegClass.contains(DestReg)) {
442     assert(AMDGPU::VReg_64RegClass.contains(SrcReg) ||
443            AMDGPU::SReg_64RegClass.contains(SrcReg));
444     Opcode = AMDGPU::V_MOV_B32_e32;
445     SubIndices = Sub0_1;
446 
447   } else if (AMDGPU::VReg_96RegClass.contains(DestReg)) {
448     assert(AMDGPU::VReg_96RegClass.contains(SrcReg));
449     Opcode = AMDGPU::V_MOV_B32_e32;
450     SubIndices = Sub0_2;
451 
452   } else if (AMDGPU::VReg_128RegClass.contains(DestReg)) {
453     assert(AMDGPU::VReg_128RegClass.contains(SrcReg) ||
454            AMDGPU::SReg_128RegClass.contains(SrcReg));
455     Opcode = AMDGPU::V_MOV_B32_e32;
456     SubIndices = Sub0_3;
457 
458   } else if (AMDGPU::VReg_256RegClass.contains(DestReg)) {
459     assert(AMDGPU::VReg_256RegClass.contains(SrcReg) ||
460            AMDGPU::SReg_256RegClass.contains(SrcReg));
461     Opcode = AMDGPU::V_MOV_B32_e32;
462     SubIndices = Sub0_7;
463 
464   } else if (AMDGPU::VReg_512RegClass.contains(DestReg)) {
465     assert(AMDGPU::VReg_512RegClass.contains(SrcReg) ||
466            AMDGPU::SReg_512RegClass.contains(SrcReg));
467     Opcode = AMDGPU::V_MOV_B32_e32;
468     SubIndices = Sub0_15;
469 
470   } else {
471     llvm_unreachable("Can't copy register!");
472   }
473 
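  // Pick a copy direction that does not clobber source sub-registers before
  // they have been read when the source and destination tuples overlap.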
  Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
478 
479   for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
480     unsigned SubIdx;
481     if (Forward)
482       SubIdx = SubIndices[Idx];
483     else
484       SubIdx = SubIndices[SubIndices.size() - Idx - 1];
485 
486     MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
487       get(Opcode), RI.getSubReg(DestReg, SubIdx));
488 
489     Builder.addReg(RI.getSubReg(SrcReg, SubIdx));
490 
491     if (Idx == SubIndices.size() - 1)
492       Builder.addReg(SrcReg, RegState::Kill | RegState::Implicit);
493 
494     if (Idx == 0)
495       Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
496   }
497 }
498 
499 int SIInstrInfo::commuteOpcode(const MachineInstr &MI) const {
500   const unsigned Opcode = MI.getOpcode();
501 
502   int NewOpc;
503 
504   // Try to map original to commuted opcode
505   NewOpc = AMDGPU::getCommuteRev(Opcode);
506   if (NewOpc != -1)
507     // Check if the commuted (REV) opcode exists on the target.
508     return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
509 
510   // Try to map commuted to original opcode
511   NewOpc = AMDGPU::getCommuteOrig(Opcode);
512   if (NewOpc != -1)
513     // Check if the original (non-REV) opcode exists on the target.
514     return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
515 
516   return Opcode;
517 }
518 
519 unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
520 
521   if (DstRC->getSize() == 4) {
522     return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
523   } else if (DstRC->getSize() == 8 && RI.isSGPRClass(DstRC)) {
524     return AMDGPU::S_MOV_B64;
525   } else if (DstRC->getSize() == 8 && !RI.isSGPRClass(DstRC)) {
    return AMDGPU::V_MOV_B64_PSEUDO;
527   }
528   return AMDGPU::COPY;
529 }
530 
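// Map a spill size in bytes (TargetRegisterClass::getSize) to the matching
// spill pseudo; e.g. a size of 16 selects the 128-bit variant.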
531 static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
532   switch (Size) {
533   case 4:
534     return AMDGPU::SI_SPILL_S32_SAVE;
535   case 8:
536     return AMDGPU::SI_SPILL_S64_SAVE;
537   case 16:
538     return AMDGPU::SI_SPILL_S128_SAVE;
539   case 32:
540     return AMDGPU::SI_SPILL_S256_SAVE;
541   case 64:
542     return AMDGPU::SI_SPILL_S512_SAVE;
543   default:
544     llvm_unreachable("unknown register size");
545   }
546 }
547 
548 static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
549   switch (Size) {
550   case 4:
551     return AMDGPU::SI_SPILL_V32_SAVE;
552   case 8:
553     return AMDGPU::SI_SPILL_V64_SAVE;
554   case 16:
555     return AMDGPU::SI_SPILL_V128_SAVE;
556   case 32:
557     return AMDGPU::SI_SPILL_V256_SAVE;
558   case 64:
559     return AMDGPU::SI_SPILL_V512_SAVE;
560   default:
561     llvm_unreachable("unknown register size");
562   }
563 }
564 
565 void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
566                                       MachineBasicBlock::iterator MI,
567                                       unsigned SrcReg, bool isKill,
568                                       int FrameIndex,
569                                       const TargetRegisterClass *RC,
570                                       const TargetRegisterInfo *TRI) const {
571   MachineFunction *MF = MBB.getParent();
572   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
573   MachineFrameInfo *FrameInfo = MF->getFrameInfo();
574   DebugLoc DL = MBB.findDebugLoc(MI);
575 
576   unsigned Size = FrameInfo->getObjectSize(FrameIndex);
577   unsigned Align = FrameInfo->getObjectAlignment(FrameIndex);
578   MachinePointerInfo PtrInfo
579     = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
580   MachineMemOperand *MMO
581     = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
582                                Size, Align);
583 
584   if (RI.isSGPRClass(RC)) {
585     MFI->setHasSpilledSGPRs();
586 
    // We are only allowed to create one new instruction when spilling
    // registers, so we need to use a pseudo instruction for spilling
    // SGPRs.
590     unsigned Opcode = getSGPRSpillSaveOpcode(RC->getSize());
591     BuildMI(MBB, MI, DL, get(Opcode))
592       .addReg(SrcReg)            // src
593       .addFrameIndex(FrameIndex) // frame_idx
594       .addMemOperand(MMO);
595 
596     return;
597   }
598 
599   if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) {
600     LLVMContext &Ctx = MF->getFunction()->getContext();
601     Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to"
602                   " spill register");
603     BuildMI(MBB, MI, DL, get(AMDGPU::KILL))
604       .addReg(SrcReg);
605 
606     return;
607   }
608 
609   assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
610 
611   unsigned Opcode = getVGPRSpillSaveOpcode(RC->getSize());
612   MFI->setHasSpilledVGPRs();
613   BuildMI(MBB, MI, DL, get(Opcode))
614     .addReg(SrcReg)                   // src
615     .addFrameIndex(FrameIndex)        // frame_idx
616     .addReg(MFI->getScratchRSrcReg())       // scratch_rsrc
617     .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset
618     .addImm(0)                              // offset
619     .addMemOperand(MMO);
620 }
621 
622 static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
623   switch (Size) {
624   case 4:
625     return AMDGPU::SI_SPILL_S32_RESTORE;
626   case 8:
627     return AMDGPU::SI_SPILL_S64_RESTORE;
628   case 16:
629     return AMDGPU::SI_SPILL_S128_RESTORE;
630   case 32:
631     return AMDGPU::SI_SPILL_S256_RESTORE;
632   case 64:
633     return AMDGPU::SI_SPILL_S512_RESTORE;
634   default:
635     llvm_unreachable("unknown register size");
636   }
637 }
638 
639 static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
640   switch (Size) {
641   case 4:
642     return AMDGPU::SI_SPILL_V32_RESTORE;
643   case 8:
644     return AMDGPU::SI_SPILL_V64_RESTORE;
645   case 16:
646     return AMDGPU::SI_SPILL_V128_RESTORE;
647   case 32:
648     return AMDGPU::SI_SPILL_V256_RESTORE;
649   case 64:
650     return AMDGPU::SI_SPILL_V512_RESTORE;
651   default:
652     llvm_unreachable("unknown register size");
653   }
654 }
655 
656 void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
657                                        MachineBasicBlock::iterator MI,
658                                        unsigned DestReg, int FrameIndex,
659                                        const TargetRegisterClass *RC,
660                                        const TargetRegisterInfo *TRI) const {
661   MachineFunction *MF = MBB.getParent();
662   const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
663   MachineFrameInfo *FrameInfo = MF->getFrameInfo();
664   DebugLoc DL = MBB.findDebugLoc(MI);
665   unsigned Align = FrameInfo->getObjectAlignment(FrameIndex);
666   unsigned Size = FrameInfo->getObjectSize(FrameIndex);
667 
668   MachinePointerInfo PtrInfo
669     = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
670 
671   MachineMemOperand *MMO = MF->getMachineMemOperand(
672     PtrInfo, MachineMemOperand::MOLoad, Size, Align);
673 
674   if (RI.isSGPRClass(RC)) {
675     // FIXME: Maybe this should not include a memoperand because it will be
676     // lowered to non-memory instructions.
677     unsigned Opcode = getSGPRSpillRestoreOpcode(RC->getSize());
678     BuildMI(MBB, MI, DL, get(Opcode), DestReg)
679       .addFrameIndex(FrameIndex) // frame_idx
680       .addMemOperand(MMO);
681 
682     return;
683   }
684 
685   if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) {
686     LLVMContext &Ctx = MF->getFunction()->getContext();
687     Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to"
688                   " restore register");
689     BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg);
690 
691     return;
692   }
693 
694   assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
695 
696   unsigned Opcode = getVGPRSpillRestoreOpcode(RC->getSize());
697   BuildMI(MBB, MI, DL, get(Opcode), DestReg)
698     .addFrameIndex(FrameIndex)        // frame_idx
699     .addReg(MFI->getScratchRSrcReg())       // scratch_rsrc
700     .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset
701     .addImm(0)                              // offset
702     .addMemOperand(MMO);
703 }
704 
/// \param FrameOffset Offset in bytes of the FrameIndex being spilled.
706 unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB,
707                                                MachineBasicBlock::iterator MI,
708                                                RegScavenger *RS, unsigned TmpReg,
709                                                unsigned FrameOffset,
710                                                unsigned Size) const {
711   MachineFunction *MF = MBB.getParent();
712   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
713   const AMDGPUSubtarget &ST = MF->getSubtarget<AMDGPUSubtarget>();
714   const SIRegisterInfo *TRI =
715       static_cast<const SIRegisterInfo*>(ST.getRegisterInfo());
716   DebugLoc DL = MBB.findDebugLoc(MI);
717   unsigned WorkGroupSize = MFI->getMaximumWorkGroupSize(*MF);
718   unsigned WavefrontSize = ST.getWavefrontSize();
719 
720   unsigned TIDReg = MFI->getTIDReg();
721   if (!MFI->hasCalculatedTID()) {
722     MachineBasicBlock &Entry = MBB.getParent()->front();
723     MachineBasicBlock::iterator Insert = Entry.front();
724     DebugLoc DL = Insert->getDebugLoc();
725 
726     TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass);
727     if (TIDReg == AMDGPU::NoRegister)
728       return TIDReg;
729 
731     if (!AMDGPU::isShader(MF->getFunction()->getCallingConv()) &&
732         WorkGroupSize > WavefrontSize) {
733 
734       unsigned TIDIGXReg
735         = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_X);
736       unsigned TIDIGYReg
737         = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Y);
738       unsigned TIDIGZReg
739         = TRI->getPreloadedValue(*MF, SIRegisterInfo::WORKGROUP_ID_Z);
740       unsigned InputPtrReg =
741           TRI->getPreloadedValue(*MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
742       for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) {
743         if (!Entry.isLiveIn(Reg))
744           Entry.addLiveIn(Reg);
745       }
746 
747       RS->enterBasicBlock(Entry);
748       // FIXME: Can we scavenge an SReg_64 and access the subregs?
749       unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
750       unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
751       BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0)
752               .addReg(InputPtrReg)
753               .addImm(SI::KernelInputOffsets::NGROUPS_Z);
754       BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1)
755               .addReg(InputPtrReg)
756               .addImm(SI::KernelInputOffsets::NGROUPS_Y);
757 
758       // NGROUPS.X * NGROUPS.Y
759       BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1)
760               .addReg(STmp1)
761               .addReg(STmp0);
762       // (NGROUPS.X * NGROUPS.Y) * TIDIG.X
763       BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg)
764               .addReg(STmp1)
765               .addReg(TIDIGXReg);
      // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)
767       BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg)
768               .addReg(STmp0)
769               .addReg(TIDIGYReg)
770               .addReg(TIDReg);
      // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)) + TIDIG.Z
772       BuildMI(Entry, Insert, DL, get(AMDGPU::V_ADD_I32_e32), TIDReg)
773               .addReg(TIDReg)
774               .addReg(TIDIGZReg);
775     } else {
776       // Get the wave id
777       BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
778               TIDReg)
779               .addImm(-1)
780               .addImm(0);
781 
782       BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
783               TIDReg)
784               .addImm(-1)
785               .addReg(TIDReg);
786     }
787 
788     BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32),
789             TIDReg)
790             .addImm(2)
791             .addReg(TIDReg);
792     MFI->setTIDReg(TIDReg);
793   }
794 
795   // Add FrameIndex to LDS offset
796   unsigned LDSOffset = MFI->LDSSize + (FrameOffset * WorkGroupSize);
797   BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), TmpReg)
798           .addImm(LDSOffset)
799           .addReg(TIDReg);
800 
801   return TmpReg;
802 }
803 
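// Insert \p Count wait states using S_NOP. The S_NOP immediate encodes
// (wait states - 1) and is capped at 7 here, i.e. at most 8 wait states per
// instruction, so e.g. a count of 10 expands to "S_NOP 7" followed by
// "S_NOP 1".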
804 void SIInstrInfo::insertWaitStates(MachineBasicBlock &MBB,
805                                    MachineBasicBlock::iterator MI,
806                                    int Count) const {
807   while (Count > 0) {
808     int Arg;
809     if (Count >= 8)
810       Arg = 7;
811     else
812       Arg = Count - 1;
813     Count -= 8;
814     BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_NOP))
815             .addImm(Arg);
816   }
817 }
818 
819 bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
820   MachineBasicBlock &MBB = *MI->getParent();
821   DebugLoc DL = MBB.findDebugLoc(MI);
822   switch (MI->getOpcode()) {
823   default: return AMDGPUInstrInfo::expandPostRAPseudo(MI);
824 
825   case AMDGPU::SGPR_USE:
826     // This is just a placeholder for register allocation.
827     MI->eraseFromParent();
828     break;
829 
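  // There is no real 64-bit VALU move, so expand the pseudo into a pair of
  // 32-bit V_MOV_B32s on the two sub-registers.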
830   case AMDGPU::V_MOV_B64_PSEUDO: {
831     unsigned Dst = MI->getOperand(0).getReg();
832     unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
833     unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
834 
835     const MachineOperand &SrcOp = MI->getOperand(1);
836     // FIXME: Will this work for 64-bit floating point immediates?
837     assert(!SrcOp.isFPImm());
838     if (SrcOp.isImm()) {
839       APInt Imm(64, SrcOp.getImm());
840       BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
841               .addImm(Imm.getLoBits(32).getZExtValue())
842               .addReg(Dst, RegState::Implicit);
843       BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
844               .addImm(Imm.getHiBits(32).getZExtValue())
845               .addReg(Dst, RegState::Implicit);
846     } else {
847       assert(SrcOp.isReg());
848       BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
849               .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
850               .addReg(Dst, RegState::Implicit);
851       BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
852               .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
853               .addReg(Dst, RegState::Implicit);
854     }
855     MI->eraseFromParent();
856     break;
857   }
858 
859   case AMDGPU::V_CNDMASK_B64_PSEUDO: {
860     unsigned Dst = MI->getOperand(0).getReg();
861     unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
862     unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
863     unsigned Src0 = MI->getOperand(1).getReg();
864     unsigned Src1 = MI->getOperand(2).getReg();
865     const MachineOperand &SrcCond = MI->getOperand(3);
866 
867     BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
868         .addReg(RI.getSubReg(Src0, AMDGPU::sub0))
869         .addReg(RI.getSubReg(Src1, AMDGPU::sub0))
870         .addOperand(SrcCond);
871     BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
872         .addReg(RI.getSubReg(Src0, AMDGPU::sub1))
873         .addReg(RI.getSubReg(Src1, AMDGPU::sub1))
874         .addOperand(SrcCond);
875     MI->eraseFromParent();
876     break;
877   }
878 
879   case AMDGPU::SI_CONSTDATA_PTR: {
880     const SIRegisterInfo *TRI =
881         static_cast<const SIRegisterInfo *>(ST.getRegisterInfo());
882     MachineFunction &MF = *MBB.getParent();
883     unsigned Reg = MI->getOperand(0).getReg();
884     unsigned RegLo = TRI->getSubReg(Reg, AMDGPU::sub0);
885     unsigned RegHi = TRI->getSubReg(Reg, AMDGPU::sub1);
886 
887     // Create a bundle so these instructions won't be re-ordered by the
888     // post-RA scheduler.
889     MIBundleBuilder Bundler(MBB, MI);
890     Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
891 
892     // Add 32-bit offset from this instruction to the start of the
893     // constant data.
894     Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo)
895                            .addReg(RegLo)
896                            .addOperand(MI->getOperand(1)));
897     Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
898                            .addReg(RegHi)
899                            .addImm(0));
900 
901     llvm::finalizeBundle(MBB, Bundler.begin());
902 
903     MI->eraseFromParent();
904     break;
905   }
906   }
907   return true;
908 }
909 
910 /// Commutes the operands in the given instruction.
911 /// The commutable operands are specified by their indices OpIdx0 and OpIdx1.
912 ///
/// Do not call this method for a non-commutable instruction or for a
/// non-commutable pair of operand indices OpIdx0 and OpIdx1.
/// Even though the instruction is commutable, the method may still
/// fail to commute the operands; a null pointer is returned in such cases.
917 MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr *MI,
918                                                   bool NewMI,
919                                                   unsigned OpIdx0,
920                                                   unsigned OpIdx1) const {
921   int CommutedOpcode = commuteOpcode(*MI);
922   if (CommutedOpcode == -1)
923     return nullptr;
924 
925   int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
926                                            AMDGPU::OpName::src0);
927   MachineOperand &Src0 = MI->getOperand(Src0Idx);
928   if (!Src0.isReg())
929     return nullptr;
930 
931   int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
932                                            AMDGPU::OpName::src1);
933 
934   if ((OpIdx0 != static_cast<unsigned>(Src0Idx) ||
935        OpIdx1 != static_cast<unsigned>(Src1Idx)) &&
936       (OpIdx0 != static_cast<unsigned>(Src1Idx) ||
937        OpIdx1 != static_cast<unsigned>(Src0Idx)))
938     return nullptr;
939 
940   MachineOperand &Src1 = MI->getOperand(Src1Idx);
941 
943   if (isVOP2(*MI)) {
944     const MCInstrDesc &InstrDesc = MI->getDesc();
945     // For VOP2 instructions, any operand type is valid to use for src0.  Make
946     // sure we can use the src1 as src0.
947     //
    // We could be stricter here and only allow commuting if there is a reason
    // to do so, e.g. if both operands are VGPRs there is no real benefit,
    // although MachineCSE attempts to find matches by commuting.
951     const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
952     if (!isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0))
953       return nullptr;
954   }
955 
956   if (!Src1.isReg()) {
957     // Allow commuting instructions with Imm operands.
958     if (NewMI || !Src1.isImm() ||
959         (!isVOP2(*MI) && !isVOP3(*MI))) {
960       return nullptr;
961     }
962     // Be sure to copy the source modifiers to the right place.
963     if (MachineOperand *Src0Mods
964           = getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) {
965       MachineOperand *Src1Mods
966         = getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers);
967 
968       int Src0ModsVal = Src0Mods->getImm();
969       if (!Src1Mods && Src0ModsVal != 0)
970         return nullptr;
971 
972       // XXX - This assert might be a lie. It might be useful to have a neg
973       // modifier with 0.0.
974       int Src1ModsVal = Src1Mods->getImm();
975       assert((Src1ModsVal == 0) && "Not expecting modifiers with immediates");
976 
977       Src1Mods->setImm(Src0ModsVal);
978       Src0Mods->setImm(Src1ModsVal);
979     }
980 
981     unsigned Reg = Src0.getReg();
982     unsigned SubReg = Src0.getSubReg();
983     if (Src1.isImm())
984       Src0.ChangeToImmediate(Src1.getImm());
985     else
986       llvm_unreachable("Should only have immediates");
987 
988     Src1.ChangeToRegister(Reg, false);
989     Src1.setSubReg(SubReg);
990   } else {
991     MI = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx0, OpIdx1);
992   }
993 
994   if (MI)
995     MI->setDesc(get(CommutedOpcode));
996 
997   return MI;
998 }
999 
1000 // This needs to be implemented because the source modifiers may be inserted
1001 // between the true commutable operands, and the base
1002 // TargetInstrInfo::commuteInstruction uses it.
1003 bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI,
1004                                         unsigned &SrcOpIdx0,
1005                                         unsigned &SrcOpIdx1) const {
1006   const MCInstrDesc &MCID = MI->getDesc();
1007   if (!MCID.isCommutable())
1008     return false;
1009 
1010   unsigned Opc = MI->getOpcode();
1011   int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
1012   if (Src0Idx == -1)
1013     return false;
1014 
  // FIXME: Workaround for TargetInstrInfo::commuteInstruction asserting on
  // immediates. Also, an immediate src0 operand is not handled in
  // SIInstrInfo::commuteInstruction().
1018   if (!MI->getOperand(Src0Idx).isReg())
1019     return false;
1020 
1021   int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
1022   if (Src1Idx == -1)
1023     return false;
1024 
1025   MachineOperand &Src1 = MI->getOperand(Src1Idx);
1026   if (Src1.isImm()) {
    // SIInstrInfo::commuteInstruction() does support commuting the immediate
    // operand src1 in 2- and 3-operand instructions.
1029     if (!isVOP2(MI->getOpcode()) && !isVOP3(MI->getOpcode()))
1030       return false;
1031   } else if (Src1.isReg()) {
1032     // If any source modifiers are set, the generic instruction commuting won't
1033     // understand how to copy the source modifiers.
1034     if (hasModifiersSet(*MI, AMDGPU::OpName::src0_modifiers) ||
1035         hasModifiersSet(*MI, AMDGPU::OpName::src1_modifiers))
1036       return false;
1037   } else
1038     return false;
1039 
1040   return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
1041 }
1042 
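// Strip the src0/src1/src2 modifier operands from \p MI. They are removed in
// reverse index order so the earlier indices stay valid.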
1043 static void removeModOperands(MachineInstr &MI) {
1044   unsigned Opc = MI.getOpcode();
1045   int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc,
1046                                               AMDGPU::OpName::src0_modifiers);
1047   int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc,
1048                                               AMDGPU::OpName::src1_modifiers);
1049   int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc,
1050                                               AMDGPU::OpName::src2_modifiers);
1051 
1052   MI.RemoveOperand(Src2ModIdx);
1053   MI.RemoveOperand(Src1ModIdx);
1054   MI.RemoveOperand(Src0ModIdx);
1055 }
1056 
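// Try to fold an immediate materialized by \p DefMI into its single non-debug
// use in \p UseMI, rewriting V_MAD_F32 / V_MAC_F32 into V_MADMK_F32 (constant
// is the multiplied operand) or V_MADAK_F32 (constant is the added operand).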
1057 bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
1058                                 unsigned Reg, MachineRegisterInfo *MRI) const {
1059   if (!MRI->hasOneNonDBGUse(Reg))
1060     return false;
1061 
1062   unsigned Opc = UseMI->getOpcode();
1063   if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64) {
1064     // Don't fold if we are using source modifiers. The new VOP2 instructions
1065     // don't have them.
1066     if (hasModifiersSet(*UseMI, AMDGPU::OpName::src0_modifiers) ||
1067         hasModifiersSet(*UseMI, AMDGPU::OpName::src1_modifiers) ||
1068         hasModifiersSet(*UseMI, AMDGPU::OpName::src2_modifiers)) {
1069       return false;
1070     }
1071 
1072     MachineOperand *Src0 = getNamedOperand(*UseMI, AMDGPU::OpName::src0);
1073     MachineOperand *Src1 = getNamedOperand(*UseMI, AMDGPU::OpName::src1);
1074     MachineOperand *Src2 = getNamedOperand(*UseMI, AMDGPU::OpName::src2);
1075 
1076     // Multiplied part is the constant: Use v_madmk_f32
1077     // We should only expect these to be on src0 due to canonicalizations.
1078     if (Src0->isReg() && Src0->getReg() == Reg) {
1079       if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
1080         return false;
1081 
1082       if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
1083         return false;
1084 
      // Swap operands 0 and 1 since the madmk constant is at operand 1.
1086 
1087       const int64_t Imm = DefMI->getOperand(1).getImm();
1088 
1089       // FIXME: This would be a lot easier if we could return a new instruction
1090       // instead of having to modify in place.
1091 
1092       // Remove these first since they are at the end.
1093       UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
1094                                                       AMDGPU::OpName::omod));
1095       UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
1096                                                       AMDGPU::OpName::clamp));
1097 
1098       unsigned Src1Reg = Src1->getReg();
1099       unsigned Src1SubReg = Src1->getSubReg();
1100       Src0->setReg(Src1Reg);
1101       Src0->setSubReg(Src1SubReg);
1102       Src0->setIsKill(Src1->isKill());
1103 
1104       if (Opc == AMDGPU::V_MAC_F32_e64) {
1105         UseMI->untieRegOperand(
1106           AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
1107       }
1108 
1109       Src1->ChangeToImmediate(Imm);
1110 
1111       removeModOperands(*UseMI);
1112       UseMI->setDesc(get(AMDGPU::V_MADMK_F32));
1113 
1114       bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
1115       if (DeleteDef)
1116         DefMI->eraseFromParent();
1117 
1118       return true;
1119     }
1120 
1121     // Added part is the constant: Use v_madak_f32
1122     if (Src2->isReg() && Src2->getReg() == Reg) {
1123       // Not allowed to use constant bus for another operand.
1124       // We can however allow an inline immediate as src0.
1125       if (!Src0->isImm() &&
1126           (Src0->isReg() && RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))
1127         return false;
1128 
1129       if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
1130         return false;
1131 
1132       const int64_t Imm = DefMI->getOperand(1).getImm();
1133 
1134       // FIXME: This would be a lot easier if we could return a new instruction
1135       // instead of having to modify in place.
1136 
1137       // Remove these first since they are at the end.
1138       UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
1139                                                       AMDGPU::OpName::omod));
1140       UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
1141                                                       AMDGPU::OpName::clamp));
1142 
1143       if (Opc == AMDGPU::V_MAC_F32_e64) {
1144         UseMI->untieRegOperand(
1145           AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
1146       }
1147 
      // ChangeToImmediate adds Src2 back to the instruction.
1149       Src2->ChangeToImmediate(Imm);
1150 
1151       // These come before src2.
1152       removeModOperands(*UseMI);
1153       UseMI->setDesc(get(AMDGPU::V_MADAK_F32));
1154 
1155       bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
1156       if (DeleteDef)
1157         DefMI->eraseFromParent();
1158 
1159       return true;
1160     }
1161   }
1162 
1163   return false;
1164 }
1165 
1166 static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
1167                                 int WidthB, int OffsetB) {
1168   int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
1169   int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
1170   int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
1171   return LowOffset + LowWidth <= HighOffset;
1172 }
1173 
1174 bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr *MIa,
1175                                                MachineInstr *MIb) const {
1176   unsigned BaseReg0, BaseReg1;
1177   int64_t Offset0, Offset1;
1178 
1179   if (getMemOpBaseRegImmOfs(MIa, BaseReg0, Offset0, &RI) &&
1180       getMemOpBaseRegImmOfs(MIb, BaseReg1, Offset1, &RI)) {
1181     assert(MIa->hasOneMemOperand() && MIb->hasOneMemOperand() &&
1182            "read2 / write2 not expected here yet");
1183     unsigned Width0 = (*MIa->memoperands_begin())->getSize();
1184     unsigned Width1 = (*MIb->memoperands_begin())->getSize();
1185     if (BaseReg0 == BaseReg1 &&
1186         offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) {
1187       return true;
1188     }
1189   }
1190 
1191   return false;
1192 }
1193 
1194 bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa,
1195                                                   MachineInstr *MIb,
1196                                                   AliasAnalysis *AA) const {
1197   assert(MIa && (MIa->mayLoad() || MIa->mayStore()) &&
1198          "MIa must load from or modify a memory location");
1199   assert(MIb && (MIb->mayLoad() || MIb->mayStore()) &&
1200          "MIb must load from or modify a memory location");
1201 
1202   if (MIa->hasUnmodeledSideEffects() || MIb->hasUnmodeledSideEffects())
1203     return false;
1204 
1205   // XXX - Can we relax this between address spaces?
1206   if (MIa->hasOrderedMemoryRef() || MIb->hasOrderedMemoryRef())
1207     return false;
1208 
1209   // TODO: Should we check the address space from the MachineMemOperand? That
1210   // would allow us to distinguish objects we know don't alias based on the
1211   // underlying address space, even if it was lowered to a different one,
1212   // e.g. private accesses lowered to use MUBUF instructions on a scratch
1213   // buffer.
1214   if (isDS(*MIa)) {
1215     if (isDS(*MIb))
1216       return checkInstOffsetsDoNotOverlap(MIa, MIb);
1217 
1218     return !isFLAT(*MIb);
1219   }
1220 
1221   if (isMUBUF(*MIa) || isMTBUF(*MIa)) {
1222     if (isMUBUF(*MIb) || isMTBUF(*MIb))
1223       return checkInstOffsetsDoNotOverlap(MIa, MIb);
1224 
1225     return !isFLAT(*MIb) && !isSMRD(*MIb);
1226   }
1227 
1228   if (isSMRD(*MIa)) {
1229     if (isSMRD(*MIb))
1230       return checkInstOffsetsDoNotOverlap(MIa, MIb);
1231 
    return !isFLAT(*MIb) && !isMUBUF(*MIb) && !isMTBUF(*MIb);
1233   }
1234 
1235   if (isFLAT(*MIa)) {
1236     if (isFLAT(*MIb))
1237       return checkInstOffsetsDoNotOverlap(MIa, MIb);
1238 
1239     return false;
1240   }
1241 
1242   return false;
1243 }
1244 
1245 MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
1246                                                 MachineBasicBlock::iterator &MI,
1247                                                 LiveVariables *LV) const {
1248 
1249   switch (MI->getOpcode()) {
1250     default: return nullptr;
1251     case AMDGPU::V_MAC_F32_e64: break;
1252     case AMDGPU::V_MAC_F32_e32: {
1253       const MachineOperand *Src0 = getNamedOperand(*MI, AMDGPU::OpName::src0);
1254       if (Src0->isImm() && !isInlineConstant(*Src0, 4))
1255         return nullptr;
1256       break;
1257     }
1258   }
1259 
1260   const MachineOperand *Dst = getNamedOperand(*MI, AMDGPU::OpName::vdst);
1261   const MachineOperand *Src0 = getNamedOperand(*MI, AMDGPU::OpName::src0);
1262   const MachineOperand *Src1 = getNamedOperand(*MI, AMDGPU::OpName::src1);
1263   const MachineOperand *Src2 = getNamedOperand(*MI, AMDGPU::OpName::src2);
1264 
1265   return BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_MAD_F32))
1266                  .addOperand(*Dst)
1267                  .addImm(0) // Src0 mods
1268                  .addOperand(*Src0)
1269                  .addImm(0) // Src1 mods
1270                  .addOperand(*Src1)
                 .addImm(0) // Src2 mods
1272                  .addOperand(*Src2)
1273                  .addImm(0)  // clamp
1274                  .addImm(0); // omod
1275 }
1276 
1277 bool SIInstrInfo::isSchedulingBoundary(const MachineInstr *MI,
1278                                        const MachineBasicBlock *MBB,
1279                                        const MachineFunction &MF) const {
1280   // Target-independent instructions do not have an implicit-use of EXEC, even
1281   // when they operate on VGPRs. Treating EXEC modifications as scheduling
1282   // boundaries prevents incorrect movements of such instructions.
1283   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
1284   if (MI->modifiesRegister(AMDGPU::EXEC, TRI))
1285     return true;
1286 
1287   return AMDGPUInstrInfo::isSchedulingBoundary(MI, MBB, MF);
1288 }
1289 
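// Inline constants can be encoded directly in the instruction without using a
// literal dword: integers in [-16, 64] and a small set of floating-point
// values (0.0, +/-0.5, +/-1.0, +/-2.0, +/-4.0), matched here by bit pattern.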
1290 bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
1291   int64_t SVal = Imm.getSExtValue();
1292   if (SVal >= -16 && SVal <= 64)
1293     return true;
1294 
1295   if (Imm.getBitWidth() == 64) {
1296     uint64_t Val = Imm.getZExtValue();
1297     return (DoubleToBits(0.0) == Val) ||
1298            (DoubleToBits(1.0) == Val) ||
1299            (DoubleToBits(-1.0) == Val) ||
1300            (DoubleToBits(0.5) == Val) ||
1301            (DoubleToBits(-0.5) == Val) ||
1302            (DoubleToBits(2.0) == Val) ||
1303            (DoubleToBits(-2.0) == Val) ||
1304            (DoubleToBits(4.0) == Val) ||
1305            (DoubleToBits(-4.0) == Val);
1306   }
1307 
1308   // The actual type of the operand does not seem to matter as long
1309   // as the bits match one of the inline immediate values.  For example:
1310   //
1311   // -nan has the hexadecimal encoding of 0xfffffffe which is -2 in decimal,
1312   // so it is a legal inline immediate.
1313   //
1314   // 1065353216 has the hexadecimal encoding 0x3f800000 which is 1.0f in
1315   // floating-point, so it is a legal inline immediate.
1316   uint32_t Val = Imm.getZExtValue();
1317 
1318   return (FloatToBits(0.0f) == Val) ||
1319          (FloatToBits(1.0f) == Val) ||
1320          (FloatToBits(-1.0f) == Val) ||
1321          (FloatToBits(0.5f) == Val) ||
1322          (FloatToBits(-0.5f) == Val) ||
1323          (FloatToBits(2.0f) == Val) ||
1324          (FloatToBits(-2.0f) == Val) ||
1325          (FloatToBits(4.0f) == Val) ||
1326          (FloatToBits(-4.0f) == Val);
1327 }
1328 
1329 bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
1330                                    unsigned OpSize) const {
1331   if (MO.isImm()) {
1332     // MachineOperand provides no way to tell the true operand size, since it
1333     // only records a 64-bit value. We need to know the size to determine if a
1334     // 32-bit floating point immediate bit pattern is legal for an integer
1335     // immediate. It would be for any 32-bit integer operand, but would not be
1336     // for a 64-bit one.
1337 
1338     unsigned BitSize = 8 * OpSize;
1339     return isInlineConstant(APInt(BitSize, MO.getImm(), true));
1340   }
1341 
1342   return false;
1343 }
1344 
1345 bool SIInstrInfo::isLiteralConstant(const MachineOperand &MO,
1346                                     unsigned OpSize) const {
1347   return MO.isImm() && !isInlineConstant(MO, OpSize);
1348 }
1349 
1350 static bool compareMachineOp(const MachineOperand &Op0,
1351                              const MachineOperand &Op1) {
1352   if (Op0.getType() != Op1.getType())
1353     return false;
1354 
1355   switch (Op0.getType()) {
1356   case MachineOperand::MO_Register:
1357     return Op0.getReg() == Op1.getReg();
1358   case MachineOperand::MO_Immediate:
1359     return Op0.getImm() == Op1.getImm();
1360   default:
1361     llvm_unreachable("Didn't expect to be comparing these operand types");
1362   }
1363 }
1364 
1365 bool SIInstrInfo::isImmOperandLegal(const MachineInstr *MI, unsigned OpNo,
1366                                  const MachineOperand &MO) const {
1367   const MCOperandInfo &OpInfo = get(MI->getOpcode()).OpInfo[OpNo];
1368 
1369   assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
1370 
1371   if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
1372     return true;
1373 
1374   if (OpInfo.RegClass < 0)
1375     return false;
1376 
1377   unsigned OpSize = RI.getRegClass(OpInfo.RegClass)->getSize();
1378   if (isLiteralConstant(MO, OpSize))
1379     return RI.opCanUseLiteralConstant(OpInfo.OperandType);
1380 
1381   return RI.opCanUseInlineConstant(OpInfo.OperandType);
1382 }
1383 
1384 bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
1385   int Op32 = AMDGPU::getVOPe32(Opcode);
1386   if (Op32 == -1)
1387     return false;
1388 
1389   return pseudoToMCOpcode(Op32) != -1;
1390 }
1391 
1392 bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
1393   // The src0_modifier operand is present on all instructions
1394   // that have modifiers.
1395 
1396   return AMDGPU::getNamedOperandIdx(Opcode,
1397                                     AMDGPU::OpName::src0_modifiers) != -1;
1398 }
1399 
1400 bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
1401                                   unsigned OpName) const {
1402   const MachineOperand *Mods = getNamedOperand(MI, OpName);
1403   return Mods && Mods->getImm();
1404 }
1405 
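// A VALU instruction may read at most one value over the constant bus (an
// SGPR, a literal constant, or one of M0 / VCC / EXEC / FLAT_SCR); the
// verifier below uses this predicate to count such operands.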
1406 bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
1407                                   const MachineOperand &MO,
1408                                   unsigned OpSize) const {
1409   // Literal constants use the constant bus.
1410   if (isLiteralConstant(MO, OpSize))
1411     return true;
1412 
1413   if (!MO.isReg() || !MO.isUse())
1414     return false;
1415 
1416   if (TargetRegisterInfo::isVirtualRegister(MO.getReg()))
1417     return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));
1418 
1419   // FLAT_SCR is just an SGPR pair.
1420   if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCR))
1421     return true;
1422 
1423   // EXEC register uses the constant bus.
1424   if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC)
1425     return true;
1426 
1427   // SGPRs use the constant bus
1428   return (MO.getReg() == AMDGPU::VCC || MO.getReg() == AMDGPU::M0 ||
1429           (!MO.isImplicit() &&
1430            (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) ||
1431             AMDGPU::SGPR_64RegClass.contains(MO.getReg()))));
1432 }
1433 
1434 static unsigned findImplicitSGPRRead(const MachineInstr &MI) {
1435   for (const MachineOperand &MO : MI.implicit_operands()) {
1436     // We only care about reads.
1437     if (MO.isDef())
1438       continue;
1439 
1440     switch (MO.getReg()) {
1441     case AMDGPU::VCC:
1442     case AMDGPU::M0:
1443     case AMDGPU::FLAT_SCR:
1444       return MO.getReg();
1445 
1446     default:
1447       break;
1448     }
1449   }
1450 
1451   return AMDGPU::NoRegister;
1452 }
1453 
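// MachineVerifier hook: check target-specific operand and encoding
// constraints, returning false and setting \p ErrInfo on the first violation.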
1454 bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
1455                                     StringRef &ErrInfo) const {
1456   uint16_t Opcode = MI->getOpcode();
1457   const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
1458   int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
1459   int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
1460   int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
1461 
1462   // Make sure we don't have SCC live-ins to basic blocks.  moveToVALU assumes
1463   // all SCC users are in the same blocks as their defs.
1464   const MachineBasicBlock *MBB = MI->getParent();
1465   if (MI == &MBB->front()) {
1466     if (MBB->isLiveIn(AMDGPU::SCC)) {
1467       ErrInfo = "scc register cannot be live across blocks.";
1468       return false;
1469     }
1470   }
1471 
1472   // Make sure the number of operands is correct.
1473   const MCInstrDesc &Desc = get(Opcode);
1474   if (!Desc.isVariadic() &&
1475       Desc.getNumOperands() != MI->getNumExplicitOperands()) {
1476      ErrInfo = "Instruction has wrong number of operands.";
1477      return false;
1478   }
1479 
1480   // Make sure the register classes are correct.
1481   for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
1482     if (MI->getOperand(i).isFPImm()) {
1483       ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
1484                 "all fp values to integers.";
1485       return false;
1486     }
1487 
1488     int RegClass = Desc.OpInfo[i].RegClass;
1489 
1490     switch (Desc.OpInfo[i].OperandType) {
1491     case MCOI::OPERAND_REGISTER:
1492       if (MI->getOperand(i).isImm()) {
1493         ErrInfo = "Illegal immediate value for operand.";
1494         return false;
1495       }
1496       break;
1497     case AMDGPU::OPERAND_REG_IMM32:
1498       break;
1499     case AMDGPU::OPERAND_REG_INLINE_C:
1500       if (isLiteralConstant(MI->getOperand(i),
1501                             RI.getRegClass(RegClass)->getSize())) {
1502         ErrInfo = "Illegal immediate value for operand.";
1503         return false;
1504       }
1505       break;
1506     case MCOI::OPERAND_IMMEDIATE:
1507       // Check if this operand is an immediate.
1508       // FrameIndex operands will be replaced by immediates, so they are
1509       // allowed.
1510       if (!MI->getOperand(i).isImm() && !MI->getOperand(i).isFI()) {
1511         ErrInfo = "Expected immediate, but got non-immediate";
1512         return false;
1513       }
1514       // Fall-through
1515     default:
1516       continue;
1517     }
1518 
1519     if (!MI->getOperand(i).isReg())
1520       continue;
1521 
1522     if (RegClass != -1) {
1523       unsigned Reg = MI->getOperand(i).getReg();
1524       if (TargetRegisterInfo::isVirtualRegister(Reg))
1525         continue;
1526 
1527       const TargetRegisterClass *RC = RI.getRegClass(RegClass);
1528       if (!RC->contains(Reg)) {
1529         ErrInfo = "Operand has incorrect register class.";
1530         return false;
1531       }
1532     }
1533   }
1534 
1535 
1536   // Verify VOP*
1537   if (isVOP1(*MI) || isVOP2(*MI) || isVOP3(*MI) || isVOPC(*MI)) {
1538     // Only look at the true operands. Only a real operand can use the constant
1539     // bus, and we don't want to check pseudo-operands like the source modifier
1540     // flags.
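    // For example, a VOP2 instruction with two different SGPR sources counts
    // two constant bus uses below and is rejected, while reusing the same
    // SGPR for both sources only counts once.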
1541     const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };
1542 
1543     unsigned ConstantBusCount = 0;
1544     unsigned SGPRUsed = findImplicitSGPRRead(*MI);
1545     if (SGPRUsed != AMDGPU::NoRegister)
1546       ++ConstantBusCount;
1547 
1548     for (int OpIdx : OpIndices) {
1549       if (OpIdx == -1)
1550         break;
1551       const MachineOperand &MO = MI->getOperand(OpIdx);
1552       if (usesConstantBus(MRI, MO, getOpSize(Opcode, OpIdx))) {
1553         if (MO.isReg()) {
1554           if (MO.getReg() != SGPRUsed)
1555             ++ConstantBusCount;
1556           SGPRUsed = MO.getReg();
1557         } else {
1558           ++ConstantBusCount;
1559         }
1560       }
1561     }
1562     if (ConstantBusCount > 1) {
1563       ErrInfo = "VOP* instruction uses the constant bus more than once";
1564       return false;
1565     }
1566   }
1567 
1568   // Verify misc. restrictions on specific instructions.
1569   if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 ||
1570       Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) {
1571     const MachineOperand &Src0 = MI->getOperand(Src0Idx);
1572     const MachineOperand &Src1 = MI->getOperand(Src1Idx);
1573     const MachineOperand &Src2 = MI->getOperand(Src2Idx);
1574     if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
1575       if (!compareMachineOp(Src0, Src1) &&
1576           !compareMachineOp(Src0, Src2)) {
1577         ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
1578         return false;
1579       }
1580     }
1581   }
1582 
  // Make sure we aren't losing exec uses in the td files. This mostly requires
  // being careful when overriding Uses in the td files to add other use
  // registers, since the override replaces the default implicit exec use.
1585   if (!isGenericOpcode(Opcode) && !isSALU(Opcode) && !isSMRD(Opcode)) {
1586     const MachineOperand *Exec = MI->findRegisterUseOperand(AMDGPU::EXEC);
1587     if (!Exec || !Exec->isImplicit()) {
1588       ErrInfo = "VALU instruction does not implicitly read exec mask";
1589       return false;
1590     }
1591   }
1592 
1593   return true;
1594 }
1595 
1596 unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
1597   switch (MI.getOpcode()) {
1598   default: return AMDGPU::INSTRUCTION_LIST_END;
1599   case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
1600   case AMDGPU::COPY: return AMDGPU::COPY;
1601   case AMDGPU::PHI: return AMDGPU::PHI;
1602   case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
1603   case AMDGPU::S_MOV_B32:
1604     return MI.getOperand(1).isReg() ?
1605            AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
1606   case AMDGPU::S_ADD_I32:
1607   case AMDGPU::S_ADD_U32: return AMDGPU::V_ADD_I32_e32;
1608   case AMDGPU::S_ADDC_U32: return AMDGPU::V_ADDC_U32_e32;
1609   case AMDGPU::S_SUB_I32:
1610   case AMDGPU::S_SUB_U32: return AMDGPU::V_SUB_I32_e32;
1611   case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
1612   case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32;
1613   case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e32;
1614   case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e32;
1615   case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e32;
1616   case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e32;
1617   case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e32;
1618   case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e32;
1619   case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e32;
1620   case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
1621   case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64;
1622   case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
1623   case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64;
1624   case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
1625   case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64;
1626   case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32;
1627   case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32;
1628   case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32;
1629   case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32;
1630   case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
1631   case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
1632   case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
1633   case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
1634   case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32;
1635   case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32;
1636   case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32;
1637   case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32;
1638   case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32;
1639   case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32;
1640   case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32;
1641   case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32;
1642   case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32;
1643   case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32;
1644   case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32;
1645   case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32;
1646   case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
1647   case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
1648   case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
1649   case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
1650   case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
1651   case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
1652   }
1653 }
1654 
1655 bool SIInstrInfo::isSALUOpSupportedOnVALU(const MachineInstr &MI) const {
1656   return getVALUOp(MI) != AMDGPU::INSTRUCTION_LIST_END;
1657 }
1658 
1659 const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
1660                                                       unsigned OpNo) const {
1661   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
1662   const MCInstrDesc &Desc = get(MI.getOpcode());
1663   if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
1664       Desc.OpInfo[OpNo].RegClass == -1) {
1665     unsigned Reg = MI.getOperand(OpNo).getReg();
1666 
1667     if (TargetRegisterInfo::isVirtualRegister(Reg))
1668       return MRI.getRegClass(Reg);
1669     return RI.getPhysRegClass(Reg);
1670   }
1671 
1672   unsigned RCID = Desc.OpInfo[OpNo].RegClass;
1673   return RI.getRegClass(RCID);
1674 }
1675 
1676 bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const {
1677   switch (MI.getOpcode()) {
1678   case AMDGPU::COPY:
1679   case AMDGPU::REG_SEQUENCE:
1680   case AMDGPU::PHI:
1681   case AMDGPU::INSERT_SUBREG:
1682     return RI.hasVGPRs(getOpRegClass(MI, 0));
1683   default:
1684     return RI.hasVGPRs(getOpRegClass(MI, OpNo));
1685   }
1686 }
1687 
1688 void SIInstrInfo::legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const {
1689   MachineBasicBlock::iterator I = MI;
1690   MachineBasicBlock *MBB = MI->getParent();
1691   MachineOperand &MO = MI->getOperand(OpIdx);
1692   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1693   unsigned RCID = get(MI->getOpcode()).OpInfo[OpIdx].RegClass;
1694   const TargetRegisterClass *RC = RI.getRegClass(RCID);
1695   unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1696   if (MO.isReg())
1697     Opcode = AMDGPU::COPY;
1698   else if (RI.isSGPRClass(RC))
1699     Opcode = AMDGPU::S_MOV_B32;
1700 
1701 
1702   const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
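  // Narrow the class to what the materialized value will actually live in:
  // VReg_64 if the operand's class overlaps a 64-bit VGPR class, otherwise a
  // single VGPR_32.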
1703   if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC))
1704     VRC = &AMDGPU::VReg_64RegClass;
1705   else
1706     VRC = &AMDGPU::VGPR_32RegClass;
1707 
1708   unsigned Reg = MRI.createVirtualRegister(VRC);
1709   DebugLoc DL = MBB->findDebugLoc(I);
1710   BuildMI(*MI->getParent(), I, DL, get(Opcode), Reg)
1711     .addOperand(MO);
1712   MO.ChangeToRegister(Reg, false);
1713 }
1714 
1715 unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI,
1716                                          MachineRegisterInfo &MRI,
1717                                          MachineOperand &SuperReg,
1718                                          const TargetRegisterClass *SuperRC,
1719                                          unsigned SubIdx,
1720                                          const TargetRegisterClass *SubRC)
1721                                          const {
1722   MachineBasicBlock *MBB = MI->getParent();
1723   DebugLoc DL = MI->getDebugLoc();
1724   unsigned SubReg = MRI.createVirtualRegister(SubRC);
1725 
1726   if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) {
1727     BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
1728       .addReg(SuperReg.getReg(), 0, SubIdx);
1729     return SubReg;
1730   }
1731 
1732   // Just in case the super register is itself a sub-register, copy it to a new
1733   // value so we don't need to worry about merging its subreg index with the
1734   // SubIdx passed to this function. The register coalescer should be able to
1735   // eliminate this extra copy.
1736   unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC);
1737 
1738   BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg)
1739     .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg());
1740 
1741   BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
1742     .addReg(NewSuperReg, 0, SubIdx);
1743 
1744   return SubReg;
1745 }
1746 
1747 MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
1748   MachineBasicBlock::iterator MII,
1749   MachineRegisterInfo &MRI,
1750   MachineOperand &Op,
1751   const TargetRegisterClass *SuperRC,
1752   unsigned SubIdx,
1753   const TargetRegisterClass *SubRC) const {
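  // For an immediate source, the requested half is returned directly as a new
  // immediate operand (sub0 is the low 32 bits, sub1 the high 32 bits);
  // register sources go through buildExtractSubReg.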
1754   if (Op.isImm()) {
1755     // XXX - Is there a better way to do this?
1756     if (SubIdx == AMDGPU::sub0)
1757       return MachineOperand::CreateImm(Op.getImm() & 0xFFFFFFFF);
1758     if (SubIdx == AMDGPU::sub1)
1759       return MachineOperand::CreateImm(Op.getImm() >> 32);
1760 
1761     llvm_unreachable("Unhandled register index for immediate");
1762   }
1763 
1764   unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
1765                                        SubIdx, SubRC);
1766   return MachineOperand::CreateReg(SubReg, false);
1767 }
1768 
1769 // Change the order of operands from (0, 1, 2) to (0, 2, 1)
1770 void SIInstrInfo::swapOperands(MachineBasicBlock::iterator Inst) const {
1771   assert(Inst->getNumExplicitOperands() == 3);
1772   MachineOperand Op1 = Inst->getOperand(1);
1773   Inst->RemoveOperand(1);
1774   Inst->addOperand(Op1);
1775 }
1776 
1777 bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
1778                                     const MCOperandInfo &OpInfo,
1779                                     const MachineOperand &MO) const {
1780   if (!MO.isReg())
1781     return false;
1782 
1783   unsigned Reg = MO.getReg();
1784   const TargetRegisterClass *RC =
1785     TargetRegisterInfo::isVirtualRegister(Reg) ?
1786     MRI.getRegClass(Reg) :
1787     RI.getPhysRegClass(Reg);
1788 
1789   const SIRegisterInfo *TRI =
1790       static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
1791   RC = TRI->getSubRegClass(RC, MO.getSubReg());
1792 
1793   // In order to be legal, the common sub-class must be equal to the
1794   // class of the current operand.  For example:
1795   //
1796   // v_mov_b32 s0 ; Operand defined as vsrc_32
1797   //              ; RI.getCommonSubClass(s0,vsrc_32) = sgpr ; LEGAL
1798   //
1799   // s_sendmsg 0, s0 ; Operand defined as m0reg
1800   //                 ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL
1801 
1802   return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC;
1803 }
1804 
1805 bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
1806                                      const MCOperandInfo &OpInfo,
1807                                      const MachineOperand &MO) const {
1808   if (MO.isReg())
1809     return isLegalRegOperand(MRI, OpInfo, MO);
1810 
1811   // Handle non-register types that are treated like immediates.
1812   assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
1813   return true;
1814 }
1815 
1816 bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx,
1817                                  const MachineOperand *MO) const {
1818   const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
1819   const MCInstrDesc &InstDesc = MI->getDesc();
1820   const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx];
1821   const TargetRegisterClass *DefinedRC =
1822       OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
1823   if (!MO)
1824     MO = &MI->getOperand(OpIdx);
1825 
1826   if (isVALU(*MI) &&
1827       usesConstantBus(MRI, *MO, DefinedRC->getSize())) {
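    // Using this operand would add a constant bus read, so scan the other
    // operands and reject it if a different SGPR (or another constant bus
    // value) is already on the bus.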
1828 
1829     RegSubRegPair SGPRUsed;
1830     if (MO->isReg())
1831       SGPRUsed = RegSubRegPair(MO->getReg(), MO->getSubReg());
1832 
1833     for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
1834       if (i == OpIdx)
1835         continue;
1836       const MachineOperand &Op = MI->getOperand(i);
1837       if (Op.isReg() &&
1838           (Op.getReg() != SGPRUsed.Reg || Op.getSubReg() != SGPRUsed.SubReg) &&
1839           usesConstantBus(MRI, Op, getOpSize(*MI, i))) {
1840         return false;
1841       }
1842     }
1843   }
1844 
1845   if (MO->isReg()) {
1846     assert(DefinedRC);
1847     return isLegalRegOperand(MRI, OpInfo, *MO);
1848   }
1849 
1850 
1851   // Handle non-register types that are treated like immediates.
1852   assert(MO->isImm() || MO->isTargetIndex() || MO->isFI());
1853 
1854   if (!DefinedRC) {
1855     // This operand expects an immediate.
1856     return true;
1857   }
1858 
1859   return isImmOperandLegal(MI, OpIdx, *MO);
1860 }
1861 
1862 void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
1863                                        MachineInstr *MI) const {
1864   unsigned Opc = MI->getOpcode();
1865   const MCInstrDesc &InstrDesc = get(Opc);
1866 
1867   int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
1868   MachineOperand &Src1 = MI->getOperand(Src1Idx);
1869 
  // If there is an implicit SGPR use such as the VCC use for
  // v_addc_u32 / v_subb_u32, it already takes the single allowed constant bus
  // use, so src0 must not be another SGPR.
  //
  // Note we do not need to worry about literal constants here. They are
  // disabled for the operand type of these instructions because they would
  // always violate the one constant bus use rule.
1876   bool HasImplicitSGPR = findImplicitSGPRRead(*MI) != AMDGPU::NoRegister;
1877   if (HasImplicitSGPR) {
1878     int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
1879     MachineOperand &Src0 = MI->getOperand(Src0Idx);
1880 
1881     if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg()))
1882       legalizeOpWithMove(MI, Src0Idx);
1883   }
1884 
  // VOP2 src0 operands support all operand types, so we don't need to check
  // their legality. If src1 is already legal, we don't need to do anything.
1887   if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1))
1888     return;
1889 
1890   // We do not use commuteInstruction here because it is too aggressive and will
1891   // commute if it is possible. We only want to commute here if it improves
1892   // legality. This can be called a fairly large number of times so don't waste
1893   // compile time pointlessly swapping and checking legality again.
1894   if (HasImplicitSGPR || !MI->isCommutable()) {
1895     legalizeOpWithMove(MI, Src1Idx);
1896     return;
1897   }
1898 
1899   int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
1900   MachineOperand &Src0 = MI->getOperand(Src0Idx);
1901 
1902   // If src0 can be used as src1, commuting will make the operands legal.
1903   // Otherwise we have to give up and insert a move.
1904   //
1905   // TODO: Other immediate-like operand kinds could be commuted if there was a
1906   // MachineOperand::ChangeTo* for them.
1907   if ((!Src1.isImm() && !Src1.isReg()) ||
1908       !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) {
1909     legalizeOpWithMove(MI, Src1Idx);
1910     return;
1911   }
1912 
1913   int CommutedOpc = commuteOpcode(*MI);
1914   if (CommutedOpc == -1) {
1915     legalizeOpWithMove(MI, Src1Idx);
1916     return;
1917   }
1918 
1919   MI->setDesc(get(CommutedOpc));
1920 
1921   unsigned Src0Reg = Src0.getReg();
1922   unsigned Src0SubReg = Src0.getSubReg();
1923   bool Src0Kill = Src0.isKill();
1924 
1925   if (Src1.isImm())
1926     Src0.ChangeToImmediate(Src1.getImm());
1927   else if (Src1.isReg()) {
1928     Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
1929     Src0.setSubReg(Src1.getSubReg());
1930   } else
1931     llvm_unreachable("Should only have register or immediate operands");
1932 
1933   Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
1934   Src1.setSubReg(Src0SubReg);
1935 }
1936 
1937 // Legalize VOP3 operands. Because all operand types are supported for any
1938 // operand, and since literal constants are not allowed and should never be
1939 // seen, we only need to worry about inserting copies if we use multiple SGPR
1940 // operands.
1941 void SIInstrInfo::legalizeOperandsVOP3(
1942   MachineRegisterInfo &MRI,
1943   MachineInstr *MI) const {
1944   unsigned Opc = MI->getOpcode();
1945 
1946   int VOP3Idx[3] = {
1947     AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
1948     AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
1949     AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
1950   };
1951 
1952   // Find the one SGPR operand we are allowed to use.
1953   unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx);
1954 
1955   for (unsigned i = 0; i < 3; ++i) {
1956     int Idx = VOP3Idx[i];
1957     if (Idx == -1)
1958       break;
1959     MachineOperand &MO = MI->getOperand(Idx);
1960 
1961     // We should never see a VOP3 instruction with an illegal immediate operand.
1962     if (!MO.isReg())
1963       continue;
1964 
1965     if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
1966       continue; // VGPRs are legal
1967 
1968     if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) {
1969       SGPRReg = MO.getReg();
1970       // We can use one SGPR in each VOP3 instruction.
1971       continue;
1972     }
1973 
1974     // If we make it this far, then the operand is not legal and we must
1975     // legalize it.
1976     legalizeOpWithMove(MI, Idx);
1977   }
1978 }
1979 
1980 unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr *UseMI,
1981                                  MachineRegisterInfo &MRI) const {
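  // Read the first lane of each 32-bit piece of the VGPR value with
  // V_READFIRSTLANE_B32 and reassemble the pieces into an SGPR-class register
  // with a REG_SEQUENCE.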
1982   const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
1983   const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
1984   unsigned DstReg = MRI.createVirtualRegister(SRC);
1985   unsigned SubRegs = VRC->getSize() / 4;
1986 
1987   SmallVector<unsigned, 8> SRegs;
1988   for (unsigned i = 0; i < SubRegs; ++i) {
1989     unsigned SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
1990     BuildMI(*UseMI->getParent(), UseMI, UseMI->getDebugLoc(),
1991             get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
1992             .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
1993     SRegs.push_back(SGPR);
1994   }
1995 
1996   MachineInstrBuilder MIB = BuildMI(*UseMI->getParent(), UseMI,
1997                                     UseMI->getDebugLoc(),
1998                                     get(AMDGPU::REG_SEQUENCE), DstReg);
1999   for (unsigned i = 0; i < SubRegs; ++i) {
2000     MIB.addReg(SRegs[i]);
2001     MIB.addImm(RI.getSubRegFromChannel(i));
2002   }
2003   return DstReg;
2004 }
2005 
2006 void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
2007                                        MachineInstr *MI) const {
2008 
  // If the pointer is stored in VGPRs, then we need to move it to
  // SGPRs using v_readfirstlane.  This is safe because we only select
  // loads with uniform pointers to SMRD instructions, so we know the
  // pointer value is uniform.
2013   MachineOperand *SBase = getNamedOperand(*MI, AMDGPU::OpName::sbase);
2014   if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
2015       unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
2016       SBase->setReg(SGPR);
2017   }
2018 }
2019 
2020 void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
2021   MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
2022 
2023   // Legalize VOP2
2024   if (isVOP2(*MI) || isVOPC(*MI)) {
2025     legalizeOperandsVOP2(MRI, MI);
2026     return;
2027   }
2028 
2029   // Legalize VOP3
2030   if (isVOP3(*MI)) {
2031     legalizeOperandsVOP3(MRI, MI);
2032     return;
2033   }
2034 
2035   // Legalize SMRD
2036   if (isSMRD(*MI)) {
2037     legalizeOperandsSMRD(MRI, MI);
2038     return;
2039   }
2040 
2041   // Legalize REG_SEQUENCE and PHI
  // The register class of the operands must match the register class of the
  // output.
2044   if (MI->getOpcode() == AMDGPU::PHI) {
2045     const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
2046     for (unsigned i = 1, e = MI->getNumOperands(); i != e; i+=2) {
2047       if (!MI->getOperand(i).isReg() ||
2048           !TargetRegisterInfo::isVirtualRegister(MI->getOperand(i).getReg()))
2049         continue;
2050       const TargetRegisterClass *OpRC =
2051               MRI.getRegClass(MI->getOperand(i).getReg());
2052       if (RI.hasVGPRs(OpRC)) {
2053         VRC = OpRC;
2054       } else {
2055         SRC = OpRC;
2056       }
2057     }
2058 
    // If any of the operands are VGPRs, then they all must be VGPRs;
    // otherwise we will create illegal VGPR->SGPR copies when legalizing
    // them.
2062     if (VRC || !RI.isSGPRClass(getOpRegClass(*MI, 0))) {
2063       if (!VRC) {
2064         assert(SRC);
2065         VRC = RI.getEquivalentVGPRClass(SRC);
2066       }
2067       RC = VRC;
2068     } else {
2069       RC = SRC;
2070     }
2071 
2072     // Update all the operands so they have the same type.
2073     for (unsigned I = 1, E = MI->getNumOperands(); I != E; I += 2) {
2074       MachineOperand &Op = MI->getOperand(I);
2075       if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
2076         continue;
2077       unsigned DstReg = MRI.createVirtualRegister(RC);
2078 
2079       // MI is a PHI instruction.
2080       MachineBasicBlock *InsertBB = MI->getOperand(I + 1).getMBB();
2081       MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
2082 
2083       BuildMI(*InsertBB, Insert, MI->getDebugLoc(), get(AMDGPU::COPY), DstReg)
2084         .addOperand(Op);
2085       Op.setReg(DstReg);
2086     }
2087   }
2088 
2089   // REG_SEQUENCE doesn't really require operand legalization, but if one has a
2090   // VGPR dest type and SGPR sources, insert copies so all operands are
2091   // VGPRs. This seems to help operand folding / the register coalescer.
2092   if (MI->getOpcode() == AMDGPU::REG_SEQUENCE) {
2093     MachineBasicBlock *MBB = MI->getParent();
2094     const TargetRegisterClass *DstRC = getOpRegClass(*MI, 0);
2095     if (RI.hasVGPRs(DstRC)) {
2096       // Update all the operands so they are VGPR register classes. These may
2097       // not be the same register class because REG_SEQUENCE supports mixing
2098       // subregister index types e.g. sub0_sub1 + sub2 + sub3
2099       for (unsigned I = 1, E = MI->getNumOperands(); I != E; I += 2) {
2100         MachineOperand &Op = MI->getOperand(I);
2101         if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
2102           continue;
2103 
2104         const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
2105         const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
2106         if (VRC == OpRC)
2107           continue;
2108 
2109         unsigned DstReg = MRI.createVirtualRegister(VRC);
2110 
2111         BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::COPY), DstReg)
2112           .addOperand(Op);
2113 
2114         Op.setReg(DstReg);
2115         Op.setIsKill();
2116       }
2117     }
2118 
2119     return;
2120   }
2121 
2122   // Legalize INSERT_SUBREG
2123   // src0 must have the same register class as dst
2124   if (MI->getOpcode() == AMDGPU::INSERT_SUBREG) {
2125     unsigned Dst = MI->getOperand(0).getReg();
2126     unsigned Src0 = MI->getOperand(1).getReg();
2127     const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
2128     const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
2129     if (DstRC != Src0RC) {
2130       MachineBasicBlock &MBB = *MI->getParent();
2131       unsigned NewSrc0 = MRI.createVirtualRegister(DstRC);
2132       BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::COPY), NewSrc0)
2133               .addReg(Src0);
2134       MI->getOperand(1).setReg(NewSrc0);
2135     }
2136     return;
2137   }
2138 
2139   // Legalize MIMG
2140   if (isMIMG(*MI)) {
2141     MachineOperand *SRsrc = getNamedOperand(*MI, AMDGPU::OpName::srsrc);
2142     if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) {
2143       unsigned SGPR = readlaneVGPRToSGPR(SRsrc->getReg(), MI, MRI);
2144       SRsrc->setReg(SGPR);
2145     }
2146 
2147     MachineOperand *SSamp = getNamedOperand(*MI, AMDGPU::OpName::ssamp);
2148     if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) {
2149       unsigned SGPR = readlaneVGPRToSGPR(SSamp->getReg(), MI, MRI);
2150       SSamp->setReg(SGPR);
2151     }
2152     return;
2153   }
2154 
2155   // Legalize MUBUF* instructions
2156   // FIXME: If we start using the non-addr64 instructions for compute, we
2157   // may need to legalize them here.
2158   int SRsrcIdx =
2159       AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
2160   if (SRsrcIdx != -1) {
2161     // We have an MUBUF instruction
2162     MachineOperand *SRsrc = &MI->getOperand(SRsrcIdx);
2163     unsigned SRsrcRC = get(MI->getOpcode()).OpInfo[SRsrcIdx].RegClass;
2164     if (RI.getCommonSubClass(MRI.getRegClass(SRsrc->getReg()),
2165                                              RI.getRegClass(SRsrcRC))) {
2166       // The operands are legal.
      // FIXME: We may need to legalize operands besides srsrc.
2168       return;
2169     }
2170 
2171     MachineBasicBlock &MBB = *MI->getParent();
2172 
2173     // Extract the ptr from the resource descriptor.
2174     unsigned SRsrcPtr = buildExtractSubReg(MI, MRI, *SRsrc,
2175       &AMDGPU::VReg_128RegClass, AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
2176 
2177     // Create an empty resource descriptor
2178     unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2179     unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
2180     unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
2181     unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
2182     uint64_t RsrcDataFormat = getDefaultRsrcDataFormat();
2183 
2184     // Zero64 = 0
2185     BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B64),
2186             Zero64)
2187             .addImm(0);
2188 
2189     // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
2190     BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
2191             SRsrcFormatLo)
2192             .addImm(RsrcDataFormat & 0xFFFFFFFF);
2193 
2194     // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
2195     BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
2196             SRsrcFormatHi)
2197             .addImm(RsrcDataFormat >> 32);
2198 
2199     // NewSRsrc = {Zero64, SRsrcFormat}
2200     BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewSRsrc)
2201       .addReg(Zero64)
2202       .addImm(AMDGPU::sub0_sub1)
2203       .addReg(SRsrcFormatLo)
2204       .addImm(AMDGPU::sub2)
2205       .addReg(SRsrcFormatHi)
2206       .addImm(AMDGPU::sub3);
2207 
2208     MachineOperand *VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr);
2209     unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
2210     if (VAddr) {
2211       // This is already an ADDR64 instruction so we need to add the pointer
2212       // extracted from the resource descriptor to the current value of VAddr.
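      // The 64-bit add is done as a low 32-bit add that writes its carry to
      // VCC, followed by an add-with-carry for the high half.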
2213       unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2214       unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2215 
2216       // NewVaddrLo = SRsrcPtr:sub0 + VAddr:sub0
2217       DebugLoc DL = MI->getDebugLoc();
2218       BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo)
2219         .addReg(SRsrcPtr, 0, AMDGPU::sub0)
2220         .addReg(VAddr->getReg(), 0, AMDGPU::sub0);
2221 
2222       // NewVaddrHi = SRsrcPtr:sub1 + VAddr:sub1
2223       BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi)
2224         .addReg(SRsrcPtr, 0, AMDGPU::sub1)
2225         .addReg(VAddr->getReg(), 0, AMDGPU::sub1);
2226 
2227       // NewVaddr = {NewVaddrHi, NewVaddrLo}
2228       BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
2229         .addReg(NewVAddrLo)
2230         .addImm(AMDGPU::sub0)
2231         .addReg(NewVAddrHi)
2232         .addImm(AMDGPU::sub1);
2233     } else {
      // This instruction is the _OFFSET variant, so we need to convert it to
      // ADDR64.
2236       assert(MBB.getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration()
2237              < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
2238              "FIXME: Need to emit flat atomics here");
2239 
2240       MachineOperand *VData = getNamedOperand(*MI, AMDGPU::OpName::vdata);
2241       MachineOperand *Offset = getNamedOperand(*MI, AMDGPU::OpName::offset);
2242       MachineOperand *SOffset = getNamedOperand(*MI, AMDGPU::OpName::soffset);
2243       unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI->getOpcode());
2244 
      // Atomics with return have an additional tied operand and are missing
      // some of the special bits.
2247       MachineOperand *VDataIn = getNamedOperand(*MI, AMDGPU::OpName::vdata_in);
2248       MachineInstr *Addr64;
2249 
2250       if (!VDataIn) {
2251         // Regular buffer load / store.
2252         MachineInstrBuilder MIB
2253           = BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode))
2254           .addOperand(*VData)
2255           .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
2256           // This will be replaced later
2257           // with the new value of vaddr.
2258           .addOperand(*SRsrc)
2259           .addOperand(*SOffset)
2260           .addOperand(*Offset);
2261 
2262         // Atomics do not have this operand.
2263         if (const MachineOperand *GLC
2264             = getNamedOperand(*MI, AMDGPU::OpName::glc)) {
2265           MIB.addImm(GLC->getImm());
2266         }
2267 
2268         MIB.addImm(getNamedImmOperand(*MI, AMDGPU::OpName::slc));
2269 
2270         if (const MachineOperand *TFE
2271             = getNamedOperand(*MI, AMDGPU::OpName::tfe)) {
2272           MIB.addImm(TFE->getImm());
2273         }
2274 
2275         MIB.setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
2276         Addr64 = MIB;
2277       } else {
2278         // Atomics with return.
2279         Addr64 = BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode))
2280           .addOperand(*VData)
2281           .addOperand(*VDataIn)
2282           .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
2283           // This will be replaced later
2284           // with the new value of vaddr.
2285           .addOperand(*SRsrc)
2286           .addOperand(*SOffset)
2287           .addOperand(*Offset)
2288           .addImm(getNamedImmOperand(*MI, AMDGPU::OpName::slc))
2289           .setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
2290       }
2291 
2292       MI->removeFromParent();
2293       MI = Addr64;
2294 
      // NewVaddr = {SRsrcPtr:sub0, SRsrcPtr:sub1}
2296       BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
2297         .addReg(SRsrcPtr, 0, AMDGPU::sub0)
2298         .addImm(AMDGPU::sub0)
2299         .addReg(SRsrcPtr, 0, AMDGPU::sub1)
2300         .addImm(AMDGPU::sub1);
2301 
2302       VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr);
2303       SRsrc = getNamedOperand(*MI, AMDGPU::OpName::srsrc);
2304     }
2305 
2306     // Update the instruction to use NewVaddr
2307     VAddr->setReg(NewVAddr);
2308     // Update the instruction to use NewSRsrc
2309     SRsrc->setReg(NewSRsrc);
2310   }
2311 }
2312 
2313 void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
2314   SmallVector<MachineInstr *, 128> Worklist;
2315   Worklist.push_back(&TopInst);
2316 
2317   while (!Worklist.empty()) {
2318     MachineInstr *Inst = Worklist.pop_back_val();
2319     MachineBasicBlock *MBB = Inst->getParent();
2320     MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
2321 
2322     unsigned Opcode = Inst->getOpcode();
2323     unsigned NewOpcode = getVALUOp(*Inst);
2324 
2325     // Handle some special cases
2326     switch (Opcode) {
2327     default:
2328       break;
2329     case AMDGPU::S_AND_B64:
2330       splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64);
2331       Inst->eraseFromParent();
2332       continue;
2333 
2334     case AMDGPU::S_OR_B64:
2335       splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_OR_B32_e64);
2336       Inst->eraseFromParent();
2337       continue;
2338 
2339     case AMDGPU::S_XOR_B64:
2340       splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64);
2341       Inst->eraseFromParent();
2342       continue;
2343 
2344     case AMDGPU::S_NOT_B64:
2345       splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::V_NOT_B32_e32);
2346       Inst->eraseFromParent();
2347       continue;
2348 
2349     case AMDGPU::S_BCNT1_I32_B64:
2350       splitScalar64BitBCNT(Worklist, Inst);
2351       Inst->eraseFromParent();
2352       continue;
2353 
2354     case AMDGPU::S_BFE_I64: {
2355       splitScalar64BitBFE(Worklist, Inst);
2356       Inst->eraseFromParent();
2357       continue;
2358     }
2359 
2360     case AMDGPU::S_LSHL_B32:
2361       if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
2362         NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
2363         swapOperands(Inst);
2364       }
2365       break;
2366     case AMDGPU::S_ASHR_I32:
2367       if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
2368         NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
2369         swapOperands(Inst);
2370       }
2371       break;
2372     case AMDGPU::S_LSHR_B32:
2373       if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
2374         NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
2375         swapOperands(Inst);
2376       }
2377       break;
2378     case AMDGPU::S_LSHL_B64:
2379       if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
2380         NewOpcode = AMDGPU::V_LSHLREV_B64;
2381         swapOperands(Inst);
2382       }
2383       break;
2384     case AMDGPU::S_ASHR_I64:
2385       if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
2386         NewOpcode = AMDGPU::V_ASHRREV_I64;
2387         swapOperands(Inst);
2388       }
2389       break;
2390     case AMDGPU::S_LSHR_B64:
2391       if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
2392         NewOpcode = AMDGPU::V_LSHRREV_B64;
2393         swapOperands(Inst);
2394       }
2395       break;
2396 
2397     case AMDGPU::S_ABS_I32:
2398       lowerScalarAbs(Worklist, Inst);
2399       Inst->eraseFromParent();
2400       continue;
2401 
2402     case AMDGPU::S_CBRANCH_SCC0:
2403     case AMDGPU::S_CBRANCH_SCC1:
2404       // Clear unused bits of vcc
2405       BuildMI(*MBB, Inst, Inst->getDebugLoc(), get(AMDGPU::S_AND_B64), AMDGPU::VCC)
2406               .addReg(AMDGPU::EXEC)
2407               .addReg(AMDGPU::VCC);
2408       break;
2409 
2410     case AMDGPU::S_BFE_U64:
2411     case AMDGPU::S_BFM_B64:
2412       llvm_unreachable("Moving this op to VALU not implemented");
2413     }
2414 
2415     if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
2416       // We cannot move this instruction to the VALU, so we should try to
2417       // legalize its operands instead.
2418       legalizeOperands(Inst);
2419       continue;
2420     }
2421 
2422     // Use the new VALU Opcode.
2423     const MCInstrDesc &NewDesc = get(NewOpcode);
2424     Inst->setDesc(NewDesc);
2425 
    // Remove any references to SCC. Vector instructions can't read from it, and
    // we're just about to add the implicit use / defs of VCC, and we don't want
    // both.
2429     for (unsigned i = Inst->getNumOperands() - 1; i > 0; --i) {
2430       MachineOperand &Op = Inst->getOperand(i);
2431       if (Op.isReg() && Op.getReg() == AMDGPU::SCC) {
2432         Inst->RemoveOperand(i);
2433         addSCCDefUsersToVALUWorklist(Inst, Worklist);
2434       }
2435     }
2436 
2437     if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
2438       // We are converting these to a BFE, so we need to add the missing
2439       // operands for the size and offset.
2440       unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
2441       Inst->addOperand(MachineOperand::CreateImm(0));
2442       Inst->addOperand(MachineOperand::CreateImm(Size));
2443 
2444     } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
2445       // The VALU version adds the second operand to the result, so insert an
2446       // extra 0 operand.
2447       Inst->addOperand(MachineOperand::CreateImm(0));
2448     }
2449 
2450     Inst->addImplicitDefUseOperands(*Inst->getParent()->getParent());
2451 
2452     if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
2453       const MachineOperand &OffsetWidthOp = Inst->getOperand(2);
2454       // If we need to move this to VGPRs, we need to unpack the second operand
2455       // back into the 2 separate ones for bit offset and width.
2456       assert(OffsetWidthOp.isImm() &&
2457              "Scalar BFE is only implemented for constant width and offset");
2458       uint32_t Imm = OffsetWidthOp.getImm();
2459 
2460       uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
2461       uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
2462       Inst->RemoveOperand(2); // Remove old immediate.
2463       Inst->addOperand(MachineOperand::CreateImm(Offset));
2464       Inst->addOperand(MachineOperand::CreateImm(BitWidth));
2465     }
2466 
2467     bool HasDst = Inst->getOperand(0).isReg() && Inst->getOperand(0).isDef();
2468     unsigned NewDstReg = AMDGPU::NoRegister;
2469     if (HasDst) {
2470       // Update the destination register class.
2471       const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*Inst);
2472       if (!NewDstRC)
2473         continue;
2474 
2475       unsigned DstReg = Inst->getOperand(0).getReg();
2476       NewDstReg = MRI.createVirtualRegister(NewDstRC);
2477       MRI.replaceRegWith(DstReg, NewDstReg);
2478     }
2479 
2480     // Legalize the operands
2481     legalizeOperands(Inst);
2482 
2483     if (HasDst)
2484      addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
2485   }
2486 }
2487 
2488 //===----------------------------------------------------------------------===//
2489 // Indirect addressing callbacks
2490 //===----------------------------------------------------------------------===//
2491 
2492 const TargetRegisterClass *SIInstrInfo::getIndirectAddrRegClass() const {
2493   return &AMDGPU::VGPR_32RegClass;
2494 }
2495 
2496 void SIInstrInfo::lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist,
2497                                  MachineInstr *Inst) const {
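  // Expand to VALU instructions using abs(x) = max(x, 0 - x).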
2498   MachineBasicBlock &MBB = *Inst->getParent();
2499   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
2500   MachineBasicBlock::iterator MII = Inst;
2501   DebugLoc DL = Inst->getDebugLoc();
2502 
2503   MachineOperand &Dest = Inst->getOperand(0);
2504   MachineOperand &Src = Inst->getOperand(1);
2505   unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2506   unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2507 
2508   BuildMI(MBB, MII, DL, get(AMDGPU::V_SUB_I32_e32), TmpReg)
2509     .addImm(0)
2510     .addReg(Src.getReg());
2511 
2512   BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
2513     .addReg(Src.getReg())
2514     .addReg(TmpReg);
2515 
2516   MRI.replaceRegWith(Dest.getReg(), ResultReg);
2517   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
2518 }
2519 
2520 void SIInstrInfo::splitScalar64BitUnaryOp(
2521   SmallVectorImpl<MachineInstr *> &Worklist,
2522   MachineInstr *Inst,
2523   unsigned Opcode) const {
2524   MachineBasicBlock &MBB = *Inst->getParent();
2525   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
2526 
2527   MachineOperand &Dest = Inst->getOperand(0);
2528   MachineOperand &Src0 = Inst->getOperand(1);
2529   DebugLoc DL = Inst->getDebugLoc();
2530 
2531   MachineBasicBlock::iterator MII = Inst;
2532 
2533   const MCInstrDesc &InstDesc = get(Opcode);
2534   const TargetRegisterClass *Src0RC = Src0.isReg() ?
2535     MRI.getRegClass(Src0.getReg()) :
2536     &AMDGPU::SGPR_32RegClass;
2537 
2538   const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
2539 
2540   MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
2541                                                        AMDGPU::sub0, Src0SubRC);
2542 
2543   const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
2544   const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
2545   const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
2546 
2547   unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
2548   BuildMI(MBB, MII, DL, InstDesc, DestSub0)
2549     .addOperand(SrcReg0Sub0);
2550 
2551   MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
2552                                                        AMDGPU::sub1, Src0SubRC);
2553 
2554   unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
2555   BuildMI(MBB, MII, DL, InstDesc, DestSub1)
2556     .addOperand(SrcReg0Sub1);
2557 
2558   unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
2559   BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
2560     .addReg(DestSub0)
2561     .addImm(AMDGPU::sub0)
2562     .addReg(DestSub1)
2563     .addImm(AMDGPU::sub1);
2564 
2565   MRI.replaceRegWith(Dest.getReg(), FullDestReg);
2566 
2567   // We don't need to legalizeOperands here because for a single operand, src0
2568   // will support any kind of input.
2569 
2570   // Move all users of this moved value.
2571   addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
2572 }
2573 
2574 void SIInstrInfo::splitScalar64BitBinaryOp(
2575   SmallVectorImpl<MachineInstr *> &Worklist,
2576   MachineInstr *Inst,
2577   unsigned Opcode) const {
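  // The 64-bit operation is split into two 32-bit VALU operations on the sub0
  // and sub1 halves of the sources; the halves are then recombined with a
  // REG_SEQUENCE into a 64-bit VGPR result.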
2578   MachineBasicBlock &MBB = *Inst->getParent();
2579   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
2580 
2581   MachineOperand &Dest = Inst->getOperand(0);
2582   MachineOperand &Src0 = Inst->getOperand(1);
2583   MachineOperand &Src1 = Inst->getOperand(2);
2584   DebugLoc DL = Inst->getDebugLoc();
2585 
2586   MachineBasicBlock::iterator MII = Inst;
2587 
2588   const MCInstrDesc &InstDesc = get(Opcode);
2589   const TargetRegisterClass *Src0RC = Src0.isReg() ?
2590     MRI.getRegClass(Src0.getReg()) :
2591     &AMDGPU::SGPR_32RegClass;
2592 
2593   const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
2594   const TargetRegisterClass *Src1RC = Src1.isReg() ?
2595     MRI.getRegClass(Src1.getReg()) :
2596     &AMDGPU::SGPR_32RegClass;
2597 
2598   const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);
2599 
2600   MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
2601                                                        AMDGPU::sub0, Src0SubRC);
2602   MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
2603                                                        AMDGPU::sub0, Src1SubRC);
2604 
2605   const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
2606   const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
2607   const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
2608 
2609   unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
2610   MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub0)
2611     .addOperand(SrcReg0Sub0)
2612     .addOperand(SrcReg1Sub0);
2613 
2614   MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
2615                                                        AMDGPU::sub1, Src0SubRC);
2616   MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
2617                                                        AMDGPU::sub1, Src1SubRC);
2618 
2619   unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
2620   MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1)
2621     .addOperand(SrcReg0Sub1)
2622     .addOperand(SrcReg1Sub1);
2623 
2624   unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
2625   BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
2626     .addReg(DestSub0)
2627     .addImm(AMDGPU::sub0)
2628     .addReg(DestSub1)
2629     .addImm(AMDGPU::sub1);
2630 
2631   MRI.replaceRegWith(Dest.getReg(), FullDestReg);
2632 
2633   // Try to legalize the operands in case we need to swap the order to keep it
2634   // valid.
2635   legalizeOperands(LoHalf);
2636   legalizeOperands(HiHalf);
2637 
  // Move all users of this moved value.
2639   addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
2640 }
2641 
2642 void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist,
2643                                        MachineInstr *Inst) const {
2644   MachineBasicBlock &MBB = *Inst->getParent();
2645   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
2646 
2647   MachineBasicBlock::iterator MII = Inst;
2648   DebugLoc DL = Inst->getDebugLoc();
2649 
2650   MachineOperand &Dest = Inst->getOperand(0);
2651   MachineOperand &Src = Inst->getOperand(1);
2652 
2653   const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
2654   const TargetRegisterClass *SrcRC = Src.isReg() ?
2655     MRI.getRegClass(Src.getReg()) :
2656     &AMDGPU::SGPR_32RegClass;
2657 
2658   unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2659   unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2660 
2661   const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0);
2662 
2663   MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
2664                                                       AMDGPU::sub0, SrcSubRC);
2665   MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
2666                                                       AMDGPU::sub1, SrcSubRC);
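  // popcount(lo) then popcount(hi): the VALU BCNT adds its second operand to
  // the result, so the low-half count is fed into the second instruction.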
2667 
2668   BuildMI(MBB, MII, DL, InstDesc, MidReg)
2669     .addOperand(SrcRegSub0)
2670     .addImm(0);
2671 
2672   BuildMI(MBB, MII, DL, InstDesc, ResultReg)
2673     .addOperand(SrcRegSub1)
2674     .addReg(MidReg);
2675 
2676   MRI.replaceRegWith(Dest.getReg(), ResultReg);
2677 
  // We don't need to legalize operands here. src0 for either instruction can be
  // an SGPR, and the second input is unused or determined here.
2680   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
2681 }
2682 
2683 void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
2684                                       MachineInstr *Inst) const {
2685   MachineBasicBlock &MBB = *Inst->getParent();
2686   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
2687   MachineBasicBlock::iterator MII = Inst;
2688   DebugLoc DL = Inst->getDebugLoc();
2689 
2690   MachineOperand &Dest = Inst->getOperand(0);
2691   uint32_t Imm = Inst->getOperand(2).getImm();
2692   uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
2693   uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
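  // For example, an immediate of 0x100000 encodes offset 0 and width 16,
  // i.e. a sign extension from i16.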
2694 
2695   (void) Offset;
2696 
2697   // Only sext_inreg cases handled.
2698   assert(Inst->getOpcode() == AMDGPU::S_BFE_I64 &&
2699          BitWidth <= 32 &&
2700          Offset == 0 &&
2701          "Not implemented");
2702 
2703   if (BitWidth < 32) {
2704     unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2705     unsigned MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2706     unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
2707 
2708     BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo)
2709       .addReg(Inst->getOperand(1).getReg(), 0, AMDGPU::sub0)
2710       .addImm(0)
2711       .addImm(BitWidth);
2712 
2713     BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
2714       .addImm(31)
2715       .addReg(MidRegLo);
2716 
2717     BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
2718       .addReg(MidRegLo)
2719       .addImm(AMDGPU::sub0)
2720       .addReg(MidRegHi)
2721       .addImm(AMDGPU::sub1);
2722 
2723     MRI.replaceRegWith(Dest.getReg(), ResultReg);
2724     addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
2725     return;
2726   }
2727 
2728   MachineOperand &Src = Inst->getOperand(1);
2729   unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2730   unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
2731 
2732   BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
2733     .addImm(31)
2734     .addReg(Src.getReg(), 0, AMDGPU::sub0);
2735 
2736   BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
2737     .addReg(Src.getReg(), 0, AMDGPU::sub0)
2738     .addImm(AMDGPU::sub0)
2739     .addReg(TmpReg)
2740     .addImm(AMDGPU::sub1);
2741 
2742   MRI.replaceRegWith(Dest.getReg(), ResultReg);
2743   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
2744 }
2745 
2746 void SIInstrInfo::addUsersToMoveToVALUWorklist(
2747   unsigned DstReg,
2748   MachineRegisterInfo &MRI,
2749   SmallVectorImpl<MachineInstr *> &Worklist) const {
2750   for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
2751          E = MRI.use_end(); I != E; ++I) {
2752     MachineInstr &UseMI = *I->getParent();
2753     if (!canReadVGPR(UseMI, I.getOperandNo())) {
2754       Worklist.push_back(&UseMI);
2755     }
2756   }
2757 }
2758 
2759 void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineInstr *SCCDefInst,
2760                               SmallVectorImpl<MachineInstr *> &Worklist) const {
2761   // This assumes that all the users of SCC are in the same block
2762   // as the SCC def.
2763   for (MachineBasicBlock::iterator I = SCCDefInst,
2764        E = SCCDefInst->getParent()->end(); I != E; ++I) {
2765 
2766     // Exit if we find another SCC def.
2767     if (I->findRegisterDefOperandIdx(AMDGPU::SCC) != -1)
2768       return;
2769 
2770     if (I->findRegisterUseOperandIdx(AMDGPU::SCC) != -1)
2771       Worklist.push_back(I);
2772   }
2773 }
2774 
2775 const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
2776   const MachineInstr &Inst) const {
2777   const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
2778 
2779   switch (Inst.getOpcode()) {
2780   // For target instructions, getOpRegClass just returns the virtual register
2781   // class associated with the operand, so we need to find an equivalent VGPR
2782   // register class in order to move the instruction to the VALU.
2783   case AMDGPU::COPY:
2784   case AMDGPU::PHI:
2785   case AMDGPU::REG_SEQUENCE:
2786   case AMDGPU::INSERT_SUBREG:
2787     if (RI.hasVGPRs(NewDstRC))
2788       return nullptr;
2789 
2790     NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
2791     if (!NewDstRC)
2792       return nullptr;
2793     return NewDstRC;
2794   default:
2795     return NewDstRC;
2796   }
2797 }
2798 
2799 // Find the one SGPR operand we are allowed to use.
2800 unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI,
2801                                    int OpIndices[3]) const {
2802   const MCInstrDesc &Desc = MI->getDesc();
2803 
2804   // Find the one SGPR operand we are allowed to use.
2805   //
2806   // First we need to consider the instruction's operand requirements before
2807   // legalizing. Some operands are required to be SGPRs, such as implicit uses
2808   // of VCC, but we are still bound by the constant bus requirement to only use
2809   // one.
2810   //
2811   // If the operand's class is an SGPR, we can never move it.
2812 
2813   unsigned SGPRReg = findImplicitSGPRRead(*MI);
2814   if (SGPRReg != AMDGPU::NoRegister)
2815     return SGPRReg;
2816 
2817   unsigned UsedSGPRs[3] = { AMDGPU::NoRegister };
2818   const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
2819 
2820   for (unsigned i = 0; i < 3; ++i) {
2821     int Idx = OpIndices[i];
2822     if (Idx == -1)
2823       break;
2824 
2825     const MachineOperand &MO = MI->getOperand(Idx);
2826     if (!MO.isReg())
2827       continue;
2828 
2829     // Is this operand statically required to be an SGPR based on the operand
2830     // constraints?
2831     const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass);
2832     bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
2833     if (IsRequiredSGPR)
2834       return MO.getReg();
2835 
    // If this could be a VGPR or an SGPR, check the dynamic register class.
    unsigned Reg = MO.getReg();
    const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
    if (RI.isSGPRClass(RegRC))
      UsedSGPRs[i] = Reg;
  }

  // We don't have a required SGPR operand, so we have a bit more freedom in
  // selecting operands to move.

  // Try to select the most used SGPR. If an SGPR is equal to one of the
  // others, we choose that.
  //
  // e.g.
  // V_FMA_F32 v0, s0, s0, s0 -> No moves
  // V_FMA_F32 v0, s0, s1, s0 -> Move s1

  // TODO: If some of the operands are 64-bit SGPRs and some are 32-bit, we
  // should prefer those.

  if (UsedSGPRs[0] != AMDGPU::NoRegister) {
    if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
      SGPRReg = UsedSGPRs[0];
  }

  if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) {
    if (UsedSGPRs[1] == UsedSGPRs[2])
      SGPRReg = UsedSGPRs[1];
  }

  return SGPRReg;
}

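// Mark the range of VGPRs used for indirect addressing in this function as
// reserved, in every VGPR register class that overlaps the range.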
void SIInstrInfo::reserveIndirectRegisters(BitVector &Reserved,
                                           const MachineFunction &MF) const {
  int End = getIndirectIndexEnd(MF);
  int Begin = getIndirectIndexBegin(MF);

  if (End == -1)
    return;

  for (int Index = Begin; Index <= End; ++Index)
    Reserved.set(AMDGPU::VGPR_32RegClass.getRegister(Index));

  for (int Index = std::max(0, Begin - 1); Index <= End; ++Index)
    Reserved.set(AMDGPU::VReg_64RegClass.getRegister(Index));

  for (int Index = std::max(0, Begin - 2); Index <= End; ++Index)
    Reserved.set(AMDGPU::VReg_96RegClass.getRegister(Index));

  for (int Index = std::max(0, Begin - 3); Index <= End; ++Index)
    Reserved.set(AMDGPU::VReg_128RegClass.getRegister(Index));

  for (int Index = std::max(0, Begin - 7); Index <= End; ++Index)
    Reserved.set(AMDGPU::VReg_256RegClass.getRegister(Index));

  for (int Index = std::max(0, Begin - 15); Index <= End; ++Index)
    Reserved.set(AMDGPU::VReg_512RegClass.getRegister(Index));
}

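// Look up an operand of MI by its named index, e.g.
// getNamedOperand(MI, AMDGPU::OpName::src0); returns nullptr when the opcode
// has no such operand.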
MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
                                             unsigned OperandName) const {
  int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
  if (Idx == -1)
    return nullptr;

  return &MI.getOperand(Idx);
}

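// Build the default data-format dword of a buffer resource descriptor, with
// additional bits set when targeting the AMD HSA OS (and MTYPE on VI+).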
uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
  uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
  if (ST.isAmdHsaOS()) {
    RsrcDataFormat |= (1ULL << 56);

    if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      // Set MTYPE = 2
      RsrcDataFormat |= (2ULL << 59);
  }

  return RsrcDataFormat;
}

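// Compose dwords 2 and 3 of the scratch buffer resource descriptor: the
// default data format, TID_ENABLE, the maximum size, and the private element
// size for this subtarget.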
uint64_t SIInstrInfo::getScratchRsrcWords23() const {
  uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
                    AMDGPU::RSRC_TID_ENABLE |
                    0xffffffff; // Size

  uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1;

  Rsrc23 |= (EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT);

  // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
  // Clear them unless we want a huge stride.
  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
    Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;

  return Rsrc23;
}

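// Only scalar memory reads (SMRD) are currently classified as low latency.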
bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr *MI) const {
  unsigned Opc = MI->getOpcode();

  return isSMRD(Opc);
}

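// Buffer (MUBUF/MTBUF) and image (MIMG) memory operations are classified as
// high latency.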
bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr *MI) const {
  unsigned Opc = MI->getOpcode();

  return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc);
}

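// Names for the AMDGPU target index operands, used when serializing MIR.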
ArrayRef<std::pair<int, const char *>>
SIInstrInfo::getSerializableTargetIndices() const {
  static const std::pair<int, const char *> TargetIndices[] = {
      {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
      {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
      {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
      {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
      {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
  return makeArrayRef(TargetIndices);
}