1 //===- SIInstrInfo.cpp - SI Instruction Information  ----------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// SI Implementation of TargetInstrInfo.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "SIInstrInfo.h"
15 #include "AMDGPU.h"
16 #include "AMDGPUSubtarget.h"
17 #include "GCNHazardRecognizer.h"
18 #include "SIDefines.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "SIRegisterInfo.h"
21 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
22 #include "Utils/AMDGPUBaseInfo.h"
23 #include "llvm/ADT/APInt.h"
24 #include "llvm/ADT/ArrayRef.h"
25 #include "llvm/ADT/SmallVector.h"
26 #include "llvm/ADT/StringRef.h"
27 #include "llvm/ADT/iterator_range.h"
28 #include "llvm/Analysis/AliasAnalysis.h"
29 #include "llvm/Analysis/MemoryLocation.h"
30 #include "llvm/Analysis/ValueTracking.h"
31 #include "llvm/CodeGen/MachineBasicBlock.h"
32 #include "llvm/CodeGen/MachineDominators.h"
33 #include "llvm/CodeGen/MachineFrameInfo.h"
34 #include "llvm/CodeGen/MachineFunction.h"
35 #include "llvm/CodeGen/MachineInstr.h"
36 #include "llvm/CodeGen/MachineInstrBuilder.h"
37 #include "llvm/CodeGen/MachineInstrBundle.h"
38 #include "llvm/CodeGen/MachineMemOperand.h"
39 #include "llvm/CodeGen/MachineOperand.h"
40 #include "llvm/CodeGen/MachineRegisterInfo.h"
41 #include "llvm/CodeGen/RegisterScavenging.h"
42 #include "llvm/CodeGen/ScheduleDAG.h"
43 #include "llvm/CodeGen/SelectionDAGNodes.h"
44 #include "llvm/CodeGen/TargetOpcodes.h"
45 #include "llvm/CodeGen/TargetRegisterInfo.h"
46 #include "llvm/IR/DebugLoc.h"
47 #include "llvm/IR/DiagnosticInfo.h"
48 #include "llvm/IR/Function.h"
49 #include "llvm/IR/InlineAsm.h"
50 #include "llvm/IR/LLVMContext.h"
51 #include "llvm/MC/MCInstrDesc.h"
52 #include "llvm/Support/Casting.h"
53 #include "llvm/Support/CommandLine.h"
54 #include "llvm/Support/Compiler.h"
55 #include "llvm/Support/ErrorHandling.h"
56 #include "llvm/Support/MachineValueType.h"
57 #include "llvm/Support/MathExtras.h"
58 #include "llvm/Target/TargetMachine.h"
59 #include <cassert>
60 #include <cstdint>
61 #include <iterator>
62 #include <utility>
63 
64 using namespace llvm;
65 
66 #define GET_INSTRINFO_CTOR_DTOR
67 #include "AMDGPUGenInstrInfo.inc"
68 
69 namespace llvm {
70 namespace AMDGPU {
71 #define GET_D16ImageDimIntrinsics_IMPL
72 #define GET_ImageDimIntrinsicTable_IMPL
73 #define GET_RsrcIntrinsics_IMPL
74 #include "AMDGPUGenSearchableTables.inc"
75 }
76 }
77 
78 
79 // Must be at least 4 to be able to branch over minimum unconditional branch
80 // code. This is only for making it possible to write reasonably small tests for
81 // long branches.
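// For example, passing -amdgpu-s-branch-bits=5 to llc artificially shrinks the
// usable branch range so long-branch expansion can be exercised in small tests.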
82 static cl::opt<unsigned>
83 BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
84                  cl::desc("Restrict range of branch instructions (DEBUG)"));
85 
86 SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
87   : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
88     RI(ST), ST(ST) {}
89 
90 //===----------------------------------------------------------------------===//
91 // TargetInstrInfo callbacks
92 //===----------------------------------------------------------------------===//
93 
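/// Returns the number of operands of \p Node, not counting any trailing glue
/// operands.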
94 static unsigned getNumOperandsNoGlue(SDNode *Node) {
95   unsigned N = Node->getNumOperands();
96   while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
97     --N;
98   return N;
99 }
100 
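/// Returns the chain operand of \p Load, i.e. the last non-glue operand.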
101 static SDValue findChainOperand(SDNode *Load) {
102   SDValue LastOp = Load->getOperand(getNumOperandsNoGlue(Load) - 1);
103   assert(LastOp.getValueType() == MVT::Other && "Chain missing from load node");
104   return LastOp;
105 }
106 
/// Returns true if both nodes have the same value for the given
/// operand \p OpName, or if neither node has this operand.
109 static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
110   unsigned Opc0 = N0->getMachineOpcode();
111   unsigned Opc1 = N1->getMachineOpcode();
112 
113   int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
114   int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
115 
116   if (Op0Idx == -1 && Op1Idx == -1)
117     return true;

  if ((Op0Idx == -1 && Op1Idx != -1) ||
      (Op1Idx == -1 && Op0Idx != -1))
    return false;
123 
124   // getNamedOperandIdx returns the index for the MachineInstr's operands,
125   // which includes the result as the first operand. We are indexing into the
126   // MachineSDNode's operands, so we need to skip the result operand to get
127   // the real index.
128   --Op0Idx;
129   --Op1Idx;
130 
131   return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
132 }
133 
134 bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
135                                                     AliasAnalysis *AA) const {
  // TODO: The generic check fails for VALU instructions that should be
  // rematerializable due to implicit reads of exec. We really want all of the
  // generic logic for this, minus the check that trips on the implicit exec read.
139   switch (MI.getOpcode()) {
140   case AMDGPU::V_MOV_B32_e32:
141   case AMDGPU::V_MOV_B32_e64:
142   case AMDGPU::V_MOV_B64_PSEUDO:
143     return true;
144   default:
145     return false;
146   }
147 }
148 
149 bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
150                                           int64_t &Offset0,
151                                           int64_t &Offset1) const {
152   if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
153     return false;
154 
155   unsigned Opc0 = Load0->getMachineOpcode();
156   unsigned Opc1 = Load1->getMachineOpcode();
157 
158   // Make sure both are actually loads.
159   if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
160     return false;
161 
162   if (isDS(Opc0) && isDS(Opc1)) {
163 
    // FIXME: Handle the case where the operand counts differ.
165     if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
166       return false;
167 
168     // Check base reg.
169     if (Load0->getOperand(1) != Load1->getOperand(1))
170       return false;
171 
172     // Check chain.
173     if (findChainOperand(Load0) != findChainOperand(Load1))
174       return false;
175 
176     // Skip read2 / write2 variants for simplicity.
    // TODO: We should report true if the used offsets are adjacent (excluding
    // the st64 versions).
179     if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::data1) != -1 ||
180         AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::data1) != -1)
181       return false;
182 
183     Offset0 = cast<ConstantSDNode>(Load0->getOperand(2))->getZExtValue();
184     Offset1 = cast<ConstantSDNode>(Load1->getOperand(2))->getZExtValue();
185     return true;
186   }
187 
188   if (isSMRD(Opc0) && isSMRD(Opc1)) {
189     // Skip time and cache invalidation instructions.
190     if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::sbase) == -1 ||
191         AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::sbase) == -1)
192       return false;
193 
194     assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1));
195 
196     // Check base reg.
197     if (Load0->getOperand(0) != Load1->getOperand(0))
198       return false;
199 
200     const ConstantSDNode *Load0Offset =
201         dyn_cast<ConstantSDNode>(Load0->getOperand(1));
202     const ConstantSDNode *Load1Offset =
203         dyn_cast<ConstantSDNode>(Load1->getOperand(1));
204 
205     if (!Load0Offset || !Load1Offset)
206       return false;
207 
208     // Check chain.
209     if (findChainOperand(Load0) != findChainOperand(Load1))
210       return false;
211 
212     Offset0 = Load0Offset->getZExtValue();
213     Offset1 = Load1Offset->getZExtValue();
214     return true;
215   }
216 
217   // MUBUF and MTBUF can access the same addresses.
218   if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
219 
220     // MUBUF and MTBUF have vaddr at different indices.
221     if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
222         findChainOperand(Load0) != findChainOperand(Load1) ||
223         !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
224         !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
225       return false;
226 
227     int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
228     int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
229 
230     if (OffIdx0 == -1 || OffIdx1 == -1)
231       return false;
232 
233     // getNamedOperandIdx returns the index for MachineInstrs.  Since they
    // include the output in the operand list, but SDNodes don't, we need to
    // decrement the index by one.
236     --OffIdx0;
237     --OffIdx1;
238 
239     SDValue Off0 = Load0->getOperand(OffIdx0);
240     SDValue Off1 = Load1->getOperand(OffIdx1);
241 
242     // The offset might be a FrameIndexSDNode.
243     if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
244       return false;
245 
246     Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue();
247     Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue();
248     return true;
249   }
250 
251   return false;
252 }
253 
254 static bool isStride64(unsigned Opc) {
255   switch (Opc) {
256   case AMDGPU::DS_READ2ST64_B32:
257   case AMDGPU::DS_READ2ST64_B64:
258   case AMDGPU::DS_WRITE2ST64_B32:
259   case AMDGPU::DS_WRITE2ST64_B64:
260     return true;
261   default:
262     return false;
263   }
264 }
265 
266 bool SIInstrInfo::getMemOperandWithOffset(MachineInstr &LdSt,
267                                           MachineOperand *&BaseOp,
268                                           int64_t &Offset,
269                                           const TargetRegisterInfo *TRI) const {
270   unsigned Opc = LdSt.getOpcode();
271 
272   if (isDS(LdSt)) {
273     const MachineOperand *OffsetImm =
274         getNamedOperand(LdSt, AMDGPU::OpName::offset);
275     if (OffsetImm) {
276       // Normal, single offset LDS instruction.
277       BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
278       // TODO: ds_consume/ds_append use M0 for the base address. Is it safe to
279       // report that here?
280       if (!BaseOp)
281         return false;
282 
283       Offset = OffsetImm->getImm();
284       assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base "
285                                 "operands of type register.");
286       return true;
287     }
288 
289     // The 2 offset instructions use offset0 and offset1 instead. We can treat
290     // these as a load with a single offset if the 2 offsets are consecutive. We
291     // will use this for some partially aligned loads.
292     const MachineOperand *Offset0Imm =
293         getNamedOperand(LdSt, AMDGPU::OpName::offset0);
294     const MachineOperand *Offset1Imm =
295         getNamedOperand(LdSt, AMDGPU::OpName::offset1);
296 
297     uint8_t Offset0 = Offset0Imm->getImm();
298     uint8_t Offset1 = Offset1Imm->getImm();
299 
300     if (Offset1 > Offset0 && Offset1 - Offset0 == 1) {
      // Each of these offsets is in element-sized units, so we need to convert
      // them to bytes for the individual accesses.
303 
304       unsigned EltSize;
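      // For loads, operand 0 is the destination of a read2, which holds both
      // elements, so the per-element size in bytes is regsize / 8 / 2, i.e.
      // regsize / 16. For stores, data0 holds a single element, so divide by 8.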
305       if (LdSt.mayLoad())
306         EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
307       else {
308         assert(LdSt.mayStore());
309         int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
310         EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
311       }
312 
313       if (isStride64(Opc))
314         EltSize *= 64;
315 
316       BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
317       Offset = EltSize * Offset0;
318       assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base "
319                                 "operands of type register.");
320       return true;
321     }
322 
323     return false;
324   }
325 
326   if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
327     const MachineOperand *SOffset = getNamedOperand(LdSt, AMDGPU::OpName::soffset);
328     if (SOffset && SOffset->isReg())
329       return false;
330 
331     MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
332     if (!AddrReg)
333       return false;
334 
335     const MachineOperand *OffsetImm =
336         getNamedOperand(LdSt, AMDGPU::OpName::offset);
337     BaseOp = AddrReg;
338     Offset = OffsetImm->getImm();
339 
340     if (SOffset) // soffset can be an inline immediate.
341       Offset += SOffset->getImm();
342 
343     assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base "
344                               "operands of type register.");
345     return true;
346   }
347 
348   if (isSMRD(LdSt)) {
349     const MachineOperand *OffsetImm =
350         getNamedOperand(LdSt, AMDGPU::OpName::offset);
351     if (!OffsetImm)
352       return false;
353 
354     MachineOperand *SBaseReg = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
355     BaseOp = SBaseReg;
356     Offset = OffsetImm->getImm();
357     assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base "
358                               "operands of type register.");
359     return true;
360   }
361 
362   if (isFLAT(LdSt)) {
363     MachineOperand *VAddr = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
364     if (VAddr) {
365       // Can't analyze 2 offsets.
366       if (getNamedOperand(LdSt, AMDGPU::OpName::saddr))
367         return false;
368 
369       BaseOp = VAddr;
370     } else {
371       // scratch instructions have either vaddr or saddr.
372       BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
373     }
374 
375     Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
376     assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base "
377                               "operands of type register.");
378     return true;
379   }
380 
381   return false;
382 }
383 
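/// Returns true if the two memory operations can be proven to use the same
/// base pointer: either their base operands are identical registers, or both
/// instructions have a single memory operand rooted at the same underlying
/// IR object.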
384 static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
385                                   const MachineOperand &BaseOp1,
386                                   const MachineInstr &MI2,
387                                   const MachineOperand &BaseOp2) {
388   // Support only base operands with base registers.
389   // Note: this could be extended to support FI operands.
390   if (!BaseOp1.isReg() || !BaseOp2.isReg())
391     return false;
392 
393   if (BaseOp1.isIdenticalTo(BaseOp2))
394     return true;
395 
396   if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
397     return false;
398 
399   auto MO1 = *MI1.memoperands_begin();
400   auto MO2 = *MI2.memoperands_begin();
401   if (MO1->getAddrSpace() != MO2->getAddrSpace())
402     return false;
403 
404   auto Base1 = MO1->getValue();
405   auto Base2 = MO2->getValue();
406   if (!Base1 || !Base2)
407     return false;
408   const MachineFunction &MF = *MI1.getParent()->getParent();
409   const DataLayout &DL = MF.getFunction().getParent()->getDataLayout();
410   Base1 = GetUnderlyingObject(Base1, DL);
  Base2 = GetUnderlyingObject(Base2, DL);
412 
413   if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
414     return false;
415 
416   return Base1 == Base2;
417 }
418 
419 bool SIInstrInfo::shouldClusterMemOps(MachineOperand &BaseOp1,
420                                       MachineOperand &BaseOp2,
421                                       unsigned NumLoads) const {
422   MachineInstr &FirstLdSt = *BaseOp1.getParent();
423   MachineInstr &SecondLdSt = *BaseOp2.getParent();
424 
425   if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOp1, SecondLdSt, BaseOp2))
426     return false;
427 
428   const MachineOperand *FirstDst = nullptr;
429   const MachineOperand *SecondDst = nullptr;
430 
431   if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) ||
432       (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt)) ||
433       (isFLAT(FirstLdSt) && isFLAT(SecondLdSt))) {
434     const unsigned MaxGlobalLoadCluster = 6;
435     if (NumLoads > MaxGlobalLoadCluster)
436       return false;
437 
438     FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata);
439     if (!FirstDst)
440       FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
441     SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata);
442     if (!SecondDst)
443       SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
444   } else if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) {
445     FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst);
446     SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst);
447   } else if (isDS(FirstLdSt) && isDS(SecondLdSt)) {
448     FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
449     SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
450   }
451 
452   if (!FirstDst || !SecondDst)
453     return false;
454 
455   // Try to limit clustering based on the total number of bytes loaded
456   // rather than the number of instructions.  This is done to help reduce
457   // register pressure.  The method used is somewhat inexact, though,
458   // because it assumes that all loads in the cluster will load the
459   // same number of bytes as FirstLdSt.
460 
461   // The unit of this value is bytes.
462   // FIXME: This needs finer tuning.
463   unsigned LoadClusterThreshold = 16;
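  // For example, with a 16 byte threshold up to four 32-bit loads (4 x 4
  // bytes) may be clustered, while a pair of 128-bit loads (2 x 16 bytes)
  // already exceeds it.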
464 
465   const MachineRegisterInfo &MRI =
466       FirstLdSt.getParent()->getParent()->getRegInfo();
467   const TargetRegisterClass *DstRC = MRI.getRegClass(FirstDst->getReg());
468 
469   return (NumLoads * (RI.getRegSizeInBits(*DstRC) / 8)) <= LoadClusterThreshold;
470 }
471 
// FIXME: This behaves strangely. If, for example, you have 32 loads + stores,
// the first 16 loads will be interleaved with the stores, and the next 16 will
// be clustered as expected. It should really split into two batches of 16.
//
// Loads are clustered until this returns false, rather than trying to schedule
// groups of stores. This also means we have to decide whether loads from
// different address spaces should be clustered, and whether loads that might
// cause bank conflicts should be.
//
// This might be deprecated, so it might not be worth much effort to fix.
482 bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
483                                           int64_t Offset0, int64_t Offset1,
484                                           unsigned NumLoads) const {
485   assert(Offset1 > Offset0 &&
486          "Second offset should be larger than first offset!");
  // If we have at most 16 loads in a row, and the offsets are within 64
  // bytes, then schedule together.
489 
490   // A cacheline is 64 bytes (for global memory).
491   return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
492 }
493 
494 static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
495                               MachineBasicBlock::iterator MI,
496                               const DebugLoc &DL, unsigned DestReg,
497                               unsigned SrcReg, bool KillSrc) {
498   MachineFunction *MF = MBB.getParent();
499   DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(),
500                                         "illegal SGPR to VGPR copy",
501                                         DL, DS_Error);
502   LLVMContext &C = MF->getFunction().getContext();
503   C.diagnose(IllegalCopy);
504 
505   BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
506     .addReg(SrcReg, getKillRegState(KillSrc));
507 }
508 
509 void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
510                               MachineBasicBlock::iterator MI,
511                               const DebugLoc &DL, unsigned DestReg,
512                               unsigned SrcReg, bool KillSrc) const {
513   const TargetRegisterClass *RC = RI.getPhysRegClass(DestReg);
514 
515   if (RC == &AMDGPU::VGPR_32RegClass) {
516     assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
517            AMDGPU::SReg_32RegClass.contains(SrcReg));
518     BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
519       .addReg(SrcReg, getKillRegState(KillSrc));
520     return;
521   }
522 
523   if (RC == &AMDGPU::SReg_32_XM0RegClass ||
524       RC == &AMDGPU::SReg_32RegClass) {
525     if (SrcReg == AMDGPU::SCC) {
526       BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
527           .addImm(-1)
528           .addImm(0);
529       return;
530     }
531 
532     if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
533       reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
534       return;
535     }
536 
537     BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
538             .addReg(SrcReg, getKillRegState(KillSrc));
539     return;
540   }
541 
542   if (RC == &AMDGPU::SReg_64RegClass) {
543     if (DestReg == AMDGPU::VCC) {
544       if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
545         BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
546           .addReg(SrcReg, getKillRegState(KillSrc));
547       } else {
548         // FIXME: Hack until VReg_1 removed.
549         assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
550         BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
551           .addImm(0)
552           .addReg(SrcReg, getKillRegState(KillSrc));
553       }
554 
555       return;
556     }
557 
558     if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
559       reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
560       return;
561     }
562 
563     BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
564             .addReg(SrcReg, getKillRegState(KillSrc));
565     return;
566   }
567 
568   if (DestReg == AMDGPU::SCC) {
569     assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
570     BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
571       .addReg(SrcReg, getKillRegState(KillSrc))
572       .addImm(0);
573     return;
574   }
575 
576   unsigned EltSize = 4;
577   unsigned Opcode = AMDGPU::V_MOV_B32_e32;
578   if (RI.isSGPRClass(RC)) {
579     if (RI.getRegSizeInBits(*RC) > 32) {
580       Opcode =  AMDGPU::S_MOV_B64;
581       EltSize = 8;
582     } else {
583       Opcode = AMDGPU::S_MOV_B32;
584       EltSize = 4;
585     }
586 
587     if (!RI.isSGPRClass(RI.getPhysRegClass(SrcReg))) {
588       reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
589       return;
590     }
591   }
592 
593   ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
594   bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
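  // Copy the sub-registers in an order that avoids clobbering a source
  // sub-register before it has been read when the source and destination
  // register tuples overlap.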
595 
596   for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
597     unsigned SubIdx;
598     if (Forward)
599       SubIdx = SubIndices[Idx];
600     else
601       SubIdx = SubIndices[SubIndices.size() - Idx - 1];
602 
603     MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
604       get(Opcode), RI.getSubReg(DestReg, SubIdx));
605 
606     Builder.addReg(RI.getSubReg(SrcReg, SubIdx));
607 
608     if (Idx == 0)
609       Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
610 
611     bool UseKill = KillSrc && Idx == SubIndices.size() - 1;
612     Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
613   }
614 }
615 
616 int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
617   int NewOpc;
618 
619   // Try to map original to commuted opcode
620   NewOpc = AMDGPU::getCommuteRev(Opcode);
621   if (NewOpc != -1)
622     // Check if the commuted (REV) opcode exists on the target.
623     return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
624 
625   // Try to map commuted to original opcode
626   NewOpc = AMDGPU::getCommuteOrig(Opcode);
627   if (NewOpc != -1)
628     // Check if the original (non-REV) opcode exists on the target.
629     return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
630 
631   return Opcode;
632 }
633 
634 void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB,
635                                        MachineBasicBlock::iterator MI,
636                                        const DebugLoc &DL, unsigned DestReg,
637                                        int64_t Value) const {
638   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
639   const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg);
640   if (RegClass == &AMDGPU::SReg_32RegClass ||
641       RegClass == &AMDGPU::SGPR_32RegClass ||
642       RegClass == &AMDGPU::SReg_32_XM0RegClass ||
643       RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) {
644     BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
645       .addImm(Value);
646     return;
647   }
648 
649   if (RegClass == &AMDGPU::SReg_64RegClass ||
650       RegClass == &AMDGPU::SGPR_64RegClass ||
651       RegClass == &AMDGPU::SReg_64_XEXECRegClass) {
652     BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
653       .addImm(Value);
654     return;
655   }
656 
657   if (RegClass == &AMDGPU::VGPR_32RegClass) {
658     BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
659       .addImm(Value);
660     return;
661   }
662   if (RegClass == &AMDGPU::VReg_64RegClass) {
663     BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg)
664       .addImm(Value);
665     return;
666   }
667 
668   unsigned EltSize = 4;
669   unsigned Opcode = AMDGPU::V_MOV_B32_e32;
670   if (RI.isSGPRClass(RegClass)) {
671     if (RI.getRegSizeInBits(*RegClass) > 32) {
672       Opcode =  AMDGPU::S_MOV_B64;
673       EltSize = 8;
674     } else {
675       Opcode = AMDGPU::S_MOV_B32;
676       EltSize = 4;
677     }
678   }
679 
680   ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize);
681   for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
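    // Only the lowest sub-register receives the immediate; the higher
    // sub-registers are zeroed.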
682     int64_t IdxValue = Idx == 0 ? Value : 0;
683 
684     MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
      get(Opcode), RI.getSubReg(DestReg, SubIndices[Idx]));
686     Builder.addImm(IdxValue);
687   }
688 }
689 
690 const TargetRegisterClass *
691 SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
692   return &AMDGPU::VGPR_32RegClass;
693 }
694 
695 void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
696                                      MachineBasicBlock::iterator I,
697                                      const DebugLoc &DL, unsigned DstReg,
698                                      ArrayRef<MachineOperand> Cond,
699                                      unsigned TrueReg,
700                                      unsigned FalseReg) const {
701   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
702   assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
703          "Not a VGPR32 reg");
704 
705   if (Cond.size() == 1) {
706     unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
707     BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
708       .add(Cond[0]);
709     BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
710       .addReg(FalseReg)
711       .addReg(TrueReg)
712       .addReg(SReg);
713   } else if (Cond.size() == 2) {
714     assert(Cond[0].isImm() && "Cond[0] is not an immediate");
715     switch (Cond[0].getImm()) {
716     case SIInstrInfo::SCC_TRUE: {
717       unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
718       BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
719         .addImm(-1)
720         .addImm(0);
721       BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
722         .addReg(FalseReg)
723         .addReg(TrueReg)
724         .addReg(SReg);
725       break;
726     }
727     case SIInstrInfo::SCC_FALSE: {
728       unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
729       BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
730         .addImm(0)
731         .addImm(-1);
732       BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
733         .addReg(FalseReg)
734         .addReg(TrueReg)
735         .addReg(SReg);
736       break;
737     }
738     case SIInstrInfo::VCCNZ: {
739       MachineOperand RegOp = Cond[1];
740       RegOp.setImplicit(false);
741       unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
742       BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
743         .add(RegOp);
744       BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
745           .addReg(FalseReg)
746           .addReg(TrueReg)
747           .addReg(SReg);
748       break;
749     }
750     case SIInstrInfo::VCCZ: {
751       MachineOperand RegOp = Cond[1];
752       RegOp.setImplicit(false);
753       unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
754       BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
755         .add(RegOp);
756       BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
757           .addReg(TrueReg)
758           .addReg(FalseReg)
759           .addReg(SReg);
760       break;
761     }
762     case SIInstrInfo::EXECNZ: {
763       unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
764       unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
765       BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
766         .addImm(0);
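      // S_OR_SAVEEXEC_B64 with 0 saves EXEC into SReg2 without changing it and
      // sets SCC to whether EXEC is non-zero, which the S_CSELECT below
      // consumes.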
767       BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
768         .addImm(-1)
769         .addImm(0);
770       BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
771         .addReg(FalseReg)
772         .addReg(TrueReg)
773         .addReg(SReg);
774       break;
775     }
776     case SIInstrInfo::EXECZ: {
777       unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
778       unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
779       BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
780         .addImm(0);
781       BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
782         .addImm(0)
783         .addImm(-1);
784       BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
785         .addReg(FalseReg)
786         .addReg(TrueReg)
787         .addReg(SReg);
788       llvm_unreachable("Unhandled branch predicate EXECZ");
789       break;
790     }
791     default:
792       llvm_unreachable("invalid branch predicate");
793     }
794   } else {
795     llvm_unreachable("Can only handle Cond size 1 or 2");
796   }
797 }
798 
799 unsigned SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
800                                MachineBasicBlock::iterator I,
801                                const DebugLoc &DL,
802                                unsigned SrcReg, int Value) const {
803   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
804   unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
805   BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
806     .addImm(Value)
807     .addReg(SrcReg);
808 
809   return Reg;
810 }
811 
812 unsigned SIInstrInfo::insertNE(MachineBasicBlock *MBB,
813                                MachineBasicBlock::iterator I,
814                                const DebugLoc &DL,
815                                unsigned SrcReg, int Value) const {
816   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
817   unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
818   BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
819     .addImm(Value)
820     .addReg(SrcReg);
821 
822   return Reg;
823 }
824 
825 unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
826 
827   if (RI.getRegSizeInBits(*DstRC) == 32) {
828     return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
829   } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) {
830     return AMDGPU::S_MOV_B64;
831   } else if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC)) {
832     return  AMDGPU::V_MOV_B64_PSEUDO;
833   }
834   return AMDGPU::COPY;
835 }
836 
837 static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
838   switch (Size) {
839   case 4:
840     return AMDGPU::SI_SPILL_S32_SAVE;
841   case 8:
842     return AMDGPU::SI_SPILL_S64_SAVE;
843   case 16:
844     return AMDGPU::SI_SPILL_S128_SAVE;
845   case 32:
846     return AMDGPU::SI_SPILL_S256_SAVE;
847   case 64:
848     return AMDGPU::SI_SPILL_S512_SAVE;
849   default:
850     llvm_unreachable("unknown register size");
851   }
852 }
853 
854 static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
855   switch (Size) {
856   case 4:
857     return AMDGPU::SI_SPILL_V32_SAVE;
858   case 8:
859     return AMDGPU::SI_SPILL_V64_SAVE;
860   case 12:
861     return AMDGPU::SI_SPILL_V96_SAVE;
862   case 16:
863     return AMDGPU::SI_SPILL_V128_SAVE;
864   case 32:
865     return AMDGPU::SI_SPILL_V256_SAVE;
866   case 64:
867     return AMDGPU::SI_SPILL_V512_SAVE;
868   default:
869     llvm_unreachable("unknown register size");
870   }
871 }
872 
873 void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
874                                       MachineBasicBlock::iterator MI,
875                                       unsigned SrcReg, bool isKill,
876                                       int FrameIndex,
877                                       const TargetRegisterClass *RC,
878                                       const TargetRegisterInfo *TRI) const {
879   MachineFunction *MF = MBB.getParent();
880   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
881   MachineFrameInfo &FrameInfo = MF->getFrameInfo();
882   const DebugLoc &DL = MBB.findDebugLoc(MI);
883 
884   unsigned Size = FrameInfo.getObjectSize(FrameIndex);
885   unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
886   MachinePointerInfo PtrInfo
887     = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
888   MachineMemOperand *MMO
889     = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
890                                Size, Align);
891   unsigned SpillSize = TRI->getSpillSize(*RC);
892 
893   if (RI.isSGPRClass(RC)) {
894     MFI->setHasSpilledSGPRs();
895 
896     // We are only allowed to create one new instruction when spilling
897     // registers, so we need to use pseudo instruction for spilling SGPRs.
898     const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
899 
    // The SGPR spill/restore instructions only work on numbered SGPRs, so we
    // need to make sure we are using the correct register class.
902     if (TargetRegisterInfo::isVirtualRegister(SrcReg) && SpillSize == 4) {
903       MachineRegisterInfo &MRI = MF->getRegInfo();
904       MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass);
905     }
906 
907     MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc)
908       .addReg(SrcReg, getKillRegState(isKill)) // data
909       .addFrameIndex(FrameIndex)               // addr
910       .addMemOperand(MMO)
911       .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
912       .addReg(MFI->getFrameOffsetReg(), RegState::Implicit);
913     // Add the scratch resource registers as implicit uses because we may end up
914     // needing them, and need to ensure that the reserved registers are
915     // correctly handled.
916 
917     FrameInfo.setStackID(FrameIndex, SIStackID::SGPR_SPILL);
918     if (ST.hasScalarStores()) {
919       // m0 is used for offset to scalar stores if used to spill.
920       Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead);
921     }
922 
923     return;
924   }
925 
926   assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
927 
928   unsigned Opcode = getVGPRSpillSaveOpcode(SpillSize);
929   MFI->setHasSpilledVGPRs();
930   BuildMI(MBB, MI, DL, get(Opcode))
931     .addReg(SrcReg, getKillRegState(isKill)) // data
932     .addFrameIndex(FrameIndex)               // addr
933     .addReg(MFI->getScratchRSrcReg())        // scratch_rsrc
934     .addReg(MFI->getFrameOffsetReg())        // scratch_offset
935     .addImm(0)                               // offset
936     .addMemOperand(MMO);
937 }
938 
939 static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
940   switch (Size) {
941   case 4:
942     return AMDGPU::SI_SPILL_S32_RESTORE;
943   case 8:
944     return AMDGPU::SI_SPILL_S64_RESTORE;
945   case 16:
946     return AMDGPU::SI_SPILL_S128_RESTORE;
947   case 32:
948     return AMDGPU::SI_SPILL_S256_RESTORE;
949   case 64:
950     return AMDGPU::SI_SPILL_S512_RESTORE;
951   default:
952     llvm_unreachable("unknown register size");
953   }
954 }
955 
956 static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
957   switch (Size) {
958   case 4:
959     return AMDGPU::SI_SPILL_V32_RESTORE;
960   case 8:
961     return AMDGPU::SI_SPILL_V64_RESTORE;
962   case 12:
963     return AMDGPU::SI_SPILL_V96_RESTORE;
964   case 16:
965     return AMDGPU::SI_SPILL_V128_RESTORE;
966   case 32:
967     return AMDGPU::SI_SPILL_V256_RESTORE;
968   case 64:
969     return AMDGPU::SI_SPILL_V512_RESTORE;
970   default:
971     llvm_unreachable("unknown register size");
972   }
973 }
974 
975 void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
976                                        MachineBasicBlock::iterator MI,
977                                        unsigned DestReg, int FrameIndex,
978                                        const TargetRegisterClass *RC,
979                                        const TargetRegisterInfo *TRI) const {
980   MachineFunction *MF = MBB.getParent();
981   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
982   MachineFrameInfo &FrameInfo = MF->getFrameInfo();
983   const DebugLoc &DL = MBB.findDebugLoc(MI);
984   unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
985   unsigned Size = FrameInfo.getObjectSize(FrameIndex);
986   unsigned SpillSize = TRI->getSpillSize(*RC);
987 
988   MachinePointerInfo PtrInfo
989     = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
990 
991   MachineMemOperand *MMO = MF->getMachineMemOperand(
992     PtrInfo, MachineMemOperand::MOLoad, Size, Align);
993 
994   if (RI.isSGPRClass(RC)) {
995     MFI->setHasSpilledSGPRs();
996 
997     // FIXME: Maybe this should not include a memoperand because it will be
998     // lowered to non-memory instructions.
999     const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
1000     if (TargetRegisterInfo::isVirtualRegister(DestReg) && SpillSize == 4) {
1001       MachineRegisterInfo &MRI = MF->getRegInfo();
1002       MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass);
1003     }
1004 
1005     FrameInfo.setStackID(FrameIndex, SIStackID::SGPR_SPILL);
1006     MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc, DestReg)
1007       .addFrameIndex(FrameIndex) // addr
1008       .addMemOperand(MMO)
1009       .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
1010       .addReg(MFI->getFrameOffsetReg(), RegState::Implicit);
1011 
1012     if (ST.hasScalarStores()) {
1013       // m0 is used for offset to scalar stores if used to spill.
1014       Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead);
1015     }
1016 
1017     return;
1018   }
1019 
1020   assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
1021 
1022   unsigned Opcode = getVGPRSpillRestoreOpcode(SpillSize);
1023   BuildMI(MBB, MI, DL, get(Opcode), DestReg)
1024     .addFrameIndex(FrameIndex)        // vaddr
1025     .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
1026     .addReg(MFI->getFrameOffsetReg()) // scratch_offset
1027     .addImm(0)                        // offset
1028     .addMemOperand(MMO);
1029 }
1030 
/// \param FrameOffset Offset in bytes of the FrameIndex being spilled.
1032 unsigned SIInstrInfo::calculateLDSSpillAddress(
1033     MachineBasicBlock &MBB, MachineInstr &MI, RegScavenger *RS, unsigned TmpReg,
1034     unsigned FrameOffset, unsigned Size) const {
1035   MachineFunction *MF = MBB.getParent();
1036   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1037   const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
1038   const DebugLoc &DL = MBB.findDebugLoc(MI);
1039   unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize();
1040   unsigned WavefrontSize = ST.getWavefrontSize();
1041 
1042   unsigned TIDReg = MFI->getTIDReg();
1043   if (!MFI->hasCalculatedTID()) {
1044     MachineBasicBlock &Entry = MBB.getParent()->front();
1045     MachineBasicBlock::iterator Insert = Entry.front();
1046     const DebugLoc &DL = Insert->getDebugLoc();
1047 
1048     TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass,
1049                                    *MF);
1050     if (TIDReg == AMDGPU::NoRegister)
1051       return TIDReg;
1052 
1053     if (!AMDGPU::isShader(MF->getFunction().getCallingConv()) &&
1054         WorkGroupSize > WavefrontSize) {
1055       unsigned TIDIGXReg
1056         = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
1057       unsigned TIDIGYReg
1058         = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
1059       unsigned TIDIGZReg
1060         = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
1061       unsigned InputPtrReg =
1062           MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
1063       for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) {
1064         if (!Entry.isLiveIn(Reg))
1065           Entry.addLiveIn(Reg);
1066       }
1067 
1068       RS->enterBasicBlock(Entry);
1069       // FIXME: Can we scavenge an SReg_64 and access the subregs?
1070       unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
1071       unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
1072       BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0)
1073               .addReg(InputPtrReg)
1074               .addImm(SI::KernelInputOffsets::NGROUPS_Z);
1075       BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1)
1076               .addReg(InputPtrReg)
1077               .addImm(SI::KernelInputOffsets::NGROUPS_Y);
1078 
1079       // NGROUPS.X * NGROUPS.Y
1080       BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1)
1081               .addReg(STmp1)
1082               .addReg(STmp0);
1083       // (NGROUPS.X * NGROUPS.Y) * TIDIG.X
1084       BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg)
1085               .addReg(STmp1)
1086               .addReg(TIDIGXReg);
      // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)
1088       BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg)
1089               .addReg(STmp0)
1090               .addReg(TIDIGYReg)
1091               .addReg(TIDReg);
      // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)) + TIDIG.Z
1093       getAddNoCarry(Entry, Insert, DL, TIDReg)
1094         .addReg(TIDReg)
1095         .addReg(TIDIGZReg);
1096     } else {
      // Get the lane id within the wave.
1098       BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
1099               TIDReg)
1100               .addImm(-1)
1101               .addImm(0);
1102 
1103       BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
1104               TIDReg)
1105               .addImm(-1)
1106               .addReg(TIDReg);
1107     }
1108 
1109     BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32),
1110             TIDReg)
1111             .addImm(2)
1112             .addReg(TIDReg);
1113     MFI->setTIDReg(TIDReg);
1114   }
1115 
1116   // Add FrameIndex to LDS offset
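  // The resulting address is LDSSize + FrameOffset * WorkGroupSize + TIDReg,
  // where TIDReg was already scaled by 4 above, so each workitem gets its own
  // copy of the spilled dword past the statically allocated LDS.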
1117   unsigned LDSOffset = MFI->getLDSSize() + (FrameOffset * WorkGroupSize);
1118   getAddNoCarry(MBB, MI, DL, TmpReg)
1119     .addImm(LDSOffset)
1120     .addReg(TIDReg);
1121 
1122   return TmpReg;
1123 }
1124 
1125 void SIInstrInfo::insertWaitStates(MachineBasicBlock &MBB,
1126                                    MachineBasicBlock::iterator MI,
1127                                    int Count) const {
1128   DebugLoc DL = MBB.findDebugLoc(MI);
1129   while (Count > 0) {
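    // S_NOP inserts (imm + 1) wait states and the immediate only encodes 0-7,
    // so emit maximal 8-state nops until the remainder fits in a single one.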
1130     int Arg;
1131     if (Count >= 8)
1132       Arg = 7;
1133     else
1134       Arg = Count - 1;
1135     Count -= 8;
1136     BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP))
1137             .addImm(Arg);
1138   }
1139 }
1140 
1141 void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
1142                              MachineBasicBlock::iterator MI) const {
1143   insertWaitStates(MBB, MI, 1);
1144 }
1145 
1146 void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
1147   auto MF = MBB.getParent();
1148   SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
1149 
1150   assert(Info->isEntryFunction());
1151 
1152   if (MBB.succ_empty()) {
1153     bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
1154     if (HasNoTerminator)
1155       BuildMI(MBB, MBB.end(), DebugLoc(),
1156               get(Info->returnsVoid() ? AMDGPU::S_ENDPGM : AMDGPU::SI_RETURN_TO_EPILOG));
1157   }
1158 }
1159 
1160 unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
1161   switch (MI.getOpcode()) {
1162   default: return 1; // FIXME: Do wait states equal cycles?
1163 
1164   case AMDGPU::S_NOP:
1165     return MI.getOperand(0).getImm() + 1;
1166   }
1167 }
1168 
1169 bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
1170   MachineBasicBlock &MBB = *MI.getParent();
1171   DebugLoc DL = MBB.findDebugLoc(MI);
1172   switch (MI.getOpcode()) {
1173   default: return TargetInstrInfo::expandPostRAPseudo(MI);
1174   case AMDGPU::S_MOV_B64_term:
1175     // This is only a terminator to get the correct spill code placement during
1176     // register allocation.
1177     MI.setDesc(get(AMDGPU::S_MOV_B64));
1178     break;
1179 
1180   case AMDGPU::S_XOR_B64_term:
1181     // This is only a terminator to get the correct spill code placement during
1182     // register allocation.
1183     MI.setDesc(get(AMDGPU::S_XOR_B64));
1184     break;
1185 
1186   case AMDGPU::S_ANDN2_B64_term:
1187     // This is only a terminator to get the correct spill code placement during
1188     // register allocation.
1189     MI.setDesc(get(AMDGPU::S_ANDN2_B64));
1190     break;
1191 
1192   case AMDGPU::V_MOV_B64_PSEUDO: {
1193     unsigned Dst = MI.getOperand(0).getReg();
1194     unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
1195     unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
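    // The 64-bit move is split into two 32-bit moves; each half also implicitly
    // defines the full register so the liveness of Dst is preserved.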
1196 
1197     const MachineOperand &SrcOp = MI.getOperand(1);
1198     // FIXME: Will this work for 64-bit floating point immediates?
1199     assert(!SrcOp.isFPImm());
1200     if (SrcOp.isImm()) {
1201       APInt Imm(64, SrcOp.getImm());
1202       BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
1203         .addImm(Imm.getLoBits(32).getZExtValue())
1204         .addReg(Dst, RegState::Implicit | RegState::Define);
1205       BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
1206         .addImm(Imm.getHiBits(32).getZExtValue())
1207         .addReg(Dst, RegState::Implicit | RegState::Define);
1208     } else {
1209       assert(SrcOp.isReg());
1210       BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
1211         .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
1212         .addReg(Dst, RegState::Implicit | RegState::Define);
1213       BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
1214         .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
1215         .addReg(Dst, RegState::Implicit | RegState::Define);
1216     }
1217     MI.eraseFromParent();
1218     break;
1219   }
1220   case AMDGPU::V_SET_INACTIVE_B32: {
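    // Invert EXEC so only the currently inactive lanes are enabled, write the
    // requested value into them, then restore EXEC.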
1221     BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
1222       .addReg(AMDGPU::EXEC);
1223     BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
1224       .add(MI.getOperand(2));
1225     BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
1226       .addReg(AMDGPU::EXEC);
1227     MI.eraseFromParent();
1228     break;
1229   }
1230   case AMDGPU::V_SET_INACTIVE_B64: {
1231     BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
1232       .addReg(AMDGPU::EXEC);
1233     MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
1234                                  MI.getOperand(0).getReg())
1235       .add(MI.getOperand(2));
1236     expandPostRAPseudo(*Copy);
1237     BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
1238       .addReg(AMDGPU::EXEC);
1239     MI.eraseFromParent();
1240     break;
1241   }
1242   case AMDGPU::V_MOVRELD_B32_V1:
1243   case AMDGPU::V_MOVRELD_B32_V2:
1244   case AMDGPU::V_MOVRELD_B32_V4:
1245   case AMDGPU::V_MOVRELD_B32_V8:
1246   case AMDGPU::V_MOVRELD_B32_V16: {
1247     const MCInstrDesc &MovRelDesc = get(AMDGPU::V_MOVRELD_B32_e32);
1248     unsigned VecReg = MI.getOperand(0).getReg();
1249     bool IsUndef = MI.getOperand(1).isUndef();
1250     unsigned SubReg = AMDGPU::sub0 + MI.getOperand(3).getImm();
1251     assert(VecReg == MI.getOperand(1).getReg());
1252 
1253     MachineInstr *MovRel =
1254         BuildMI(MBB, MI, DL, MovRelDesc)
1255             .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
1256             .add(MI.getOperand(2))
1257             .addReg(VecReg, RegState::ImplicitDefine)
1258             .addReg(VecReg,
1259                     RegState::Implicit | (IsUndef ? RegState::Undef : 0));
1260 
1261     const int ImpDefIdx =
1262         MovRelDesc.getNumOperands() + MovRelDesc.getNumImplicitUses();
1263     const int ImpUseIdx = ImpDefIdx + 1;
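    // Tie the trailing implicit def and use of VecReg so the expansion is
    // treated as a read-modify-write of the whole vector register.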
1264     MovRel->tieOperands(ImpDefIdx, ImpUseIdx);
1265 
1266     MI.eraseFromParent();
1267     break;
1268   }
1269   case AMDGPU::SI_PC_ADD_REL_OFFSET: {
1270     MachineFunction &MF = *MBB.getParent();
1271     unsigned Reg = MI.getOperand(0).getReg();
1272     unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
1273     unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
1274 
1275     // Create a bundle so these instructions won't be re-ordered by the
1276     // post-RA scheduler.
1277     MIBundleBuilder Bundler(MBB, MI);
1278     Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
1279 
1280     // Add 32-bit offset from this instruction to the start of the
1281     // constant data.
1282     Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo)
1283                        .addReg(RegLo)
1284                        .add(MI.getOperand(1)));
1285 
1286     MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
1287                                   .addReg(RegHi);
1288     if (MI.getOperand(2).getTargetFlags() == SIInstrInfo::MO_NONE)
1289       MIB.addImm(0);
1290     else
1291       MIB.add(MI.getOperand(2));
1292 
1293     Bundler.append(MIB);
1294     finalizeBundle(MBB, Bundler.begin());
1295 
1296     MI.eraseFromParent();
1297     break;
1298   }
1299   case AMDGPU::EXIT_WWM: {
1300     // This only gets its own opcode so that SIFixWWMLiveness can tell when WWM
1301     // is exited.
1302     MI.setDesc(get(AMDGPU::S_MOV_B64));
1303     break;
1304   }
1305   case TargetOpcode::BUNDLE: {
1306     if (!MI.mayLoad())
1307       return false;
1308 
1309     // If it is a load it must be a memory clause
1310     for (MachineBasicBlock::instr_iterator I = MI.getIterator();
1311          I->isBundledWithSucc(); ++I) {
1312       I->unbundleFromSucc();
1313       for (MachineOperand &MO : I->operands())
1314         if (MO.isReg())
1315           MO.setIsInternalRead(false);
1316     }
1317 
1318     MI.eraseFromParent();
1319     break;
1320   }
1321   }
1322   return true;
1323 }
1324 
1325 bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,
1326                                       MachineOperand &Src0,
1327                                       unsigned Src0OpName,
1328                                       MachineOperand &Src1,
1329                                       unsigned Src1OpName) const {
1330   MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
1331   if (!Src0Mods)
1332     return false;
1333 
1334   MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
1335   assert(Src1Mods &&
1336          "All commutable instructions have both src0 and src1 modifiers");
1337 
1338   int Src0ModsVal = Src0Mods->getImm();
1339   int Src1ModsVal = Src1Mods->getImm();
1340 
1341   Src1Mods->setImm(Src0ModsVal);
1342   Src0Mods->setImm(Src1ModsVal);
1343   return true;
1344 }
1345 
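/// Swap the register operand \p RegOp with the immediate or frame-index
/// operand \p NonRegOp in place. Returns the rewritten instruction, or nullptr
/// if \p NonRegOp is neither an immediate nor a frame index.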
1346 static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
1347                                              MachineOperand &RegOp,
1348                                              MachineOperand &NonRegOp) {
1349   unsigned Reg = RegOp.getReg();
1350   unsigned SubReg = RegOp.getSubReg();
1351   bool IsKill = RegOp.isKill();
1352   bool IsDead = RegOp.isDead();
1353   bool IsUndef = RegOp.isUndef();
1354   bool IsDebug = RegOp.isDebug();
1355 
1356   if (NonRegOp.isImm())
1357     RegOp.ChangeToImmediate(NonRegOp.getImm());
1358   else if (NonRegOp.isFI())
1359     RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
1360   else
1361     return nullptr;
1362 
1363   NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
1364   NonRegOp.setSubReg(SubReg);
1365 
1366   return &MI;
1367 }
1368 
1369 MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
1370                                                   unsigned Src0Idx,
1371                                                   unsigned Src1Idx) const {
1372   assert(!NewMI && "this should never be used");
1373 
1374   unsigned Opc = MI.getOpcode();
1375   int CommutedOpcode = commuteOpcode(Opc);
1376   if (CommutedOpcode == -1)
1377     return nullptr;
1378 
1379   assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
1380            static_cast<int>(Src0Idx) &&
1381          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
1382            static_cast<int>(Src1Idx) &&
1383          "inconsistency with findCommutedOpIndices");
1384 
1385   MachineOperand &Src0 = MI.getOperand(Src0Idx);
1386   MachineOperand &Src1 = MI.getOperand(Src1Idx);
1387 
1388   MachineInstr *CommutedMI = nullptr;
1389   if (Src0.isReg() && Src1.isReg()) {
1390     if (isOperandLegal(MI, Src1Idx, &Src0)) {
1391       // Be sure to copy the source modifiers to the right place.
1392       CommutedMI
1393         = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
1394     }
1395 
1396   } else if (Src0.isReg() && !Src1.isReg()) {
1397     // src0 should always be able to support any operand type, so no need to
1398     // check operand legality.
1399     CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
1400   } else if (!Src0.isReg() && Src1.isReg()) {
1401     if (isOperandLegal(MI, Src1Idx, &Src0))
1402       CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
1403   } else {
    // FIXME: Found two non-register operands to commute. This does happen.
1405     return nullptr;
1406   }
1407 
1408   if (CommutedMI) {
1409     swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
1410                         Src1, AMDGPU::OpName::src1_modifiers);
1411 
1412     CommutedMI->setDesc(get(CommutedOpcode));
1413   }
1414 
1415   return CommutedMI;
1416 }
1417 
1418 // This needs to be implemented because the source modifiers may be inserted
1419 // between the true commutable operands, and the base
1420 // TargetInstrInfo::commuteInstruction uses it.
1421 bool SIInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx0,
1422                                         unsigned &SrcOpIdx1) const {
1423   return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
1424 }
1425 
1426 bool SIInstrInfo::findCommutedOpIndices(MCInstrDesc Desc, unsigned &SrcOpIdx0,
1427                                         unsigned &SrcOpIdx1) const {
1428   if (!Desc.isCommutable())
1429     return false;
1430 
1431   unsigned Opc = Desc.getOpcode();
1432   int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
1433   if (Src0Idx == -1)
1434     return false;
1435 
1436   int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
1437   if (Src1Idx == -1)
1438     return false;
1439 
1440   return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
1441 }
1442 
1443 bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
1444                                         int64_t BrOffset) const {
1445   // BranchRelaxation should never have to check s_setpc_b64 because its dest
1446   // block is unanalyzable.
1447   assert(BranchOp != AMDGPU::S_SETPC_B64);
1448 
1449   // Convert to dwords.
1450   BrOffset /= 4;
1451 
1452   // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
1453   // from the next instruction.
1454   BrOffset -= 1;
1455 
1456   return isIntN(BranchOffsetBits, BrOffset);
1457 }
1458 
1459 MachineBasicBlock *SIInstrInfo::getBranchDestBlock(
1460   const MachineInstr &MI) const {
1461   if (MI.getOpcode() == AMDGPU::S_SETPC_B64) {
    // Analyzing the destination would be difficult, but this branch is always
    // within range, so there's no need to analyze it.
1464     return nullptr;
1465   }
1466 
1467   return MI.getOperand(0).getMBB();
1468 }
1469 
1470 unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
1471                                            MachineBasicBlock &DestBB,
1472                                            const DebugLoc &DL,
1473                                            int64_t BrOffset,
1474                                            RegScavenger *RS) const {
1475   assert(RS && "RegScavenger required for long branching");
1476   assert(MBB.empty() &&
1477          "new block should be inserted for expanding unconditional branch");
1478   assert(MBB.pred_size() == 1);
1479 
1480   MachineFunction *MF = MBB.getParent();
1481   MachineRegisterInfo &MRI = MF->getRegInfo();
1482 
1483   // FIXME: Virtual register workaround for RegScavenger not working with empty
1484   // blocks.
1485   unsigned PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
1486 
1487   auto I = MBB.end();
1488 
1489   // We need to compute the offset relative to the instruction immediately after
1490   // s_getpc_b64. Insert pc arithmetic code before last terminator.
1491   MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
1492 
1493   // TODO: Handle > 32-bit block address.
1494   if (BrOffset >= 0) {
1495     BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
1496       .addReg(PCReg, RegState::Define, AMDGPU::sub0)
1497       .addReg(PCReg, 0, AMDGPU::sub0)
1498       .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_FORWARD);
1499     BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
1500       .addReg(PCReg, RegState::Define, AMDGPU::sub1)
1501       .addReg(PCReg, 0, AMDGPU::sub1)
1502       .addImm(0);
1503   } else {
1504     // Backwards branch.
1505     BuildMI(MBB, I, DL, get(AMDGPU::S_SUB_U32))
1506       .addReg(PCReg, RegState::Define, AMDGPU::sub0)
1507       .addReg(PCReg, 0, AMDGPU::sub0)
1508       .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_BACKWARD);
1509     BuildMI(MBB, I, DL, get(AMDGPU::S_SUBB_U32))
1510       .addReg(PCReg, RegState::Define, AMDGPU::sub1)
1511       .addReg(PCReg, 0, AMDGPU::sub1)
1512       .addImm(0);
1513   }
1514 
1515   // Insert the indirect branch after the other terminator.
1516   BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
1517     .addReg(PCReg);
1518 
1519   // FIXME: If spilling is necessary, this will fail because this scavenger has
1520   // no emergency stack slots. It is non-trivial to spill in this situation,
1521   // because the restore code needs to be specially placed after the
1522   // jump. BranchRelaxation then needs to be made aware of the newly inserted
1523   // block.
1524   //
1525   // If a spill is needed for the pc register pair, we need to insert a spill
1526   // restore block right before the destination block, and insert a short branch
1527   // into the old destination block's fallthrough predecessor.
1528   // e.g.:
1529   //
1530   // s_cbranch_scc0 skip_long_branch:
1531   //
1532   // long_branch_bb:
1533   //   spill s[8:9]
1534   //   s_getpc_b64 s[8:9]
1535   //   s_add_u32 s8, s8, restore_bb
1536   //   s_addc_u32 s9, s9, 0
1537   //   s_setpc_b64 s[8:9]
1538   //
1539   // skip_long_branch:
1540   //   foo;
1541   //
1542   // .....
1543   //
1544   // dest_bb_fallthrough_predecessor:
  //   bar;
  //   s_branch dest_bb
  //
  // restore_bb:
  //   restore s[8:9]
  //   fallthrough dest_bb
  //
1552   // dest_bb:
1553   //   buzz;
1554 
1555   RS->enterBasicBlockEnd(MBB);
1556   unsigned Scav = RS->scavengeRegisterBackwards(
1557     AMDGPU::SReg_64RegClass,
1558     MachineBasicBlock::iterator(GetPC), false, 0);
1559   MRI.replaceRegWith(PCReg, Scav);
1560   MRI.clearVirtRegs();
1561   RS->setRegUsed(Scav);
1562 
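  // Size of the expansion: s_getpc_b64 (4 bytes), s_add_u32/s_sub_u32 with a
  // 32-bit literal (8 bytes), s_addc_u32/s_subb_u32 (4 bytes), and
  // s_setpc_b64 (4 bytes).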
1563   return 4 + 8 + 4 + 4;
1564 }
1565 
1566 unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
1567   switch (Cond) {
1568   case SIInstrInfo::SCC_TRUE:
1569     return AMDGPU::S_CBRANCH_SCC1;
1570   case SIInstrInfo::SCC_FALSE:
1571     return AMDGPU::S_CBRANCH_SCC0;
1572   case SIInstrInfo::VCCNZ:
1573     return AMDGPU::S_CBRANCH_VCCNZ;
1574   case SIInstrInfo::VCCZ:
1575     return AMDGPU::S_CBRANCH_VCCZ;
1576   case SIInstrInfo::EXECNZ:
1577     return AMDGPU::S_CBRANCH_EXECNZ;
1578   case SIInstrInfo::EXECZ:
1579     return AMDGPU::S_CBRANCH_EXECZ;
1580   default:
1581     llvm_unreachable("invalid branch predicate");
1582   }
1583 }
1584 
1585 SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
1586   switch (Opcode) {
1587   case AMDGPU::S_CBRANCH_SCC0:
1588     return SCC_FALSE;
1589   case AMDGPU::S_CBRANCH_SCC1:
1590     return SCC_TRUE;
1591   case AMDGPU::S_CBRANCH_VCCNZ:
1592     return VCCNZ;
1593   case AMDGPU::S_CBRANCH_VCCZ:
1594     return VCCZ;
1595   case AMDGPU::S_CBRANCH_EXECNZ:
1596     return EXECNZ;
1597   case AMDGPU::S_CBRANCH_EXECZ:
1598     return EXECZ;
1599   default:
1600     return INVALID_BR;
1601   }
1602 }
1603 
1604 bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
1605                                     MachineBasicBlock::iterator I,
1606                                     MachineBasicBlock *&TBB,
1607                                     MachineBasicBlock *&FBB,
1608                                     SmallVectorImpl<MachineOperand> &Cond,
1609                                     bool AllowModify) const {
1610   if (I->getOpcode() == AMDGPU::S_BRANCH) {
1611     // Unconditional Branch
1612     TBB = I->getOperand(0).getMBB();
1613     return false;
1614   }
1615 
1616   MachineBasicBlock *CondBB = nullptr;
1617 
1618   if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
1619     CondBB = I->getOperand(1).getMBB();
1620     Cond.push_back(I->getOperand(0));
1621   } else {
1622     BranchPredicate Pred = getBranchPredicate(I->getOpcode());
1623     if (Pred == INVALID_BR)
1624       return true;
1625 
1626     CondBB = I->getOperand(0).getMBB();
1627     Cond.push_back(MachineOperand::CreateImm(Pred));
1628     Cond.push_back(I->getOperand(1)); // Save the branch register.
1629   }
1630   ++I;
1631 
1632   if (I == MBB.end()) {
1633     // Conditional branch followed by fall-through.
1634     TBB = CondBB;
1635     return false;
1636   }
1637 
1638   if (I->getOpcode() == AMDGPU::S_BRANCH) {
1639     TBB = CondBB;
1640     FBB = I->getOperand(0).getMBB();
1641     return false;
1642   }
1643 
1644   return true;
1645 }
1646 
1647 bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
1648                                 MachineBasicBlock *&FBB,
1649                                 SmallVectorImpl<MachineOperand> &Cond,
1650                                 bool AllowModify) const {
1651   MachineBasicBlock::iterator I = MBB.getFirstTerminator();
1652   auto E = MBB.end();
1653   if (I == E)
1654     return false;
1655 
  // Skip over the instructions that are artificial terminators for special
  // exec management.
1658   while (I != E && !I->isBranch() && !I->isReturn() &&
1659          I->getOpcode() != AMDGPU::SI_MASK_BRANCH) {
1660     switch (I->getOpcode()) {
1661     case AMDGPU::SI_MASK_BRANCH:
1662     case AMDGPU::S_MOV_B64_term:
1663     case AMDGPU::S_XOR_B64_term:
1664     case AMDGPU::S_ANDN2_B64_term:
1665       break;
1666     case AMDGPU::SI_IF:
1667     case AMDGPU::SI_ELSE:
1668     case AMDGPU::SI_KILL_I1_TERMINATOR:
1669     case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
1670       // FIXME: It's messy that these need to be considered here at all.
1671       return true;
1672     default:
1673       llvm_unreachable("unexpected non-branch terminator inst");
1674     }
1675 
1676     ++I;
1677   }
1678 
1679   if (I == E)
1680     return false;
1681 
1682   if (I->getOpcode() != AMDGPU::SI_MASK_BRANCH)
1683     return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
1684 
1685   ++I;
1686 
1687   // TODO: Should be able to treat as fallthrough?
1688   if (I == MBB.end())
1689     return true;
1690 
1691   if (analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify))
1692     return true;
1693 
1694   MachineBasicBlock *MaskBrDest = I->getOperand(0).getMBB();
1695 
1696   // Specifically handle the case where the conditional branch is to the same
1697   // destination as the mask branch. e.g.
1698   //
1699   // si_mask_branch BB8
1700   // s_cbranch_execz BB8
1701   // s_cbranch BB9
1702   //
1703   // This is required to understand divergent loops which may need the branches
1704   // to be relaxed.
1705   if (TBB != MaskBrDest || Cond.empty())
1706     return true;
1707 
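  // The conditional branch can only be folded with the mask branch when it is
  // predicated on EXEC; otherwise report the block as unanalyzable.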
1708   auto Pred = Cond[0].getImm();
1709   return (Pred != EXECZ && Pred != EXECNZ);
1710 }
1711 
1712 unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
1713                                    int *BytesRemoved) const {
1714   MachineBasicBlock::iterator I = MBB.getFirstTerminator();
1715 
1716   unsigned Count = 0;
1717   unsigned RemovedSize = 0;
1718   while (I != MBB.end()) {
1719     MachineBasicBlock::iterator Next = std::next(I);
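    // SI_MASK_BRANCH is left in place; it is not counted or removed here.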
1720     if (I->getOpcode() == AMDGPU::SI_MASK_BRANCH) {
1721       I = Next;
1722       continue;
1723     }
1724 
1725     RemovedSize += getInstSizeInBytes(*I);
1726     I->eraseFromParent();
1727     ++Count;
1728     I = Next;
1729   }
1730 
1731   if (BytesRemoved)
1732     *BytesRemoved = RemovedSize;
1733 
1734   return Count;
1735 }
1736 
1737 // Copy the flags onto the implicit condition register operand.
1738 static void preserveCondRegFlags(MachineOperand &CondReg,
1739                                  const MachineOperand &OrigCond) {
1740   CondReg.setIsUndef(OrigCond.isUndef());
1741   CondReg.setIsKill(OrigCond.isKill());
1742 }
1743 
1744 unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
1745                                    MachineBasicBlock *TBB,
1746                                    MachineBasicBlock *FBB,
1747                                    ArrayRef<MachineOperand> Cond,
1748                                    const DebugLoc &DL,
1749                                    int *BytesAdded) const {
1750   if (!FBB && Cond.empty()) {
1751     BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
1752       .addMBB(TBB);
1753     if (BytesAdded)
1754       *BytesAdded = 4;
1755     return 1;
1756   }
1757 
  if (Cond.size() == 1 && Cond[0].isReg()) {
    BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO))
      .add(Cond[0])
      .addMBB(TBB);
    return 1;
  }
1764 
1765   assert(TBB && Cond[0].isImm());
1766 
1767   unsigned Opcode
1768     = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
1769 
1770   if (!FBB) {
1772     MachineInstr *CondBr =
1773       BuildMI(&MBB, DL, get(Opcode))
1774       .addMBB(TBB);
1775 
1776     // Copy the flags onto the implicit condition register operand.
1777     preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
1778 
1779     if (BytesAdded)
1780       *BytesAdded = 4;
1781     return 1;
1782   }
1783 
1784   assert(TBB && FBB);
1785 
1786   MachineInstr *CondBr =
1787     BuildMI(&MBB, DL, get(Opcode))
1788     .addMBB(TBB);
1789   BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
1790     .addMBB(FBB);
1791 
  preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
1795 
  if (BytesAdded)
    *BytesAdded = 8;
1798 
1799   return 2;
1800 }
1801 
1802 bool SIInstrInfo::reverseBranchCondition(
1803   SmallVectorImpl<MachineOperand> &Cond) const {
1804   if (Cond.size() != 2) {
1805     return true;
1806   }
1807 
1808   if (Cond[0].isImm()) {
1809     Cond[0].setImm(-Cond[0].getImm());
1810     return false;
1811   }
1812 
1813   return true;
1814 }
1815 
1816 bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
1817                                   ArrayRef<MachineOperand> Cond,
1818                                   unsigned TrueReg, unsigned FalseReg,
1819                                   int &CondCycles,
1820                                   int &TrueCycles, int &FalseCycles) const {
1821   switch (Cond[0].getImm()) {
1822   case VCCNZ:
1823   case VCCZ: {
1824     const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1825     const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
1826     assert(MRI.getRegClass(FalseReg) == RC);
1827 
1828     int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;
1829     CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
1830 
1831     // Limit to equal cost for branch vs. N v_cndmask_b32s.
1832     return !RI.isSGPRClass(RC) && NumInsts <= 6;
1833   }
1834   case SCC_TRUE:
1835   case SCC_FALSE: {
1836     // FIXME: We could insert for VGPRs if we could replace the original compare
1837     // with a vector one.
1838     const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1839     const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
1840     assert(MRI.getRegClass(FalseReg) == RC);
1841 
1842     int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;
1843 
    // Widths that are a multiple of 64 bits can use s_cselect_b64 per pair.
1845     if (NumInsts % 2 == 0)
1846       NumInsts /= 2;
1847 
1848     CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
1849     return RI.isSGPRClass(RC);
1850   }
1851   default:
1852     return false;
1853   }
1854 }
1855 
1856 void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
1857                                MachineBasicBlock::iterator I, const DebugLoc &DL,
1858                                unsigned DstReg, ArrayRef<MachineOperand> Cond,
1859                                unsigned TrueReg, unsigned FalseReg) const {
1860   BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
1861   if (Pred == VCCZ || Pred == SCC_FALSE) {
1862     Pred = static_cast<BranchPredicate>(-Pred);
1863     std::swap(TrueReg, FalseReg);
1864   }
1865 
1866   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1867   const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
1868   unsigned DstSize = RI.getRegSizeInBits(*DstRC);
1869 
1870   if (DstSize == 32) {
1871     unsigned SelOp = Pred == SCC_TRUE ?
1872       AMDGPU::S_CSELECT_B32 : AMDGPU::V_CNDMASK_B32_e32;
1873 
1874     // Instruction's operands are backwards from what is expected.
1875     MachineInstr *Select =
1876       BuildMI(MBB, I, DL, get(SelOp), DstReg)
1877       .addReg(FalseReg)
1878       .addReg(TrueReg);
1879 
1880     preserveCondRegFlags(Select->getOperand(3), Cond[1]);
1881     return;
1882   }
1883 
1884   if (DstSize == 64 && Pred == SCC_TRUE) {
1885     MachineInstr *Select =
1886       BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
1887       .addReg(FalseReg)
1888       .addReg(TrueReg);
1889 
1890     preserveCondRegFlags(Select->getOperand(3), Cond[1]);
1891     return;
1892   }
1893 
1894   static const int16_t Sub0_15[] = {
1895     AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
1896     AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
1897     AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
1898     AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
1899   };
1900 
1901   static const int16_t Sub0_15_64[] = {
1902     AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
1903     AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
1904     AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
1905     AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
1906   };
1907 
1908   unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
1909   const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
1910   const int16_t *SubIndices = Sub0_15;
1911   int NElts = DstSize / 32;
1912 
  // 64-bit select is only available for SALU.
1914   if (Pred == SCC_TRUE) {
1915     SelOp = AMDGPU::S_CSELECT_B64;
1916     EltRC = &AMDGPU::SGPR_64RegClass;
1917     SubIndices = Sub0_15_64;
1918 
1919     assert(NElts % 2 == 0);
1920     NElts /= 2;
1921   }
1922 
1923   MachineInstrBuilder MIB = BuildMI(
1924     MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
1925 
1926   I = MIB->getIterator();
1927 
1928   SmallVector<unsigned, 8> Regs;
1929   for (int Idx = 0; Idx != NElts; ++Idx) {
1930     unsigned DstElt = MRI.createVirtualRegister(EltRC);
1931     Regs.push_back(DstElt);
1932 
1933     unsigned SubIdx = SubIndices[Idx];
1934 
1935     MachineInstr *Select =
1936       BuildMI(MBB, I, DL, get(SelOp), DstElt)
1937       .addReg(FalseReg, 0, SubIdx)
1938       .addReg(TrueReg, 0, SubIdx);
1939     preserveCondRegFlags(Select->getOperand(3), Cond[1]);
1940 
1941     MIB.addReg(DstElt)
1942        .addImm(SubIdx);
1943   }
1944 }
1945 
1946 bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) const {
1947   switch (MI.getOpcode()) {
1948   case AMDGPU::V_MOV_B32_e32:
1949   case AMDGPU::V_MOV_B32_e64:
1950   case AMDGPU::V_MOV_B64_PSEUDO: {
1951     // If there are additional implicit register operands, this may be used for
1952     // register indexing so the source register operand isn't simply copied.
1953     unsigned NumOps = MI.getDesc().getNumOperands() +
1954       MI.getDesc().getNumImplicitUses();
1955 
1956     return MI.getNumOperands() == NumOps;
1957   }
1958   case AMDGPU::S_MOV_B32:
1959   case AMDGPU::S_MOV_B64:
1960   case AMDGPU::COPY:
1961     return true;
1962   default:
1963     return false;
1964   }
1965 }
1966 
1967 unsigned SIInstrInfo::getAddressSpaceForPseudoSourceKind(
1968     unsigned Kind) const {
1969   switch(Kind) {
1970   case PseudoSourceValue::Stack:
1971   case PseudoSourceValue::FixedStack:
1972     return AMDGPUAS::PRIVATE_ADDRESS;
1973   case PseudoSourceValue::ConstantPool:
1974   case PseudoSourceValue::GOT:
1975   case PseudoSourceValue::JumpTable:
1976   case PseudoSourceValue::GlobalValueCallEntry:
1977   case PseudoSourceValue::ExternalSymbolCallEntry:
1978   case PseudoSourceValue::TargetCustom:
1979     return AMDGPUAS::CONSTANT_ADDRESS;
1980   }
1981   return AMDGPUAS::FLAT_ADDRESS;
1982 }
1983 
1984 static void removeModOperands(MachineInstr &MI) {
1985   unsigned Opc = MI.getOpcode();
1986   int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc,
1987                                               AMDGPU::OpName::src0_modifiers);
1988   int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc,
1989                                               AMDGPU::OpName::src1_modifiers);
1990   int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc,
1991                                               AMDGPU::OpName::src2_modifiers);
1992 
1993   MI.RemoveOperand(Src2ModIdx);
1994   MI.RemoveOperand(Src1ModIdx);
1995   MI.RemoveOperand(Src0ModIdx);
1996 }
1997 
1998 bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
1999                                 unsigned Reg, MachineRegisterInfo *MRI) const {
2000   if (!MRI->hasOneNonDBGUse(Reg))
2001     return false;
2002 
2003   switch (DefMI.getOpcode()) {
2004   default:
2005     return false;
2006   case AMDGPU::S_MOV_B64:
    // TODO: We could fold 64-bit immediates, but this gets complicated
    // when there are sub-registers.
2009     return false;
2010 
2011   case AMDGPU::V_MOV_B32_e32:
2012   case AMDGPU::S_MOV_B32:
2013     break;
2014   }
2015 
2016   const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0);
2017   assert(ImmOp);
2018   // FIXME: We could handle FrameIndex values here.
2019   if (!ImmOp->isImm())
2020     return false;
2021 
2022   unsigned Opc = UseMI.getOpcode();
2023   if (Opc == AMDGPU::COPY) {
2024     bool isVGPRCopy = RI.isVGPR(*MRI, UseMI.getOperand(0).getReg());
2025     unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
2026     UseMI.setDesc(get(NewOpc));
2027     UseMI.getOperand(1).ChangeToImmediate(ImmOp->getImm());
2028     UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent());
2029     return true;
2030   }
2031 
2032   if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 ||
2033       Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64) {
2034     // Don't fold if we are using source or output modifiers. The new VOP2
2035     // instructions don't have them.
2036     if (hasAnyModifiersSet(UseMI))
2037       return false;
2038 
2039     // If this is a free constant, there's no reason to do this.
2040     // TODO: We could fold this here instead of letting SIFoldOperands do it
2041     // later.
2042     MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0);
2043 
2044     // Any src operand can be used for the legality check.
2045     if (isInlineConstant(UseMI, *Src0, *ImmOp))
2046       return false;
2047 
2048     bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64;
2049     MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
2050     MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
2051 
2052     // Multiplied part is the constant: Use v_madmk_{f16, f32}.
2053     // We should only expect these to be on src0 due to canonicalizations.
2054     if (Src0->isReg() && Src0->getReg() == Reg) {
2055       if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
2056         return false;
2057 
2058       if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
2059         return false;
2060 
2061       // We need to swap operands 0 and 1 since madmk constant is at operand 1.
2062 
2063       const int64_t Imm = ImmOp->getImm();
2064 
2065       // FIXME: This would be a lot easier if we could return a new instruction
2066       // instead of having to modify in place.
2067 
2068       // Remove these first since they are at the end.
2069       UseMI.RemoveOperand(
2070           AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
2071       UseMI.RemoveOperand(
2072           AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
2073 
2074       unsigned Src1Reg = Src1->getReg();
2075       unsigned Src1SubReg = Src1->getSubReg();
2076       Src0->setReg(Src1Reg);
2077       Src0->setSubReg(Src1SubReg);
2078       Src0->setIsKill(Src1->isKill());
2079 
2080       if (Opc == AMDGPU::V_MAC_F32_e64 ||
2081           Opc == AMDGPU::V_MAC_F16_e64)
2082         UseMI.untieRegOperand(
2083             AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
2084 
2085       Src1->ChangeToImmediate(Imm);
2086 
2087       removeModOperands(UseMI);
2088       UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16));
2089 
2090       bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
2091       if (DeleteDef)
2092         DefMI.eraseFromParent();
2093 
2094       return true;
2095     }
2096 
2097     // Added part is the constant: Use v_madak_{f16, f32}.
2098     if (Src2->isReg() && Src2->getReg() == Reg) {
2099       // Not allowed to use constant bus for another operand.
2100       // We can however allow an inline immediate as src0.
2101       bool Src0Inlined = false;
2102       if (Src0->isReg()) {
        // Try to inline the constant if possible.
        // If the def is a move of an immediate and this is its only use,
        // folding it here saves a VGPR.
2106         MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
2107         if (Def && Def->isMoveImmediate() &&
2108           isInlineConstant(Def->getOperand(1)) &&
2109           MRI->hasOneUse(Src0->getReg())) {
2110           Src0->ChangeToImmediate(Def->getOperand(1).getImm());
2111           Src0Inlined = true;
2112         } else if ((RI.isPhysicalRegister(Src0->getReg()) &&
2113             RI.isSGPRClass(RI.getPhysRegClass(Src0->getReg()))) ||
2114             (RI.isVirtualRegister(Src0->getReg()) &&
2115             RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))
          return false;
        // VGPR is okay as Src0 - fallthrough
2118       }
2119 
      if (Src1->isReg() && !Src0Inlined) {
2121         // We have one slot for inlinable constant so far - try to fill it
2122         MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
2123         if (Def && Def->isMoveImmediate() &&
2124             isInlineConstant(Def->getOperand(1)) &&
2125             MRI->hasOneUse(Src1->getReg()) &&
            commuteInstruction(UseMI)) {
          Src0->ChangeToImmediate(Def->getOperand(1).getImm());
        } else if ((RI.isPhysicalRegister(Src1->getReg()) &&
            RI.isSGPRClass(RI.getPhysRegClass(Src1->getReg()))) ||
            (RI.isVirtualRegister(Src1->getReg()) &&
            RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))))
          return false;
        // VGPR is okay as Src1 - fallthrough
2134       }
2135 
2136       const int64_t Imm = ImmOp->getImm();
2137 
2138       // FIXME: This would be a lot easier if we could return a new instruction
2139       // instead of having to modify in place.
2140 
2141       // Remove these first since they are at the end.
2142       UseMI.RemoveOperand(
2143           AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
2144       UseMI.RemoveOperand(
2145           AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
2146 
2147       if (Opc == AMDGPU::V_MAC_F32_e64 ||
2148           Opc == AMDGPU::V_MAC_F16_e64)
2149         UseMI.untieRegOperand(
2150             AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
2151 
      // ChangeToImmediate adds Src2 back to the instruction.
2153       Src2->ChangeToImmediate(Imm);
2154 
2155       // These come before src2.
2156       removeModOperands(UseMI);
2157       UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16));
2158 
2159       bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
2160       if (DeleteDef)
2161         DefMI.eraseFromParent();
2162 
2163       return true;
2164     }
2165   }
2166 
2167   return false;
2168 }
2169 
2170 static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
2171                                 int WidthB, int OffsetB) {
2172   int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
2173   int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
2174   int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
2175   return LowOffset + LowWidth <= HighOffset;
2176 }
2177 
2178 bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr &MIa,
2179                                                MachineInstr &MIb) const {
2180   MachineOperand *BaseOp0, *BaseOp1;
2181   int64_t Offset0, Offset1;
2182 
2183   if (getMemOperandWithOffset(MIa, BaseOp0, Offset0, &RI) &&
2184       getMemOperandWithOffset(MIb, BaseOp1, Offset1, &RI)) {
2185     if (!BaseOp0->isIdenticalTo(*BaseOp1))
2186       return false;
2187 
2188     if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
2189       // FIXME: Handle ds_read2 / ds_write2.
2190       return false;
2191     }
2192     unsigned Width0 = (*MIa.memoperands_begin())->getSize();
2193     unsigned Width1 = (*MIb.memoperands_begin())->getSize();
2194     if (offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) {
2195       return true;
2196     }
2197   }
2198 
2199   return false;
2200 }
2201 
2202 bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr &MIa,
2203                                                   MachineInstr &MIb,
2204                                                   AliasAnalysis *AA) const {
2205   assert((MIa.mayLoad() || MIa.mayStore()) &&
2206          "MIa must load from or modify a memory location");
2207   assert((MIb.mayLoad() || MIb.mayStore()) &&
2208          "MIb must load from or modify a memory location");
2209 
2210   if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
2211     return false;
2212 
2213   // XXX - Can we relax this between address spaces?
2214   if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
2215     return false;
2216 
2217   if (AA && MIa.hasOneMemOperand() && MIb.hasOneMemOperand()) {
2218     const MachineMemOperand *MMOa = *MIa.memoperands_begin();
2219     const MachineMemOperand *MMOb = *MIb.memoperands_begin();
2220     if (MMOa->getValue() && MMOb->getValue()) {
2221       MemoryLocation LocA(MMOa->getValue(), MMOa->getSize(), MMOa->getAAInfo());
2222       MemoryLocation LocB(MMOb->getValue(), MMOb->getSize(), MMOb->getAAInfo());
2223       if (!AA->alias(LocA, LocB))
2224         return true;
2225     }
2226   }
2227 
2228   // TODO: Should we check the address space from the MachineMemOperand? That
2229   // would allow us to distinguish objects we know don't alias based on the
2230   // underlying address space, even if it was lowered to a different one,
2231   // e.g. private accesses lowered to use MUBUF instructions on a scratch
2232   // buffer.
2233   if (isDS(MIa)) {
2234     if (isDS(MIb))
2235       return checkInstOffsetsDoNotOverlap(MIa, MIb);
2236 
2237     return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
2238   }
2239 
2240   if (isMUBUF(MIa) || isMTBUF(MIa)) {
2241     if (isMUBUF(MIb) || isMTBUF(MIb))
2242       return checkInstOffsetsDoNotOverlap(MIa, MIb);
2243 
2244     return !isFLAT(MIb) && !isSMRD(MIb);
2245   }
2246 
2247   if (isSMRD(MIa)) {
2248     if (isSMRD(MIb))
2249       return checkInstOffsetsDoNotOverlap(MIa, MIb);
2250 
    return !isFLAT(MIb) && !isMUBUF(MIb) && !isMTBUF(MIb);
2252   }
2253 
2254   if (isFLAT(MIa)) {
2255     if (isFLAT(MIb))
2256       return checkInstOffsetsDoNotOverlap(MIa, MIb);
2257 
2258     return false;
2259   }
2260 
2261   return false;
2262 }
2263 
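// If \p MO uses a register that is defined by a V_MOV_B32 of an immediate,
// return that immediate; otherwise return 0. Callers treat a zero result as
// "nothing to fold".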
2264 static int64_t getFoldableImm(const MachineOperand* MO) {
  if (!MO->isReg())
    return 0;
2267   const MachineFunction *MF = MO->getParent()->getParent()->getParent();
2268   const MachineRegisterInfo &MRI = MF->getRegInfo();
2269   auto Def = MRI.getUniqueVRegDef(MO->getReg());
2270   if (Def && Def->getOpcode() == AMDGPU::V_MOV_B32_e32 &&
2271       Def->getOperand(1).isImm())
2272     return Def->getOperand(1).getImm();
  return 0;
2274 }
2275 
2276 MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
2277                                                  MachineInstr &MI,
2278                                                  LiveVariables *LV) const {
2279   unsigned Opc = MI.getOpcode();
2280   bool IsF16 = false;
2281   bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64;
2282 
2283   switch (Opc) {
2284   default:
2285     return nullptr;
2286   case AMDGPU::V_MAC_F16_e64:
2287     IsF16 = true;
2288     LLVM_FALLTHROUGH;
2289   case AMDGPU::V_MAC_F32_e64:
2290   case AMDGPU::V_FMAC_F32_e64:
2291     break;
2292   case AMDGPU::V_MAC_F16_e32:
2293     IsF16 = true;
2294     LLVM_FALLTHROUGH;
2295   case AMDGPU::V_MAC_F32_e32:
2296   case AMDGPU::V_FMAC_F32_e32: {
2297     int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
2298                                              AMDGPU::OpName::src0);
2299     const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
2300     if (!Src0->isReg() && !Src0->isImm())
2301       return nullptr;
2302 
2303     if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
2304       return nullptr;
2305 
2306     break;
2307   }
2308   }
2309 
2310   const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
2311   const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
2312   const MachineOperand *Src0Mods =
2313     getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
2314   const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
2315   const MachineOperand *Src1Mods =
2316     getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
2317   const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
2318   const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
2319   const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
2320 
  if (!IsFMA && !Src0Mods && !Src1Mods && !Clamp && !Omod &&
      // If we have an SGPR input, we will violate the constant bus restriction.
      (!Src0->isReg() ||
       !RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) {
2324     if (auto Imm = getFoldableImm(Src2)) {
2325       return BuildMI(*MBB, MI, MI.getDebugLoc(),
2326                      get(IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32))
2327                .add(*Dst)
2328                .add(*Src0)
2329                .add(*Src1)
2330                .addImm(Imm);
2331     }
2332     if (auto Imm = getFoldableImm(Src1)) {
2333       return BuildMI(*MBB, MI, MI.getDebugLoc(),
2334                      get(IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32))
2335                .add(*Dst)
2336                .add(*Src0)
2337                .addImm(Imm)
2338                .add(*Src2);
2339     }
2340     if (auto Imm = getFoldableImm(Src0)) {
2341       if (isOperandLegal(MI, AMDGPU::getNamedOperandIdx(AMDGPU::V_MADMK_F32,
2342                            AMDGPU::OpName::src0), Src1))
2343         return BuildMI(*MBB, MI, MI.getDebugLoc(),
2344                        get(IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32))
2345                  .add(*Dst)
2346                  .add(*Src1)
2347                  .addImm(Imm)
2348                  .add(*Src2);
2349     }
2350   }
2351 
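  // No madmk/madak folding applied; emit the three-address VOP3 mad/fma form,
  // carrying over the src0/src1 modifiers, clamp and omod.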
2352   assert((!IsFMA || !IsF16) && "fmac only expected with f32");
2353   unsigned NewOpc = IsFMA ? AMDGPU::V_FMA_F32 :
2354     (IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32);
2355   return BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
2356       .add(*Dst)
2357       .addImm(Src0Mods ? Src0Mods->getImm() : 0)
2358       .add(*Src0)
2359       .addImm(Src1Mods ? Src1Mods->getImm() : 0)
2360       .add(*Src1)
      .addImm(0) // Src2 mods
2362       .add(*Src2)
2363       .addImm(Clamp ? Clamp->getImm() : 0)
2364       .addImm(Omod ? Omod->getImm() : 0);
2365 }
2366 
// It's not generally safe to move VALU instructions across these, since a
// moved instruction would start accessing its register operands through the
// index rather than directly.
2369 // XXX - Why isn't hasSideEffects sufficient for these?
2370 static bool changesVGPRIndexingMode(const MachineInstr &MI) {
2371   switch (MI.getOpcode()) {
2372   case AMDGPU::S_SET_GPR_IDX_ON:
2373   case AMDGPU::S_SET_GPR_IDX_MODE:
2374   case AMDGPU::S_SET_GPR_IDX_OFF:
2375     return true;
2376   default:
2377     return false;
2378   }
2379 }
2380 
2381 bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
2382                                        const MachineBasicBlock *MBB,
2383                                        const MachineFunction &MF) const {
2384   // XXX - Do we want the SP check in the base implementation?
2385 
2386   // Target-independent instructions do not have an implicit-use of EXEC, even
2387   // when they operate on VGPRs. Treating EXEC modifications as scheduling
2388   // boundaries prevents incorrect movements of such instructions.
2389   return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF) ||
2390          MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
2391          MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
2392          MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
2393          changesVGPRIndexingMode(MI);
2394 }
2395 
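// DS opcodes that always operate on GDS rather than LDS.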
2396 bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
2397   return Opcode == AMDGPU::DS_ORDERED_COUNT ||
2398          Opcode == AMDGPU::DS_GWS_INIT ||
2399          Opcode == AMDGPU::DS_GWS_SEMA_V ||
2400          Opcode == AMDGPU::DS_GWS_SEMA_BR ||
2401          Opcode == AMDGPU::DS_GWS_SEMA_P ||
2402          Opcode == AMDGPU::DS_GWS_SEMA_RELEASE_ALL ||
2403          Opcode == AMDGPU::DS_GWS_BARRIER;
2404 }
2405 
2406 bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
2407   unsigned Opcode = MI.getOpcode();
2408 
2409   if (MI.mayStore() && isSMRD(MI))
2410     return true; // scalar store or atomic
2411 
2412   // These instructions cause shader I/O that may cause hardware lockups
2413   // when executed with an empty EXEC mask.
2414   //
2415   // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
2416   //       EXEC = 0, but checking for that case here seems not worth it
2417   //       given the typical code patterns.
2418   if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
2419       Opcode == AMDGPU::EXP || Opcode == AMDGPU::EXP_DONE ||
2420       Opcode == AMDGPU::DS_ORDERED_COUNT)
2421     return true;
2422 
2423   if (MI.isInlineAsm())
2424     return true; // conservative assumption
2425 
2426   // These are like SALU instructions in terms of effects, so it's questionable
2427   // whether we should return true for those.
2428   //
2429   // However, executing them with EXEC = 0 causes them to operate on undefined
2430   // data, which we avoid by returning true here.
2431   if (Opcode == AMDGPU::V_READFIRSTLANE_B32 || Opcode == AMDGPU::V_READLANE_B32)
2432     return true;
2433 
2434   return false;
2435 }
2436 
2437 bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
2438   switch (Imm.getBitWidth()) {
2439   case 32:
2440     return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
2441                                         ST.hasInv2PiInlineImm());
2442   case 64:
2443     return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
2444                                         ST.hasInv2PiInlineImm());
2445   case 16:
2446     return ST.has16BitInsts() &&
2447            AMDGPU::isInlinableLiteral16(Imm.getSExtValue(),
2448                                         ST.hasInv2PiInlineImm());
2449   default:
2450     llvm_unreachable("invalid bitwidth");
2451   }
2452 }
2453 
2454 bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
2455                                    uint8_t OperandType) const {
2456   if (!MO.isImm() ||
2457       OperandType < AMDGPU::OPERAND_SRC_FIRST ||
2458       OperandType > AMDGPU::OPERAND_SRC_LAST)
2459     return false;
2460 
2461   // MachineOperand provides no way to tell the true operand size, since it only
2462   // records a 64-bit value. We need to know the size to determine if a 32-bit
2463   // floating point immediate bit pattern is legal for an integer immediate. It
2464   // would be for any 32-bit integer operand, but would not be for a 64-bit one.
2465 
2466   int64_t Imm = MO.getImm();
2467   switch (OperandType) {
2468   case AMDGPU::OPERAND_REG_IMM_INT32:
2469   case AMDGPU::OPERAND_REG_IMM_FP32:
2470   case AMDGPU::OPERAND_REG_INLINE_C_INT32:
2471   case AMDGPU::OPERAND_REG_INLINE_C_FP32: {
2472     int32_t Trunc = static_cast<int32_t>(Imm);
2473     return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
2474   }
2475   case AMDGPU::OPERAND_REG_IMM_INT64:
2476   case AMDGPU::OPERAND_REG_IMM_FP64:
2477   case AMDGPU::OPERAND_REG_INLINE_C_INT64:
2478   case AMDGPU::OPERAND_REG_INLINE_C_FP64:
2479     return AMDGPU::isInlinableLiteral64(MO.getImm(),
2480                                         ST.hasInv2PiInlineImm());
2481   case AMDGPU::OPERAND_REG_IMM_INT16:
2482   case AMDGPU::OPERAND_REG_IMM_FP16:
2483   case AMDGPU::OPERAND_REG_INLINE_C_INT16:
2484   case AMDGPU::OPERAND_REG_INLINE_C_FP16: {
2485     if (isInt<16>(Imm) || isUInt<16>(Imm)) {
2486       // A few special case instructions have 16-bit operands on subtargets
2487       // where 16-bit instructions are not legal.
2488       // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
2489       // constants in these cases
2490       int16_t Trunc = static_cast<int16_t>(Imm);
2491       return ST.has16BitInsts() &&
2492              AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm());
2493     }
2494 
2495     return false;
2496   }
2497   case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
2498   case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: {
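    // Packed 16-bit operands: a value with only one half set is checked as a
    // 16-bit inline constant; otherwise the full 32-bit pattern must match a
    // packed inline constant.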
2499     if (isUInt<16>(Imm)) {
2500       int16_t Trunc = static_cast<int16_t>(Imm);
2501       return ST.has16BitInsts() &&
2502              AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm());
2503     }
2504     if (!(Imm & 0xffff)) {
2505       return ST.has16BitInsts() &&
2506              AMDGPU::isInlinableLiteral16(Imm >> 16, ST.hasInv2PiInlineImm());
2507     }
2508     uint32_t Trunc = static_cast<uint32_t>(Imm);
2509     return  AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm());
2510   }
2511   default:
2512     llvm_unreachable("invalid bitwidth");
2513   }
2514 }
2515 
2516 bool SIInstrInfo::isLiteralConstantLike(const MachineOperand &MO,
2517                                         const MCOperandInfo &OpInfo) const {
2518   switch (MO.getType()) {
2519   case MachineOperand::MO_Register:
2520     return false;
2521   case MachineOperand::MO_Immediate:
2522     return !isInlineConstant(MO, OpInfo);
2523   case MachineOperand::MO_FrameIndex:
2524   case MachineOperand::MO_MachineBasicBlock:
2525   case MachineOperand::MO_ExternalSymbol:
2526   case MachineOperand::MO_GlobalAddress:
2527   case MachineOperand::MO_MCSymbol:
2528     return true;
2529   default:
2530     llvm_unreachable("unexpected operand type");
2531   }
2532 }
2533 
2534 static bool compareMachineOp(const MachineOperand &Op0,
2535                              const MachineOperand &Op1) {
2536   if (Op0.getType() != Op1.getType())
2537     return false;
2538 
2539   switch (Op0.getType()) {
2540   case MachineOperand::MO_Register:
2541     return Op0.getReg() == Op1.getReg();
2542   case MachineOperand::MO_Immediate:
2543     return Op0.getImm() == Op1.getImm();
2544   default:
2545     llvm_unreachable("Didn't expect to be comparing these operand types");
2546   }
2547 }
2548 
2549 bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
2550                                     const MachineOperand &MO) const {
2551   const MCOperandInfo &OpInfo = get(MI.getOpcode()).OpInfo[OpNo];
2552 
2553   assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
2554 
2555   if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
2556     return true;
2557 
2558   if (OpInfo.RegClass < 0)
2559     return false;
2560 
2561   if (MO.isImm() && isInlineConstant(MO, OpInfo))
2562     return RI.opCanUseInlineConstant(OpInfo.OperandType);
2563 
2564   return RI.opCanUseLiteralConstant(OpInfo.OperandType);
2565 }
2566 
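// Return true if \p Opcode has a 32-bit VALU (e32) form whose encoding exists
// on the current subtarget.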
2567 bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
2568   int Op32 = AMDGPU::getVOPe32(Opcode);
2569   if (Op32 == -1)
2570     return false;
2571 
2572   return pseudoToMCOpcode(Op32) != -1;
2573 }
2574 
2575 bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
  // The src0_modifiers operand is present on all instructions
  // that have modifiers.
2578 
2579   return AMDGPU::getNamedOperandIdx(Opcode,
2580                                     AMDGPU::OpName::src0_modifiers) != -1;
2581 }
2582 
2583 bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
2584                                   unsigned OpName) const {
2585   const MachineOperand *Mods = getNamedOperand(MI, OpName);
2586   return Mods && Mods->getImm();
2587 }
2588 
2589 bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
2590   return hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
2591          hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
2592          hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers) ||
2593          hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
2594          hasModifiersSet(MI, AMDGPU::OpName::omod);
2595 }
2596 
2597 bool SIInstrInfo::canShrink(const MachineInstr &MI,
2598                             const MachineRegisterInfo &MRI) const {
2599   const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
2600   // Can't shrink instruction with three operands.
2601   // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add
  // a special case for it. It can only be shrunk if the third operand
  // is vcc. We should handle this the same way we handle vopc, by adding
  // a register allocation hint pre-regalloc and then doing the shrinking
2605   // post-regalloc.
2606   if (Src2) {
2607     switch (MI.getOpcode()) {
2608       default: return false;
2609 
2610       case AMDGPU::V_ADDC_U32_e64:
2611       case AMDGPU::V_SUBB_U32_e64:
2612       case AMDGPU::V_SUBBREV_U32_e64: {
2613         const MachineOperand *Src1
2614           = getNamedOperand(MI, AMDGPU::OpName::src1);
2615         if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
2616           return false;
2617         // Additional verification is needed for sdst/src2.
2618         return true;
2619       }
2620       case AMDGPU::V_MAC_F32_e64:
2621       case AMDGPU::V_MAC_F16_e64:
2622       case AMDGPU::V_FMAC_F32_e64:
2623         if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
2624             hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
2625           return false;
2626         break;
2627 
2628       case AMDGPU::V_CNDMASK_B32_e64:
2629         break;
2630     }
2631   }
2632 
2633   const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
2634   if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
2635                hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
2636     return false;
2637 
2638   // We don't need to check src0, all input types are legal, so just make sure
2639   // src0 isn't using any modifiers.
2640   if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
2641     return false;
2642 
2643   // Can it be shrunk to a valid 32 bit opcode?
2644   if (!hasVALU32BitEncoding(MI.getOpcode()))
2645     return false;
2646 
2647   // Check output modifiers
2648   return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
2649          !hasModifiersSet(MI, AMDGPU::OpName::clamp);
2650 }
2651 
2652 // Set VCC operand with all flags from \p Orig, except for setting it as
2653 // implicit.
2654 static void copyFlagsToImplicitVCC(MachineInstr &MI,
2655                                    const MachineOperand &Orig) {
2656 
2657   for (MachineOperand &Use : MI.implicit_operands()) {
2658     if (Use.isUse() && Use.getReg() == AMDGPU::VCC) {
2659       Use.setIsUndef(Orig.isUndef());
2660       Use.setIsKill(Orig.isKill());
2661       return;
2662     }
2663   }
2664 }
2665 
2666 MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
2667                                            unsigned Op32) const {
  MachineBasicBlock *MBB = MI.getParent();
2669   MachineInstrBuilder Inst32 =
2670     BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32));
2671 
2672   // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
2673   // For VOPC instructions, this is replaced by an implicit def of vcc.
2674   int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst);
2675   if (Op32DstIdx != -1) {
2676     // dst
2677     Inst32.add(MI.getOperand(0));
2678   } else {
2679     assert(MI.getOperand(0).getReg() == AMDGPU::VCC &&
2680            "Unexpected case");
2681   }
2682 
2683   Inst32.add(*getNamedOperand(MI, AMDGPU::OpName::src0));
2684 
2685   const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
2686   if (Src1)
2687     Inst32.add(*Src1);
2688 
2689   const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
2690 
2691   if (Src2) {
2692     int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2);
2693     if (Op32Src2Idx != -1) {
2694       Inst32.add(*Src2);
2695     } else {
2696       // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
2697       // replaced with an implicit read of vcc. This was already added
2698       // during the initial BuildMI, so find it to preserve the flags.
2699       copyFlagsToImplicitVCC(*Inst32, *Src2);
2700     }
2701   }
2702 
2703   return Inst32;
2704 }
2705 
2706 bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
2707                                   const MachineOperand &MO,
2708                                   const MCOperandInfo &OpInfo) const {
2709   // Literal constants use the constant bus.
2710   //if (isLiteralConstantLike(MO, OpInfo))
2711   // return true;
2712   if (MO.isImm())
2713     return !isInlineConstant(MO, OpInfo);
2714 
2715   if (!MO.isReg())
2716     return true; // Misc other operands like FrameIndex
2717 
2718   if (!MO.isUse())
2719     return false;
2720 
2721   if (TargetRegisterInfo::isVirtualRegister(MO.getReg()))
2722     return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));
2723 
2724   // FLAT_SCR is just an SGPR pair.
2725   if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCR))
2726     return true;
2727 
2728   // EXEC register uses the constant bus.
2729   if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC)
2730     return true;
2731 
2732   // SGPRs use the constant bus
2733   return (MO.getReg() == AMDGPU::VCC || MO.getReg() == AMDGPU::M0 ||
2734           (!MO.isImplicit() &&
2735            (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) ||
2736             AMDGPU::SGPR_64RegClass.contains(MO.getReg()))));
2737 }
2738 
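// Return the SGPR (VCC, M0 or FLAT_SCR) that \p MI implicitly reads, or
// AMDGPU::NoRegister if there is none.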
2739 static unsigned findImplicitSGPRRead(const MachineInstr &MI) {
2740   for (const MachineOperand &MO : MI.implicit_operands()) {
2741     // We only care about reads.
2742     if (MO.isDef())
2743       continue;
2744 
2745     switch (MO.getReg()) {
2746     case AMDGPU::VCC:
2747     case AMDGPU::M0:
2748     case AMDGPU::FLAT_SCR:
2749       return MO.getReg();
2750 
2751     default:
2752       break;
2753     }
2754   }
2755 
2756   return AMDGPU::NoRegister;
2757 }
2758 
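// Return true if \p MI is expected to have an implicit use of EXEC.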
2759 static bool shouldReadExec(const MachineInstr &MI) {
2760   if (SIInstrInfo::isVALU(MI)) {
2761     switch (MI.getOpcode()) {
2762     case AMDGPU::V_READLANE_B32:
2763     case AMDGPU::V_READLANE_B32_si:
2764     case AMDGPU::V_READLANE_B32_vi:
2765     case AMDGPU::V_WRITELANE_B32:
2766     case AMDGPU::V_WRITELANE_B32_si:
2767     case AMDGPU::V_WRITELANE_B32_vi:
2768       return false;
2769     }
2770 
2771     return true;
2772   }
2773 
2774   if (SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
2775       SIInstrInfo::isSALU(MI) ||
2776       SIInstrInfo::isSMRD(MI))
2777     return false;
2778 
2779   return true;
2780 }
2781 
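// Return true if \p SubReg refers to a piece of \p SuperVec: either a
// physical sub-register, or a subreg index on the same virtual register.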
2782 static bool isSubRegOf(const SIRegisterInfo &TRI,
2783                        const MachineOperand &SuperVec,
2784                        const MachineOperand &SubReg) {
2785   if (TargetRegisterInfo::isPhysicalRegister(SubReg.getReg()))
2786     return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
2787 
2788   return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
2789          SubReg.getReg() == SuperVec.getReg();
2790 }
2791 
2792 bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
2793                                     StringRef &ErrInfo) const {
2794   uint16_t Opcode = MI.getOpcode();
2795   if (SIInstrInfo::isGenericOpcode(MI.getOpcode()))
2796     return true;
2797 
2798   const MachineFunction *MF = MI.getParent()->getParent();
2799   const MachineRegisterInfo &MRI = MF->getRegInfo();
2800 
2801   int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
2802   int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
2803   int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
2804 
2805   // Make sure the number of operands is correct.
2806   const MCInstrDesc &Desc = get(Opcode);
2807   if (!Desc.isVariadic() &&
2808       Desc.getNumOperands() != MI.getNumExplicitOperands()) {
2809     ErrInfo = "Instruction has wrong number of operands.";
2810     return false;
2811   }
2812 
2813   if (MI.isInlineAsm()) {
2814     // Verify register classes for inlineasm constraints.
2815     for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
2816          I != E; ++I) {
2817       const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
2818       if (!RC)
2819         continue;
2820 
2821       const MachineOperand &Op = MI.getOperand(I);
2822       if (!Op.isReg())
2823         continue;
2824 
2825       unsigned Reg = Op.getReg();
2826       if (!TargetRegisterInfo::isVirtualRegister(Reg) && !RC->contains(Reg)) {
2827         ErrInfo = "inlineasm operand has incorrect register class.";
2828         return false;
2829       }
2830     }
2831 
2832     return true;
2833   }
2834 
2835   // Make sure the register classes are correct.
2836   for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
2837     if (MI.getOperand(i).isFPImm()) {
2838       ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
2839                 "all fp values to integers.";
2840       return false;
2841     }
2842 
2843     int RegClass = Desc.OpInfo[i].RegClass;
2844 
2845     switch (Desc.OpInfo[i].OperandType) {
2846     case MCOI::OPERAND_REGISTER:
2847       if (MI.getOperand(i).isImm()) {
2848         ErrInfo = "Illegal immediate value for operand.";
2849         return false;
2850       }
2851       break;
2852     case AMDGPU::OPERAND_REG_IMM_INT32:
2853     case AMDGPU::OPERAND_REG_IMM_FP32:
2854       break;
2855     case AMDGPU::OPERAND_REG_INLINE_C_INT32:
2856     case AMDGPU::OPERAND_REG_INLINE_C_FP32:
2857     case AMDGPU::OPERAND_REG_INLINE_C_INT64:
2858     case AMDGPU::OPERAND_REG_INLINE_C_FP64:
2859     case AMDGPU::OPERAND_REG_INLINE_C_INT16:
2860     case AMDGPU::OPERAND_REG_INLINE_C_FP16: {
2861       const MachineOperand &MO = MI.getOperand(i);
2862       if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
2863         ErrInfo = "Illegal immediate value for operand.";
2864         return false;
2865       }
2866       break;
2867     }
2868     case MCOI::OPERAND_IMMEDIATE:
2869     case AMDGPU::OPERAND_KIMM32:
2870       // Check if this operand is an immediate.
2871       // FrameIndex operands will be replaced by immediates, so they are
2872       // allowed.
2873       if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
2874         ErrInfo = "Expected immediate, but got non-immediate";
2875         return false;
2876       }
2877       LLVM_FALLTHROUGH;
2878     default:
2879       continue;
2880     }
2881 
2882     if (!MI.getOperand(i).isReg())
2883       continue;
2884 
2885     if (RegClass != -1) {
2886       unsigned Reg = MI.getOperand(i).getReg();
2887       if (Reg == AMDGPU::NoRegister ||
2888           TargetRegisterInfo::isVirtualRegister(Reg))
2889         continue;
2890 
2891       const TargetRegisterClass *RC = RI.getRegClass(RegClass);
2892       if (!RC->contains(Reg)) {
2893         ErrInfo = "Operand has incorrect register class.";
2894         return false;
2895       }
2896     }
2897   }
2898 
2899   // Verify SDWA
2900   if (isSDWA(MI)) {
2901     if (!ST.hasSDWA()) {
2902       ErrInfo = "SDWA is not supported on this target";
2903       return false;
2904     }
2905 
2906     int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
2907 
    const int OpIndices[] = { DstIdx, Src0Idx, Src1Idx, Src2Idx };

    for (int OpIdx : OpIndices) {
2911       if (OpIdx == -1)
2912         continue;
2913       const MachineOperand &MO = MI.getOperand(OpIdx);
2914 
2915       if (!ST.hasSDWAScalar()) {
        // Only VGPRs on VI
2917         if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
2918           ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
2919           return false;
2920         }
2921       } else {
2922         // No immediates on GFX9
2923         if (!MO.isReg()) {
2924           ErrInfo = "Only reg allowed as operands in SDWA instructions on GFX9";
2925           return false;
2926         }
2927       }
2928     }
2929 
2930     if (!ST.hasSDWAOmod()) {
2931       // No omod allowed on VI
2932       const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
2933       if (OMod != nullptr &&
2934         (!OMod->isImm() || OMod->getImm() != 0)) {
2935         ErrInfo = "OMod not allowed in SDWA instructions on VI";
2936         return false;
2937       }
2938     }
2939 
2940     uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
2941     if (isVOPC(BasicOpcode)) {
2942       if (!ST.hasSDWASdst() && DstIdx != -1) {
2943         // Only vcc allowed as dst on VI for VOPC
2944         const MachineOperand &Dst = MI.getOperand(DstIdx);
2945         if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
2946           ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
2947           return false;
2948         }
2949       } else if (!ST.hasSDWAOutModsVOPC()) {
2950         // No clamp allowed on GFX9 for VOPC
2951         const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
2952         if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
2953           ErrInfo = "Clamp not allowed in VOPC SDWA instructions on GFX9";
2954           return false;
2955         }
2956 
2957         // No omod allowed on GFX9 for VOPC
2958         const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
2959         if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
2960           ErrInfo = "OMod not allowed in VOPC SDWA instructions on GFX9";
2961           return false;
2962         }
2963       }
2964     }
2965 
2966     const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
2967     if (DstUnused && DstUnused->isImm() &&
2968         DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
2969       const MachineOperand &Dst = MI.getOperand(DstIdx);
2970       if (!Dst.isReg() || !Dst.isTied()) {
2971         ErrInfo = "Dst register should have tied register";
2972         return false;
2973       }
2974 
2975       const MachineOperand &TiedMO =
2976           MI.getOperand(MI.findTiedOperandIdx(DstIdx));
2977       if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
2978         ErrInfo =
2979             "Dst register should be tied to implicit use of preserved register";
2980         return false;
2981       } else if (TargetRegisterInfo::isPhysicalRegister(TiedMO.getReg()) &&
2982                  Dst.getReg() != TiedMO.getReg()) {
2983         ErrInfo = "Dst register should use same physical register as preserved";
2984         return false;
2985       }
2986     }
2987   }
2988 
2989   // Verify MIMG
2990   if (isMIMG(MI.getOpcode()) && !MI.mayStore()) {
2991     // Ensure that the return type used is large enough for all the options
2992     // being used. TFE/LWE require an extra result register.
2993     const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
2994     if (DMask) {
2995       uint64_t DMaskImm = DMask->getImm();
2996       uint32_t RegCount =
2997           isGather4(MI.getOpcode()) ? 4 : countPopulation(DMaskImm);
2998       const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
2999       const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
3000       const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
3001 
3002       // Adjust for packed 16 bit values
3003       if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
3004         RegCount >>= 1;
3005 
3006       // Adjust if using LWE or TFE
3007       if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
3008         RegCount += 1;
3009 
3010       const uint32_t DstIdx =
3011           AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
3012       const MachineOperand &Dst = MI.getOperand(DstIdx);
3013       if (Dst.isReg()) {
3014         const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
3015         uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
3016         if (RegCount > DstSize) {
3017           ErrInfo = "MIMG instruction returns too many registers for dst "
3018                     "register class";
3019           return false;
3020         }
3021       }
3022     }
3023   }
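       // Informal example of the check above: a non-gather4 MIMG load with
       // dmask = 0b1011 needs countPopulation = 3 result registers; with packed
       // d16 that is halved to 1, and an enabled tfe/lwe then adds one more, so
       // the vdata register class must span at least 2 x 32 bits.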
3024 
3025   // Verify VOP*. Ignore multiple sgpr operands on writelane.
3026   if (Desc.getOpcode() != AMDGPU::V_WRITELANE_B32
3027       && (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI))) {
3028     // Only look at the true operands. Only a real operand can use the constant
3029     // bus, and we don't want to check pseudo-operands like the source modifier
3030     // flags.
3031     const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };
3032 
3033     unsigned ConstantBusCount = 0;
3034     unsigned LiteralCount = 0;
3035 
3036     if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1)
3037       ++ConstantBusCount;
3038 
3039     unsigned SGPRUsed = findImplicitSGPRRead(MI);
3040     if (SGPRUsed != AMDGPU::NoRegister)
3041       ++ConstantBusCount;
3042 
3043     for (int OpIdx : OpIndices) {
3044       if (OpIdx == -1)
3045         break;
3046       const MachineOperand &MO = MI.getOperand(OpIdx);
3047       if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) {
3048         if (MO.isReg()) {
3049           if (MO.getReg() != SGPRUsed)
3050             ++ConstantBusCount;
3051           SGPRUsed = MO.getReg();
3052         } else {
3053           ++ConstantBusCount;
3054           ++LiteralCount;
3055         }
3056       }
3057     }
3058     if (ConstantBusCount > 1) {
3059       ErrInfo = "VOP* instruction uses the constant bus more than once";
3060       return false;
3061     }
3062 
3063     if (isVOP3(MI) && LiteralCount) {
3064       ErrInfo = "VOP3 instruction uses literal";
3065       return false;
3066     }
3067   }
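       // Informal example of the constant bus rule checked above: something
       // like V_ADD_F32_e64 %vgpr0, %sgpr0, %sgpr1 would read two different
       // SGPRs through the constant bus in a single VALU instruction and is
       // rejected, while reusing the same SGPR for both sources (or using a
       // VGPR for one of them) counts as only one constant bus use.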
3068 
3069   // Verify misc. restrictions on specific instructions.
3070   if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 ||
3071       Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) {
3072     const MachineOperand &Src0 = MI.getOperand(Src0Idx);
3073     const MachineOperand &Src1 = MI.getOperand(Src1Idx);
3074     const MachineOperand &Src2 = MI.getOperand(Src2Idx);
3075     if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
3076       if (!compareMachineOp(Src0, Src1) &&
3077           !compareMachineOp(Src0, Src2)) {
3078         ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
3079         return false;
3080       }
3081     }
3082   }
3083 
3084   if (isSOPK(MI)) {
3085     int64_t Imm = getNamedOperand(MI, AMDGPU::OpName::simm16)->getImm();
3086     if (sopkIsZext(MI)) {
3087       if (!isUInt<16>(Imm)) {
3088         ErrInfo = "invalid immediate for SOPK instruction";
3089         return false;
3090       }
3091     } else {
3092       if (!isInt<16>(Imm)) {
3093         ErrInfo = "invalid immediate for SOPK instruction";
3094         return false;
3095       }
3096     }
3097   }
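       // For reference: a zero-extending SOPK opcode treats simm16 as unsigned,
       // so the accepted range is [0, 65535]; sign-extending ones accept
       // [-32768, 32767]. An immediate of 40000, for example, is only valid in
       // the zero-extended case.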
3098 
3099   if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
3100       Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
3101       Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
3102       Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
3103     const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
3104                        Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
3105 
3106     const unsigned StaticNumOps = Desc.getNumOperands() +
3107       Desc.getNumImplicitUses();
3108     const unsigned NumImplicitOps = IsDst ? 2 : 1;
3109 
3110     // Allow additional implicit operands. This allows a fixup done by the post
3111     // RA scheduler where the main implicit operand is killed and implicit-defs
3112     // are added for sub-registers that remain live after this instruction.
3113     if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
3114       ErrInfo = "missing implicit register operands";
3115       return false;
3116     }
3117 
3118     const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
3119     if (IsDst) {
3120       if (!Dst->isUse()) {
3121         ErrInfo = "v_movreld_b32 vdst should be a use operand";
3122         return false;
3123       }
3124 
3125       unsigned UseOpIdx;
3126       if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
3127           UseOpIdx != StaticNumOps + 1) {
3128         ErrInfo = "movrel implicit operands should be tied";
3129         return false;
3130       }
3131     }
3132 
3133     const MachineOperand &Src0 = MI.getOperand(Src0Idx);
3134     const MachineOperand &ImpUse
3135       = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
3136     if (!ImpUse.isReg() || !ImpUse.isUse() ||
3137         !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
3138       ErrInfo = "src0 should be subreg of implicit vector use";
3139       return false;
3140     }
3141   }
3142 
3143   // Make sure we aren't losing exec uses in the td files. This mostly requires
3144   // being careful when using let Uses to add other use registers.
3145   if (shouldReadExec(MI)) {
3146     if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
3147       ErrInfo = "VALU instruction does not implicitly read exec mask";
3148       return false;
3149     }
3150   }
3151 
3152   if (isSMRD(MI)) {
3153     if (MI.mayStore()) {
3154       // The register offset form of scalar stores may only use m0 as the
3155       // soffset register.
3156       const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soff);
3157       if (Soff && Soff->getReg() != AMDGPU::M0) {
3158         ErrInfo = "scalar stores must use m0 as offset register";
3159         return false;
3160       }
3161     }
3162   }
3163 
3164   if (isFLAT(MI) && !MF->getSubtarget<GCNSubtarget>().hasFlatInstOffsets()) {
3165     const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
3166     if (Offset->getImm() != 0) {
3167       ErrInfo = "subtarget does not support offsets in flat instructions";
3168       return false;
3169     }
3170   }
3171 
3172   const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
3173   if (DppCt) {
3174     using namespace AMDGPU::DPP;
3175 
3176     unsigned DC = DppCt->getImm();
3177     if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
3178         DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
3179         (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
3180         (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
3181         (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
3182         (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST)) {
3183       ErrInfo = "Invalid dpp_ctrl value";
3184       return false;
3185     }
3186   }
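       // Note (informal): the dpp_ctrl immediate space is sparse, so anything
       // past DPP_LAST or inside one of the DPP_UNUSED* gaps checked above does
       // not name a defined DPP operation; only controls with a defined meaning
       // (e.g. quad permutes, row/wave shifts and rotates, mirrors, broadcasts)
       // are accepted.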
3187 
3188   return true;
3189 }
3190 
3191 unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
3192   switch (MI.getOpcode()) {
3193   default: return AMDGPU::INSTRUCTION_LIST_END;
3194   case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
3195   case AMDGPU::COPY: return AMDGPU::COPY;
3196   case AMDGPU::PHI: return AMDGPU::PHI;
3197   case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
3198   case AMDGPU::WQM: return AMDGPU::WQM;
3199   case AMDGPU::WWM: return AMDGPU::WWM;
3200   case AMDGPU::S_MOV_B32:
3201     return MI.getOperand(1).isReg() ?
3202            AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
3203   case AMDGPU::S_ADD_I32:
3204     return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_I32_e32;
3205   case AMDGPU::S_ADDC_U32:
3206     return AMDGPU::V_ADDC_U32_e32;
3207   case AMDGPU::S_SUB_I32:
3208     return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_I32_e32;
3209     // FIXME: These are not consistently handled, and selected when the carry is
3210     // used.
3211   case AMDGPU::S_ADD_U32:
3212     return AMDGPU::V_ADD_I32_e32;
3213   case AMDGPU::S_SUB_U32:
3214     return AMDGPU::V_SUB_I32_e32;
3215   case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
3216   case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32;
3217   case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
3218   case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
3219   case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
3220   case AMDGPU::S_XNOR_B32:
3221     return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
3222   case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
3223   case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
3224   case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
3225   case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
3226   case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
3227   case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64;
3228   case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
3229   case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64;
3230   case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
3231   case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64;
3232   case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32;
3233   case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32;
3234   case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32;
3235   case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32;
3236   case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
3237   case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
3238   case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
3239   case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
3240   case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32;
3241   case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32;
3242   case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32;
3243   case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32;
3244   case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32;
3245   case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32;
3246   case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32;
3247   case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32;
3248   case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32;
3249   case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32;
3250   case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32;
3251   case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32;
3252   case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e32;
3253   case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e32;
3254   case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
3255   case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
3256   case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
3257   case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
3258   case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
3259   case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
3260   }
3261 }
3262 
3263 const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
3264                                                       unsigned OpNo) const {
3265   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
3266   const MCInstrDesc &Desc = get(MI.getOpcode());
3267   if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
3268       Desc.OpInfo[OpNo].RegClass == -1) {
3269     unsigned Reg = MI.getOperand(OpNo).getReg();
3270 
3271     if (TargetRegisterInfo::isVirtualRegister(Reg))
3272       return MRI.getRegClass(Reg);
3273     return RI.getPhysRegClass(Reg);
3274   }
3275 
3276   unsigned RCID = Desc.OpInfo[OpNo].RegClass;
3277   return RI.getRegClass(RCID);
3278 }
3279 
3280 void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
3281   MachineBasicBlock::iterator I = MI;
3282   MachineBasicBlock *MBB = MI.getParent();
3283   MachineOperand &MO = MI.getOperand(OpIdx);
3284   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
3285   unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass;
3286   const TargetRegisterClass *RC = RI.getRegClass(RCID);
3287   unsigned Opcode = AMDGPU::V_MOV_B32_e32;
3288   if (MO.isReg())
3289     Opcode = AMDGPU::COPY;
3290   else if (RI.isSGPRClass(RC))
3291     Opcode = AMDGPU::S_MOV_B32;
3292 
3293   const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
3294   if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC))
3295     VRC = &AMDGPU::VReg_64RegClass;
3296   else
3297     VRC = &AMDGPU::VGPR_32RegClass;
3298 
3299   unsigned Reg = MRI.createVirtualRegister(VRC);
3300   DebugLoc DL = MBB->findDebugLoc(I);
3301   BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
3302   MO.ChangeToRegister(Reg, false);
3303 }
3304 
3305 unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI,
3306                                          MachineRegisterInfo &MRI,
3307                                          MachineOperand &SuperReg,
3308                                          const TargetRegisterClass *SuperRC,
3309                                          unsigned SubIdx,
3310                                          const TargetRegisterClass *SubRC)
3311                                          const {
3312   MachineBasicBlock *MBB = MI->getParent();
3313   DebugLoc DL = MI->getDebugLoc();
3314   unsigned SubReg = MRI.createVirtualRegister(SubRC);
3315 
3316   if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) {
3317     BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
3318       .addReg(SuperReg.getReg(), 0, SubIdx);
3319     return SubReg;
3320   }
3321 
3322   // Just in case the super register is itself a sub-register, copy it to a new
3323   // value so we don't need to worry about merging its subreg index with the
3324   // SubIdx passed to this function. The register coalescer should be able to
3325   // eliminate this extra copy.
3326   unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC);
3327 
3328   BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg)
3329     .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg());
3330 
3331   BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
3332     .addReg(NewSuperReg, 0, SubIdx);
3333 
3334   return SubReg;
3335 }
3336 
3337 MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
3338   MachineBasicBlock::iterator MII,
3339   MachineRegisterInfo &MRI,
3340   MachineOperand &Op,
3341   const TargetRegisterClass *SuperRC,
3342   unsigned SubIdx,
3343   const TargetRegisterClass *SubRC) const {
3344   if (Op.isImm()) {
3345     if (SubIdx == AMDGPU::sub0)
3346       return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
3347     if (SubIdx == AMDGPU::sub1)
3348       return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
3349 
3350     llvm_unreachable("Unhandled register index for immediate");
3351   }
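       // Informal example of the split above: for a 64-bit immediate
       // 0x1122334455667788, sub0 yields the low half 0x55667788 and sub1
       // yields the high half 0x11223344, mirroring how a 64-bit register pair
       // is decomposed.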
3352 
3353   unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
3354                                        SubIdx, SubRC);
3355   return MachineOperand::CreateReg(SubReg, false);
3356 }
3357 
3358 // Change the order of operands from (0, 1, 2) to (0, 2, 1)
3359 void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
3360   assert(Inst.getNumExplicitOperands() == 3);
3361   MachineOperand Op1 = Inst.getOperand(1);
3362   Inst.RemoveOperand(1);
3363   Inst.addOperand(Op1);
3364 }
3365 
3366 bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
3367                                     const MCOperandInfo &OpInfo,
3368                                     const MachineOperand &MO) const {
3369   if (!MO.isReg())
3370     return false;
3371 
3372   unsigned Reg = MO.getReg();
3373   const TargetRegisterClass *RC =
3374     TargetRegisterInfo::isVirtualRegister(Reg) ?
3375     MRI.getRegClass(Reg) :
3376     RI.getPhysRegClass(Reg);
3377 
3378   const SIRegisterInfo *TRI =
3379       static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
3380   RC = TRI->getSubRegClass(RC, MO.getSubReg());
3381 
3382   // In order to be legal, the common sub-class must be equal to the
3383   // class of the current operand.  For example:
3384   //
3385   // v_mov_b32 s0 ; Operand defined as vsrc_b32
3386   //              ; RI.getCommonSubClass(s0,vsrc_b32) = sgpr ; LEGAL
3387   //
3388   // s_sendmsg 0, s0 ; Operand defined as m0reg
3389   //                 ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL
3390 
3391   return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC;
3392 }
3393 
3394 bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
3395                                      const MCOperandInfo &OpInfo,
3396                                      const MachineOperand &MO) const {
3397   if (MO.isReg())
3398     return isLegalRegOperand(MRI, OpInfo, MO);
3399 
3400   // Handle non-register types that are treated like immediates.
3401   assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
3402   return true;
3403 }
3404 
3405 bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
3406                                  const MachineOperand *MO) const {
3407   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
3408   const MCInstrDesc &InstDesc = MI.getDesc();
3409   const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx];
3410   const TargetRegisterClass *DefinedRC =
3411       OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
3412   if (!MO)
3413     MO = &MI.getOperand(OpIdx);
3414 
3415   if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) {
3416 
3417     RegSubRegPair SGPRUsed;
3418     if (MO->isReg())
3419       SGPRUsed = RegSubRegPair(MO->getReg(), MO->getSubReg());
3420 
3421     for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3422       if (i == OpIdx)
3423         continue;
3424       const MachineOperand &Op = MI.getOperand(i);
3425       if (Op.isReg()) {
3426         if ((Op.getReg() != SGPRUsed.Reg || Op.getSubReg() != SGPRUsed.SubReg) &&
3427             usesConstantBus(MRI, Op, InstDesc.OpInfo[i])) {
3428           return false;
3429         }
3430       } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) {
3431         return false;
3432       }
3433     }
3434   }
3435 
3436   if (MO->isReg()) {
3437     assert(DefinedRC);
3438     return isLegalRegOperand(MRI, OpInfo, *MO);
3439   }
3440 
3441   // Handle non-register types that are treated like immediates.
3442   assert(MO->isImm() || MO->isTargetIndex() || MO->isFI());
3443 
3444   if (!DefinedRC) {
3445     // This operand expects an immediate.
3446     return true;
3447   }
3448 
3449   return isImmOperandLegal(MI, OpIdx, *MO);
3450 }
3451 
3452 void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
3453                                        MachineInstr &MI) const {
3454   unsigned Opc = MI.getOpcode();
3455   const MCInstrDesc &InstrDesc = get(Opc);
3456 
3457   int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
3458   MachineOperand &Src1 = MI.getOperand(Src1Idx);
3459 
3460   // If there is an implicit SGPR use such as VCC for v_addc_u32/v_subb_u32,
3461   // we need to ensure there is only one constant bus use.
3462   //
3463   // Note we do not need to worry about literal constants here. They are
3464   // disabled for the operand types of these instructions because they would
3465   // always violate the one constant bus use rule.
3466   bool HasImplicitSGPR = findImplicitSGPRRead(MI) != AMDGPU::NoRegister;
3467   if (HasImplicitSGPR) {
3468     int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
3469     MachineOperand &Src0 = MI.getOperand(Src0Idx);
3470 
3471     if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg()))
3472       legalizeOpWithMove(MI, Src0Idx);
3473   }
3474 
3475   // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
3476   // both the value to write (src0) and lane select (src1).  Fix up non-SGPR
3477   // src0/src1 with V_READFIRSTLANE.
3478   if (Opc == AMDGPU::V_WRITELANE_B32) {
3479     int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
3480     MachineOperand &Src0 = MI.getOperand(Src0Idx);
3481     const DebugLoc &DL = MI.getDebugLoc();
3482     if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
3483       unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3484       BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
3485           .add(Src0);
3486       Src0.ChangeToRegister(Reg, false);
3487     }
3488     if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
3489       unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3490       const DebugLoc &DL = MI.getDebugLoc();
3491       BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
3492           .add(Src1);
3493       Src1.ChangeToRegister(Reg, false);
3494     }
3495     return;
3496   }
3497 
3498   // VOP2 instructions support all operand types for src0, so we don't need to
3499   // check its legality. If src1 is already legal, we don't need to do anything.
3500   if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1))
3501     return;
3502 
3503   // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
3504   // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
3505   // select is uniform.
3506   if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
3507       RI.isVGPR(MRI, Src1.getReg())) {
3508     unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3509     const DebugLoc &DL = MI.getDebugLoc();
3510     BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
3511         .add(Src1);
3512     Src1.ChangeToRegister(Reg, false);
3513     return;
3514   }
3515 
3516   // We do not use commuteInstruction here because it is too aggressive and will
3517   // commute if it is possible. We only want to commute here if it improves
3518   // legality. This can be called a fairly large number of times so don't waste
3519   // compile time pointlessly swapping and checking legality again.
3520   if (HasImplicitSGPR || !MI.isCommutable()) {
3521     legalizeOpWithMove(MI, Src1Idx);
3522     return;
3523   }
3524 
3525   int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
3526   MachineOperand &Src0 = MI.getOperand(Src0Idx);
3527 
3528   // If src0 can be used as src1, commuting will make the operands legal.
3529   // Otherwise we have to give up and insert a move.
3530   //
3531   // TODO: Other immediate-like operand kinds could be commuted if there was a
3532   // MachineOperand::ChangeTo* for them.
3533   if ((!Src1.isImm() && !Src1.isReg()) ||
3534       !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) {
3535     legalizeOpWithMove(MI, Src1Idx);
3536     return;
3537   }
3538 
3539   int CommutedOpc = commuteOpcode(MI);
3540   if (CommutedOpc == -1) {
3541     legalizeOpWithMove(MI, Src1Idx);
3542     return;
3543   }
3544 
3545   MI.setDesc(get(CommutedOpc));
3546 
3547   unsigned Src0Reg = Src0.getReg();
3548   unsigned Src0SubReg = Src0.getSubReg();
3549   bool Src0Kill = Src0.isKill();
3550 
3551   if (Src1.isImm())
3552     Src0.ChangeToImmediate(Src1.getImm());
3553   else if (Src1.isReg()) {
3554     Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
3555     Src0.setSubReg(Src1.getSubReg());
3556   } else
3557     llvm_unreachable("Should only have register or immediate operands");
3558 
3559   Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
3560   Src1.setSubReg(Src0SubReg);
3561 }
3562 
3563 // Legalize VOP3 operands. Because all operand types are supported for any
3564 // operand, and since literal constants are not allowed and should never be
3565 // seen, we only need to worry about inserting copies if we use multiple SGPR
3566 // operands.
3567 void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
3568                                        MachineInstr &MI) const {
3569   unsigned Opc = MI.getOpcode();
3570 
3571   int VOP3Idx[3] = {
3572     AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
3573     AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
3574     AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
3575   };
3576 
3577   // Find the one SGPR operand we are allowed to use.
3578   unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx);
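       // Informal example: if src0 and src2 are two different SGPRs and src1 is
       // a VGPR, the SGPR picked by findUsedSGPR may stay; the loop below
       // rewrites any other SGPR operand with legalizeOpWithMove, i.e. it is
       // copied into a fresh VGPR so the single-SGPR limit holds.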
3579 
3580   for (unsigned i = 0; i < 3; ++i) {
3581     int Idx = VOP3Idx[i];
3582     if (Idx == -1)
3583       break;
3584     MachineOperand &MO = MI.getOperand(Idx);
3585 
3586     // We should never see a VOP3 instruction with an illegal immediate operand.
3587     if (!MO.isReg())
3588       continue;
3589 
3590     if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
3591       continue; // VGPRs are legal
3592 
3593     if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) {
3594       SGPRReg = MO.getReg();
3595       // We can use one SGPR in each VOP3 instruction.
3596       continue;
3597     }
3598 
3599     // If we make it this far, then the operand is not legal and we must
3600     // legalize it.
3601     legalizeOpWithMove(MI, Idx);
3602   }
3603 }
3604 
3605 unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI,
3606                                          MachineRegisterInfo &MRI) const {
3607   const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
3608   const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
3609   unsigned DstReg = MRI.createVirtualRegister(SRC);
3610   unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
3611 
3612   if (SubRegs == 1) {
3613     BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
3614             get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
3615         .addReg(SrcReg);
3616     return DstReg;
3617   }
3618 
3619   SmallVector<unsigned, 8> SRegs;
3620   for (unsigned i = 0; i < SubRegs; ++i) {
3621     unsigned SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3622     BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
3623             get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
3624         .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
3625     SRegs.push_back(SGPR);
3626   }
3627 
3628   MachineInstrBuilder MIB =
3629       BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
3630               get(AMDGPU::REG_SEQUENCE), DstReg);
3631   for (unsigned i = 0; i < SubRegs; ++i) {
3632     MIB.addReg(SRegs[i]);
3633     MIB.addImm(RI.getSubRegFromChannel(i));
3634   }
3635   return DstReg;
3636 }
3637 
3638 void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
3639                                        MachineInstr &MI) const {
3640 
3641   // If the pointer is stored in VGPRs, then we need to move it to
3642   // SGPRs using v_readfirstlane.  This is safe because we only select
3643   // loads with uniform pointers to SMRD instructions, so we know the
3644   // pointer value is uniform.
3645   MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
3646   if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
3647     unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
3648     SBase->setReg(SGPR);
3649   }
3650   MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soff);
3651   if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) {
3652     unsigned SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
3653     SOff->setReg(SGPR);
3654   }
3655 }
3656 
3657 void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
3658                                          MachineBasicBlock::iterator I,
3659                                          const TargetRegisterClass *DstRC,
3660                                          MachineOperand &Op,
3661                                          MachineRegisterInfo &MRI,
3662                                          const DebugLoc &DL) const {
3663   unsigned OpReg = Op.getReg();
3664   unsigned OpSubReg = Op.getSubReg();
3665 
3666   const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
3667       RI.getRegClassForReg(MRI, OpReg), OpSubReg);
3668 
3669   // Check if operand is already the correct register class.
3670   if (DstRC == OpRC)
3671     return;
3672 
3673   unsigned DstReg = MRI.createVirtualRegister(DstRC);
3674   MachineInstr *Copy =
3675       BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op);
3676 
3677   Op.setReg(DstReg);
3678   Op.setSubReg(0);
3679 
3680   MachineInstr *Def = MRI.getVRegDef(OpReg);
3681   if (!Def)
3682     return;
3683 
3684   // Try to eliminate the copy if it is copying an immediate value.
3685   if (Def->isMoveImmediate())
3686     FoldImmediate(*Copy, *Def, OpReg, &MRI);
3687 }
3688 
3689 // Emit the actual waterfall loop, executing the wrapped instruction for each
3690 // unique value of \p Rsrc across all lanes. In the best case we execute 1
3691 // iteration, in the worst case we execute 64 (once per lane).
3692 static void
3693 emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI,
3694                           MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
3695                           const DebugLoc &DL, MachineOperand &Rsrc) {
3696   MachineBasicBlock::iterator I = LoopBB.begin();
3697 
3698   unsigned VRsrc = Rsrc.getReg();
3699   unsigned VRsrcUndef = getUndefRegState(Rsrc.isUndef());
3700 
3701   unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3702   unsigned CondReg0 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3703   unsigned CondReg1 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3704   unsigned AndCond = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3705   unsigned SRsrcSub0 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3706   unsigned SRsrcSub1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3707   unsigned SRsrcSub2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3708   unsigned SRsrcSub3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3709   unsigned SRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
3710 
3711   // Beginning of the loop, read the next Rsrc variant.
3712   BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub0)
3713       .addReg(VRsrc, VRsrcUndef, AMDGPU::sub0);
3714   BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub1)
3715       .addReg(VRsrc, VRsrcUndef, AMDGPU::sub1);
3716   BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub2)
3717       .addReg(VRsrc, VRsrcUndef, AMDGPU::sub2);
3718   BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub3)
3719       .addReg(VRsrc, VRsrcUndef, AMDGPU::sub3);
3720 
3721   BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SRsrc)
3722       .addReg(SRsrcSub0)
3723       .addImm(AMDGPU::sub0)
3724       .addReg(SRsrcSub1)
3725       .addImm(AMDGPU::sub1)
3726       .addReg(SRsrcSub2)
3727       .addImm(AMDGPU::sub2)
3728       .addReg(SRsrcSub3)
3729       .addImm(AMDGPU::sub3);
3730 
3731   // Update Rsrc operand to use the SGPR Rsrc.
3732   Rsrc.setReg(SRsrc);
3733   Rsrc.setIsKill(true);
3734 
3735   // Identify all lanes with identical Rsrc operands in their VGPRs.
3736   BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), CondReg0)
3737       .addReg(SRsrc, 0, AMDGPU::sub0_sub1)
3738       .addReg(VRsrc, 0, AMDGPU::sub0_sub1);
3739   BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), CondReg1)
3740       .addReg(SRsrc, 0, AMDGPU::sub2_sub3)
3741       .addReg(VRsrc, 0, AMDGPU::sub2_sub3);
3742   BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_AND_B64), AndCond)
3743       .addReg(CondReg0)
3744       .addReg(CondReg1);
3745 
3746   MRI.setSimpleHint(SaveExec, AndCond);
3747 
3748   // Update EXEC to matching lanes, saving original to SaveExec.
3749   BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_AND_SAVEEXEC_B64), SaveExec)
3750       .addReg(AndCond, RegState::Kill);
3751 
3752   // The original instruction is here; we insert the terminators after it.
3753   I = LoopBB.end();
3754 
3755   // Update EXEC, switch all done bits to 0 and all todo bits to 1.
3756   BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_XOR_B64_term), AMDGPU::EXEC)
3757       .addReg(AMDGPU::EXEC)
3758       .addReg(SaveExec);
3759   BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(&LoopBB);
3760 }
3761 
3762 // Build a waterfall loop around \p MI, replacing the VGPR \p Rsrc register
3763 // with SGPRs by iterating over all unique values across all lanes.
3764 static void loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
3765                               MachineOperand &Rsrc, MachineDominatorTree *MDT) {
3766   MachineBasicBlock &MBB = *MI.getParent();
3767   MachineFunction &MF = *MBB.getParent();
3768   MachineRegisterInfo &MRI = MF.getRegInfo();
3769   MachineBasicBlock::iterator I(&MI);
3770   const DebugLoc &DL = MI.getDebugLoc();
3771 
3772   unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
3773 
3774   // Save the EXEC mask
3775   BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B64), SaveExec)
3776       .addReg(AMDGPU::EXEC);
3777 
3778   // Killed uses in the instruction we are waterfalling around will be
3779   // incorrect due to the added control-flow.
3780   for (auto &MO : MI.uses()) {
3781     if (MO.isReg() && MO.isUse()) {
3782       MRI.clearKillFlags(MO.getReg());
3783     }
3784   }
3785 
3786   // To insert the loop we need to split the block. Move everything after this
3787   // point to a new block, and insert a new empty block between the two.
3788   MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
3789   MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
3790   MachineFunction::iterator MBBI(MBB);
3791   ++MBBI;
3792 
3793   MF.insert(MBBI, LoopBB);
3794   MF.insert(MBBI, RemainderBB);
3795 
3796   LoopBB->addSuccessor(LoopBB);
3797   LoopBB->addSuccessor(RemainderBB);
3798 
3799   // Move MI to the LoopBB, and the remainder of the block to RemainderBB.
3800   MachineBasicBlock::iterator J = I++;
3801   RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
3802   RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
3803   LoopBB->splice(LoopBB->begin(), &MBB, J);
3804 
3805   MBB.addSuccessor(LoopBB);
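       // Rough sketch of the control flow being built here:
       //
       //   MBB --> LoopBB --> RemainderBB
       //            ^   |
       //            +---+  (LoopBB branches back to itself until all lanes
       //                    have been handled)
       //
       // MI itself executes inside LoopBB under the restricted EXEC mask.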
3806 
3807   // Update dominators. We know that MBB immediately dominates LoopBB, that
3808   // LoopBB immediately dominates RemainderBB, and that RemainderBB immediately
3809   // dominates all of the successors transferred to it from MBB that MBB used
3810   // to dominate.
3811   if (MDT) {
3812     MDT->addNewBlock(LoopBB, &MBB);
3813     MDT->addNewBlock(RemainderBB, LoopBB);
3814     for (auto &Succ : RemainderBB->successors()) {
3815       if (MDT->dominates(&MBB, Succ)) {
3816         MDT->changeImmediateDominator(Succ, RemainderBB);
3817       }
3818     }
3819   }
3820 
3821   emitLoadSRsrcFromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, Rsrc);
3822 
3823   // Restore the EXEC mask
3824   MachineBasicBlock::iterator First = RemainderBB->begin();
3825   BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
3826       .addReg(SaveExec);
3827 }
3828 
3829 // Extract pointer from Rsrc and return a zero-value Rsrc replacement.
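     // Roughly, for an incoming 128-bit VGPR descriptor the result is:
     //   RsrcPtr  = Rsrc.sub0_sub1  (the 64-bit base pointer, still in VGPRs)
     //   NewSRsrc = { 0 (sub0_sub1), RSRC_DATA_FORMAT lo (sub2), hi (sub3) }
     // so callers can fold the pointer into vaddr and use the zero-based SRsrc.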
3830 static std::tuple<unsigned, unsigned>
3831 extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) {
3832   MachineBasicBlock &MBB = *MI.getParent();
3833   MachineFunction &MF = *MBB.getParent();
3834   MachineRegisterInfo &MRI = MF.getRegInfo();
3835 
3836   // Extract the ptr from the resource descriptor.
3837   unsigned RsrcPtr =
3838       TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
3839                              AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
3840 
3841   // Create an empty resource descriptor
3842   unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3843   unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3844   unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3845   unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
3846   uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
3847 
3848   // Zero64 = 0
3849   BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
3850       .addImm(0);
3851 
3852   // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
3853   BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
3854       .addImm(RsrcDataFormat & 0xFFFFFFFF);
3855 
3856   // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
3857   BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
3858       .addImm(RsrcDataFormat >> 32);
3859 
3860   // NewSRsrc = {Zero64, SRsrcFormat}
3861   BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
3862       .addReg(Zero64)
3863       .addImm(AMDGPU::sub0_sub1)
3864       .addReg(SRsrcFormatLo)
3865       .addImm(AMDGPU::sub2)
3866       .addReg(SRsrcFormatHi)
3867       .addImm(AMDGPU::sub3);
3868 
3869   return std::make_tuple(RsrcPtr, NewSRsrc);
3870 }
3871 
3872 void SIInstrInfo::legalizeOperands(MachineInstr &MI,
3873                                    MachineDominatorTree *MDT) const {
3874   MachineFunction &MF = *MI.getParent()->getParent();
3875   MachineRegisterInfo &MRI = MF.getRegInfo();
3876 
3877   // Legalize VOP2
3878   if (isVOP2(MI) || isVOPC(MI)) {
3879     legalizeOperandsVOP2(MRI, MI);
3880     return;
3881   }
3882 
3883   // Legalize VOP3
3884   if (isVOP3(MI)) {
3885     legalizeOperandsVOP3(MRI, MI);
3886     return;
3887   }
3888 
3889   // Legalize SMRD
3890   if (isSMRD(MI)) {
3891     legalizeOperandsSMRD(MRI, MI);
3892     return;
3893   }
3894 
3895   // Legalize REG_SEQUENCE and PHI
3896   // The register class of the operands must be the same type as the register
3897   // class of the output.
3898   if (MI.getOpcode() == AMDGPU::PHI) {
3899     const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
3900     for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
3901       if (!MI.getOperand(i).isReg() ||
3902           !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg()))
3903         continue;
3904       const TargetRegisterClass *OpRC =
3905           MRI.getRegClass(MI.getOperand(i).getReg());
3906       if (RI.hasVGPRs(OpRC)) {
3907         VRC = OpRC;
3908       } else {
3909         SRC = OpRC;
3910       }
3911     }
3912 
3913     // If any of the operands are VGPR registers, then they all must be
3914     // VGPRs, otherwise we will create illegal VGPR->SGPR copies when
3915     // legalizing them.
3916     if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
3917       if (!VRC) {
3918         assert(SRC);
3919         VRC = RI.getEquivalentVGPRClass(SRC);
3920       }
3921       RC = VRC;
3922     } else {
3923       RC = SRC;
3924     }
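         // E.g. for a PHI joining one SGPR_32 input and one VGPR_32 input, RC
         // ends up as the VGPR class, and the loop below inserts a copy of the
         // SGPR input into a VGPR in its predecessor block, avoiding an illegal
         // VGPR->SGPR copy later on.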
3925 
3926     // Update all the operands so they have the same type.
3927     for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3928       MachineOperand &Op = MI.getOperand(I);
3929       if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
3930         continue;
3931 
3932       // MI is a PHI instruction.
3933       MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
3934       MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
3935 
3936       // Avoid creating no-op copies with the same src and dst reg class.  These
3937       // confuse some of the machine passes.
3938       legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
3939     }
3940   }
3941 
3942   // REG_SEQUENCE doesn't really require operand legalization, but if one has a
3943   // VGPR dest type and SGPR sources, insert copies so all operands are
3944   // VGPRs. This seems to help operand folding / the register coalescer.
3945   if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
3946     MachineBasicBlock *MBB = MI.getParent();
3947     const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
3948     if (RI.hasVGPRs(DstRC)) {
3949       // Update all the operands so they are VGPR register classes. These may
3950       // not be the same register class because REG_SEQUENCE supports mixing
3951       // subregister index types e.g. sub0_sub1 + sub2 + sub3
3952       for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3953         MachineOperand &Op = MI.getOperand(I);
3954         if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
3955           continue;
3956 
3957         const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
3958         const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
3959         if (VRC == OpRC)
3960           continue;
3961 
3962         legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
3963         Op.setIsKill();
3964       }
3965     }
3966 
3967     return;
3968   }
3969 
3970   // Legalize INSERT_SUBREG
3971   // src0 must have the same register class as dst
3972   if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
3973     unsigned Dst = MI.getOperand(0).getReg();
3974     unsigned Src0 = MI.getOperand(1).getReg();
3975     const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
3976     const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
3977     if (DstRC != Src0RC) {
3978       MachineBasicBlock *MBB = MI.getParent();
3979       MachineOperand &Op = MI.getOperand(1);
3980       legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
3981     }
3982     return;
3983   }
3984 
3985   // Legalize SI_INIT_M0
3986   if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
3987     MachineOperand &Src = MI.getOperand(0);
3988     if (Src.isReg() && RI.hasVGPRs(MRI.getRegClass(Src.getReg())))
3989       Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
3990     return;
3991   }
3992 
3993   // Legalize MIMG and MUBUF/MTBUF for shaders.
3994   //
3995   // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
3996   // scratch memory access. In both cases, the legalization never involves
3997   // conversion to the addr64 form.
3998   if (isMIMG(MI) ||
3999       (AMDGPU::isShader(MF.getFunction().getCallingConv()) &&
4000        (isMUBUF(MI) || isMTBUF(MI)))) {
4001     MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc);
4002     if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) {
4003       unsigned SGPR = readlaneVGPRToSGPR(SRsrc->getReg(), MI, MRI);
4004       SRsrc->setReg(SGPR);
4005     }
4006 
4007     MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp);
4008     if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) {
4009       unsigned SGPR = readlaneVGPRToSGPR(SSamp->getReg(), MI, MRI);
4010       SSamp->setReg(SGPR);
4011     }
4012     return;
4013   }
4014 
4015   // Legalize MUBUF* instructions.
4016   int RsrcIdx =
4017       AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
4018   if (RsrcIdx != -1) {
4019     // We have an MUBUF instruction
4020     MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
4021     unsigned RsrcRC = get(MI.getOpcode()).OpInfo[RsrcIdx].RegClass;
4022     if (RI.getCommonSubClass(MRI.getRegClass(Rsrc->getReg()),
4023                              RI.getRegClass(RsrcRC))) {
4024       // The operands are legal.
4025       // FIXME: We may need to legalize operands besides srsrc.
4026       return;
4027     }
4028 
4029     // Legalize a VGPR Rsrc.
4030     //
4031     // If the instruction is _ADDR64, we can avoid a waterfall by extracting
4032     // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
4033     // a zero-value SRsrc.
4034     //
4035     // If the instruction is _OFFSET (both idxen and offen disabled), and we
4036     // support ADDR64 instructions, we can convert to ADDR64 and do the same as
4037     // above.
4038     //
4039     // Otherwise we are on non-ADDR64 hardware, and/or we have
4040     // idxen/offen/bothen and we fall back to a waterfall loop.
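         // As a rough illustration of the ADDR64 path below: the 64-bit base
         // pointer taken from the VGPR descriptor is added to vaddr (low half
         // plus carry into the high half), and srsrc is replaced by the
         // all-zero descriptor from extractRsrcPtr, so the effective address is
         // unchanged while srsrc becomes a legal SGPR operand.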
4041 
4042     MachineBasicBlock &MBB = *MI.getParent();
4043 
4044     MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
4045     if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
4046       // This is already an ADDR64 instruction so we need to add the pointer
4047       // extracted from the resource descriptor to the current value of VAddr.
4048       unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4049       unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4050       unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
4051 
4052       unsigned RsrcPtr, NewSRsrc;
4053       std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
4054 
4055       // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
4056       DebugLoc DL = MI.getDebugLoc();
4057       BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo)
4058           .addReg(RsrcPtr, 0, AMDGPU::sub0)
4059           .addReg(VAddr->getReg(), 0, AMDGPU::sub0);
4060 
4061       // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
4062       BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi)
4063           .addReg(RsrcPtr, 0, AMDGPU::sub1)
4064           .addReg(VAddr->getReg(), 0, AMDGPU::sub1);
4065 
4066       // NewVaddr = {NewVaddrHi, NewVaddrLo}
4067       BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
4068           .addReg(NewVAddrLo)
4069           .addImm(AMDGPU::sub0)
4070           .addReg(NewVAddrHi)
4071           .addImm(AMDGPU::sub1);
4072 
4073       VAddr->setReg(NewVAddr);
4074       Rsrc->setReg(NewSRsrc);
4075     } else if (!VAddr && ST.hasAddr64()) {
4076       // This instruction is the _OFFSET variant, so we need to convert it to
4077       // ADDR64.
4078       assert(MBB.getParent()->getSubtarget<GCNSubtarget>().getGeneration()
4079              < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
4080              "FIXME: Need to emit flat atomics here");
4081 
4082       unsigned RsrcPtr, NewSRsrc;
4083       std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
4084 
4085       unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
4086       MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
4087       MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
4088       MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
4089       unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
4090 
4091       // Atomics with return have an additional tied operand and are
4092       // missing some of the special bits.
4093       MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
4094       MachineInstr *Addr64;
4095 
4096       if (!VDataIn) {
4097         // Regular buffer load / store.
4098         MachineInstrBuilder MIB =
4099             BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
4100                 .add(*VData)
4101                 .addReg(NewVAddr)
4102                 .addReg(NewSRsrc)
4103                 .add(*SOffset)
4104                 .add(*Offset);
4105 
4106         // Atomics do not have this operand.
4107         if (const MachineOperand *GLC =
4108                 getNamedOperand(MI, AMDGPU::OpName::glc)) {
4109           MIB.addImm(GLC->getImm());
4110         }
4111 
4112         MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc));
4113 
4114         if (const MachineOperand *TFE =
4115                 getNamedOperand(MI, AMDGPU::OpName::tfe)) {
4116           MIB.addImm(TFE->getImm());
4117         }
4118 
4119         MIB.cloneMemRefs(MI);
4120         Addr64 = MIB;
4121       } else {
4122         // Atomics with return.
4123         Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
4124                      .add(*VData)
4125                      .add(*VDataIn)
4126                      .addReg(NewVAddr)
4127                      .addReg(NewSRsrc)
4128                      .add(*SOffset)
4129                      .add(*Offset)
4130                      .addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc))
4131                      .cloneMemRefs(MI);
4132       }
4133 
4134       MI.removeFromParent();
4135 
4136       // NewVaddr = the 64-bit base pointer extracted from the descriptor
4137       BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
4138               NewVAddr)
4139           .addReg(RsrcPtr, 0, AMDGPU::sub0)
4140           .addImm(AMDGPU::sub0)
4141           .addReg(RsrcPtr, 0, AMDGPU::sub1)
4142           .addImm(AMDGPU::sub1);
4143     } else {
4144       // This is another variant; legalize Rsrc with waterfall loop from VGPRs
4145       // to SGPRs.
4146       loadSRsrcFromVGPR(*this, MI, *Rsrc, MDT);
4147     }
4148   }
4149 }
4150 
4151 void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
4152                              MachineDominatorTree *MDT) const {
4153   SetVectorType Worklist;
4154   Worklist.insert(&TopInst);
4155 
4156   while (!Worklist.empty()) {
4157     MachineInstr &Inst = *Worklist.pop_back_val();
4158     MachineBasicBlock *MBB = Inst.getParent();
4159     MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
4160 
4161     unsigned Opcode = Inst.getOpcode();
4162     unsigned NewOpcode = getVALUOp(Inst);
4163 
4164     // Handle some special cases
4165     switch (Opcode) {
4166     default:
4167       break;
4168     case AMDGPU::S_ADD_U64_PSEUDO:
4169     case AMDGPU::S_SUB_U64_PSEUDO:
4170       splitScalar64BitAddSub(Worklist, Inst, MDT);
4171       Inst.eraseFromParent();
4172       continue;
4173     case AMDGPU::S_ADD_I32:
4174     case AMDGPU::S_SUB_I32:
4175       // FIXME: The u32 versions currently selected use the carry.
4176       if (moveScalarAddSub(Worklist, Inst, MDT))
4177         continue;
4178 
4179       // Default handling
4180       break;
4181     case AMDGPU::S_AND_B64:
4182       splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
4183       Inst.eraseFromParent();
4184       continue;
4185 
4186     case AMDGPU::S_OR_B64:
4187       splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
4188       Inst.eraseFromParent();
4189       continue;
4190 
4191     case AMDGPU::S_XOR_B64:
4192       splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
4193       Inst.eraseFromParent();
4194       continue;
4195 
4196     case AMDGPU::S_NAND_B64:
4197       splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
4198       Inst.eraseFromParent();
4199       continue;
4200 
4201     case AMDGPU::S_NOR_B64:
4202       splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
4203       Inst.eraseFromParent();
4204       continue;
4205 
4206     case AMDGPU::S_XNOR_B64:
4207       if (ST.hasDLInsts())
4208         splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
4209       else
4210         splitScalar64BitXnor(Worklist, Inst, MDT);
4211       Inst.eraseFromParent();
4212       continue;
4213 
4214     case AMDGPU::S_ANDN2_B64:
4215       splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
4216       Inst.eraseFromParent();
4217       continue;
4218 
4219     case AMDGPU::S_ORN2_B64:
4220       splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
4221       Inst.eraseFromParent();
4222       continue;
4223 
4224     case AMDGPU::S_NOT_B64:
4225       splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
4226       Inst.eraseFromParent();
4227       continue;
4228 
4229     case AMDGPU::S_BCNT1_I32_B64:
4230       splitScalar64BitBCNT(Worklist, Inst);
4231       Inst.eraseFromParent();
4232       continue;
4233 
4234     case AMDGPU::S_BFE_I64:
4235       splitScalar64BitBFE(Worklist, Inst);
4236       Inst.eraseFromParent();
4237       continue;
4238 
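         // Note on the shift cases below: on VI and later the VALU only has the
         // reversed forms (e.g. V_LSHLREV takes the shift amount as src0), so
         // after switching opcodes the two source operands are swapped to keep
         // the original semantics.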
4239     case AMDGPU::S_LSHL_B32:
4240       if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
4241         NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
4242         swapOperands(Inst);
4243       }
4244       break;
4245     case AMDGPU::S_ASHR_I32:
4246       if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
4247         NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
4248         swapOperands(Inst);
4249       }
4250       break;
4251     case AMDGPU::S_LSHR_B32:
4252       if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
4253         NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
4254         swapOperands(Inst);
4255       }
4256       break;
4257     case AMDGPU::S_LSHL_B64:
4258       if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
4259         NewOpcode = AMDGPU::V_LSHLREV_B64;
4260         swapOperands(Inst);
4261       }
4262       break;
4263     case AMDGPU::S_ASHR_I64:
4264       if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
4265         NewOpcode = AMDGPU::V_ASHRREV_I64;
4266         swapOperands(Inst);
4267       }
4268       break;
4269     case AMDGPU::S_LSHR_B64:
4270       if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
4271         NewOpcode = AMDGPU::V_LSHRREV_B64;
4272         swapOperands(Inst);
4273       }
4274       break;
4275 
4276     case AMDGPU::S_ABS_I32:
4277       lowerScalarAbs(Worklist, Inst);
4278       Inst.eraseFromParent();
4279       continue;
4280 
4281     case AMDGPU::S_CBRANCH_SCC0:
4282     case AMDGPU::S_CBRANCH_SCC1:
4283       // Clear unused bits of vcc
4284       BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64),
4285               AMDGPU::VCC)
4286           .addReg(AMDGPU::EXEC)
4287           .addReg(AMDGPU::VCC);
4288       break;
4289 
4290     case AMDGPU::S_BFE_U64:
4291     case AMDGPU::S_BFM_B64:
4292       llvm_unreachable("Moving this op to VALU not implemented");
4293 
4294     case AMDGPU::S_PACK_LL_B32_B16:
4295     case AMDGPU::S_PACK_LH_B32_B16:
4296     case AMDGPU::S_PACK_HH_B32_B16:
4297       movePackToVALU(Worklist, MRI, Inst);
4298       Inst.eraseFromParent();
4299       continue;
4300 
4301     case AMDGPU::S_XNOR_B32:
4302       lowerScalarXnor(Worklist, Inst);
4303       Inst.eraseFromParent();
4304       continue;
4305 
4306     case AMDGPU::S_NAND_B32:
4307       splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
4308       Inst.eraseFromParent();
4309       continue;
4310 
4311     case AMDGPU::S_NOR_B32:
4312       splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
4313       Inst.eraseFromParent();
4314       continue;
4315 
4316     case AMDGPU::S_ANDN2_B32:
4317       splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
4318       Inst.eraseFromParent();
4319       continue;
4320 
4321     case AMDGPU::S_ORN2_B32:
4322       splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
4323       Inst.eraseFromParent();
4324       continue;
4325     }
4326 
4327     if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
4328       // We cannot move this instruction to the VALU, so we should try to
4329       // legalize its operands instead.
4330       legalizeOperands(Inst, MDT);
4331       continue;
4332     }
4333 
4334     // Use the new VALU Opcode.
4335     const MCInstrDesc &NewDesc = get(NewOpcode);
4336     Inst.setDesc(NewDesc);
4337 
4338     // Remove any references to SCC. Vector instructions can't read from it,
4339     // and we're just about to add the implicit use / defs of VCC; we don't
4340     // want both.
4341     for (unsigned i = Inst.getNumOperands() - 1; i > 0; --i) {
4342       MachineOperand &Op = Inst.getOperand(i);
4343       if (Op.isReg() && Op.getReg() == AMDGPU::SCC) {
4344         Inst.RemoveOperand(i);
4345         addSCCDefUsersToVALUWorklist(Inst, Worklist);
4346       }
4347     }
4348 
4349     if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
4350       // We are converting these to a BFE, so we need to add the missing
4351       // operands for the size and offset.
4352       unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
4353       Inst.addOperand(MachineOperand::CreateImm(0));
4354       Inst.addOperand(MachineOperand::CreateImm(Size));
4355 
4356     } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
4357       // The VALU version adds the second operand to the result, so insert an
4358       // extra 0 operand.
4359       Inst.addOperand(MachineOperand::CreateImm(0));
4360     }
4361 
4362     Inst.addImplicitDefUseOperands(*Inst.getParent()->getParent());
4363 
4364     if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
4365       const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
4366       // If we need to move this to VGPRs, we need to unpack the second operand
4367       // back into the 2 separate ones for bit offset and width.
4368       assert(OffsetWidthOp.isImm() &&
4369              "Scalar BFE is only implemented for constant width and offset");
4370       uint32_t Imm = OffsetWidthOp.getImm();
4371 
4372       uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
4373       uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
4374       Inst.RemoveOperand(2);                     // Remove old immediate.
4375       Inst.addOperand(MachineOperand::CreateImm(Offset));
4376       Inst.addOperand(MachineOperand::CreateImm(BitWidth));
4377     }
4378 
4379     bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef();
4380     unsigned NewDstReg = AMDGPU::NoRegister;
4381     if (HasDst) {
4382       unsigned DstReg = Inst.getOperand(0).getReg();
4383       if (TargetRegisterInfo::isPhysicalRegister(DstReg))
4384         continue;
4385 
4386       // Update the destination register class.
4387       const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
4388       if (!NewDstRC)
4389         continue;
4390 
4391       if (Inst.isCopy() &&
4392           TargetRegisterInfo::isVirtualRegister(Inst.getOperand(1).getReg()) &&
4393           NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
4394         // Instead of creating a copy where src and dst are the same register
4395         // class, we just replace all uses of dst with src.  These kinds of
4396         // copies interfere with the heuristics MachineSink uses to decide
4397         // whether or not to split a critical edge, since the pass assumes
4398         // that copies will end up as machine instructions and not be
4399         // eliminated.
4400         addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
4401         MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
4402         MRI.clearKillFlags(Inst.getOperand(1).getReg());
4403         Inst.getOperand(0).setReg(DstReg);
4404 
4405         // Make sure we don't leave around a dead VGPR->SGPR copy. Normally
4406         // these are deleted later, but at -O0 it would leave a
4407         // suspicious-looking illegal copy of an undef register.
4408         for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I)
4409           Inst.RemoveOperand(I);
4410         Inst.setDesc(get(AMDGPU::IMPLICIT_DEF));
4411         continue;
4412       }
4413 
4414       NewDstReg = MRI.createVirtualRegister(NewDstRC);
4415       MRI.replaceRegWith(DstReg, NewDstReg);
4416     }
4417 
4418     // Legalize the operands
4419     legalizeOperands(Inst, MDT);
4420 
4421     if (HasDst)
4422       addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
4423   }
4424 }
4425 
4426 // Add/sub require special handling to deal with carry outs.
4427 bool SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
4428                                    MachineDominatorTree *MDT) const {
4429   if (ST.hasAddNoCarry()) {
4430     // Assume there is no user of scc since we don't select this in that case.
4431     // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
4432     // is used.
4433 
4434     MachineBasicBlock &MBB = *Inst.getParent();
4435     MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4436 
4437     unsigned OldDstReg = Inst.getOperand(0).getReg();
4438     unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4439 
4440     unsigned Opc = Inst.getOpcode();
4441     assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
4442 
4443     unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
4444       AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
4445 
4446     assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
4447     Inst.RemoveOperand(3);
4448 
4449     Inst.setDesc(get(NewOpc));
4450     Inst.addImplicitDefUseOperands(*MBB.getParent());
4451     MRI.replaceRegWith(OldDstReg, ResultReg);
4452     legalizeOperands(Inst, MDT);
4453 
4454     addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
4455     return true;
4456   }
4457 
4458   return false;
4459 }
4460 
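// Lower S_ABS_I32 to the VALU as max(x, 0 - x): negate the source into a
// temporary VGPR and take the signed maximum of the source and that result.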
4461 void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist,
4462                                  MachineInstr &Inst) const {
4463   MachineBasicBlock &MBB = *Inst.getParent();
4464   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4465   MachineBasicBlock::iterator MII = Inst;
4466   DebugLoc DL = Inst.getDebugLoc();
4467 
4468   MachineOperand &Dest = Inst.getOperand(0);
4469   MachineOperand &Src = Inst.getOperand(1);
4470   unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4471   unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4472 
4473   unsigned SubOp = ST.hasAddNoCarry() ?
4474     AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_I32_e32;
4475 
4476   BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
4477     .addImm(0)
4478     .addReg(Src.getReg());
4479 
4480   BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
4481     .addReg(Src.getReg())
4482     .addReg(TmpReg);
4483 
4484   MRI.replaceRegWith(Dest.getReg(), ResultReg);
4485   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
4486 }
4487 
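// Lower S_XNOR_B32 when it cannot stay on the SALU. Subtargets with V_XNOR
// get the VALU instruction directly; otherwise the operation is rewritten as
// a scalar NOT feeding a scalar XOR, and later worklist iterations move the
// pieces to the VALU as needed.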
4488 void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist,
4489                                   MachineInstr &Inst) const {
4490   MachineBasicBlock &MBB = *Inst.getParent();
4491   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4492   MachineBasicBlock::iterator MII = Inst;
4493   const DebugLoc &DL = Inst.getDebugLoc();
4494 
4495   MachineOperand &Dest = Inst.getOperand(0);
4496   MachineOperand &Src0 = Inst.getOperand(1);
4497   MachineOperand &Src1 = Inst.getOperand(2);
4498 
4499   if (ST.hasDLInsts()) {
4500     unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4501     legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
4502     legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
4503 
4504     BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
4505       .add(Src0)
4506       .add(Src1);
4507 
4508     MRI.replaceRegWith(Dest.getReg(), NewDest);
4509     addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
4510   } else {
4511     // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
4512     // invert either source and then perform the XOR. If either source is a
4513     // scalar register, then we can leave the inversion on the scalar unit to
4514     // achieve a better distribution of scalar and vector instructions.
4515     bool Src0IsSGPR = Src0.isReg() &&
4516                       RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
4517     bool Src1IsSGPR = Src1.isReg() &&
4518                       RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
4519     MachineInstr *Not = nullptr;
4520     MachineInstr *Xor = nullptr;
4521     unsigned Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4522     unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4523 
4524     // Build a pair of scalar instructions and add them to the work list.
4525     // The next iteration over the work list will lower these to the vector
4526     // unit as necessary.
4527     if (Src0IsSGPR) {
4528       Not = BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp)
4529         .add(Src0);
4530       Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
4531         .addReg(Temp)
4532         .add(Src1);
4533     } else if (Src1IsSGPR) {
4534       Not = BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp)
4535         .add(Src1);
4536       Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
4537         .add(Src0)
4538         .addReg(Temp);
4539     } else {
4540       Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
4541         .add(Src0)
4542         .add(Src1);
4543       Not = BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
4544         .addReg(Temp);
4545       Worklist.insert(Not);
4546     }
4547 
4548     MRI.replaceRegWith(Dest.getReg(), NewDest);
4549 
4550     Worklist.insert(Xor);
4551 
4552     addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
4553   }
4554 }
4555 
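// Lower S_NAND_B32 / S_NOR_B32 by emitting the underlying S_AND_B32 or
// S_OR_B32 followed by an S_NOT_B32, queuing both new instructions for
// further lowering to the VALU.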
4556 void SIInstrInfo::splitScalarNotBinop(SetVectorType &Worklist,
4557                                       MachineInstr &Inst,
4558                                       unsigned Opcode) const {
4559   MachineBasicBlock &MBB = *Inst.getParent();
4560   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4561   MachineBasicBlock::iterator MII = Inst;
4562   const DebugLoc &DL = Inst.getDebugLoc();
4563 
4564   MachineOperand &Dest = Inst.getOperand(0);
4565   MachineOperand &Src0 = Inst.getOperand(1);
4566   MachineOperand &Src1 = Inst.getOperand(2);
4567 
4568   unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4569   unsigned Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4570 
4571   MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
4572     .add(Src0)
4573     .add(Src1);
4574 
4575   MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
4576     .addReg(Interm);
4577 
4578   Worklist.insert(&Op);
4579   Worklist.insert(&Not);
4580 
4581   MRI.replaceRegWith(Dest.getReg(), NewDest);
4582   addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
4583 }
4584 
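// Lower S_ANDN2_B32 / S_ORN2_B32 by inverting src1 with S_NOT_B32 and then
// applying the plain S_AND_B32 or S_OR_B32, queuing both new instructions for
// further lowering to the VALU.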
4585 void SIInstrInfo::splitScalarBinOpN2(SetVectorType& Worklist,
4586                                      MachineInstr &Inst,
4587                                      unsigned Opcode) const {
4588   MachineBasicBlock &MBB = *Inst.getParent();
4589   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4590   MachineBasicBlock::iterator MII = Inst;
4591   const DebugLoc &DL = Inst.getDebugLoc();
4592 
4593   MachineOperand &Dest = Inst.getOperand(0);
4594   MachineOperand &Src0 = Inst.getOperand(1);
4595   MachineOperand &Src1 = Inst.getOperand(2);
4596 
4597   unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4598   unsigned Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4599 
4600   MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
4601     .add(Src1);
4602 
4603   MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
4604     .add(Src0)
4605     .addReg(Interm);
4606 
4607   Worklist.insert(&Not);
4608   Worklist.insert(&Op);
4609 
4610   MRI.replaceRegWith(Dest.getReg(), NewDest);
4611   addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
4612 }
4613 
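// Split a 64-bit scalar unary operation (e.g. S_NOT_B64) into two 32-bit
// operations on the low and high halves of the source and recombine the
// results with a REG_SEQUENCE.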
4614 void SIInstrInfo::splitScalar64BitUnaryOp(
4615     SetVectorType &Worklist, MachineInstr &Inst,
4616     unsigned Opcode) const {
4617   MachineBasicBlock &MBB = *Inst.getParent();
4618   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4619 
4620   MachineOperand &Dest = Inst.getOperand(0);
4621   MachineOperand &Src0 = Inst.getOperand(1);
4622   DebugLoc DL = Inst.getDebugLoc();
4623 
4624   MachineBasicBlock::iterator MII = Inst;
4625 
4626   const MCInstrDesc &InstDesc = get(Opcode);
4627   const TargetRegisterClass *Src0RC = Src0.isReg() ?
4628     MRI.getRegClass(Src0.getReg()) :
4629     &AMDGPU::SGPR_32RegClass;
4630 
4631   const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
4632 
4633   MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
4634                                                        AMDGPU::sub0, Src0SubRC);
4635 
4636   const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
4637   const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
4638   const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
4639 
4640   unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
4641   MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
4642 
4643   MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
4644                                                        AMDGPU::sub1, Src0SubRC);
4645 
4646   unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
4647   MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
4648 
4649   unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
4650   BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
4651     .addReg(DestSub0)
4652     .addImm(AMDGPU::sub0)
4653     .addReg(DestSub1)
4654     .addImm(AMDGPU::sub1);
4655 
4656   MRI.replaceRegWith(Dest.getReg(), FullDestReg);
4657 
4658   Worklist.insert(&LoHalf);
4659   Worklist.insert(&HiHalf);
4660 
4661   // We don't need to legalizeOperands here because for a single operand, src0
4662   // will support any kind of input.
4663 
4664   // Move all users of this moved value.
4665   addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
4666 }
4667 
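// Split S_ADD_U64_PSEUDO / S_SUB_U64_PSEUDO into a 32-bit add/sub of the low
// halves that defines a carry and a carry-consuming add/sub of the high
// halves, recombining the two halves with a REG_SEQUENCE.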
4668 void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist,
4669                                          MachineInstr &Inst,
4670                                          MachineDominatorTree *MDT) const {
4671   bool IsAdd = (Inst.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
4672 
4673   MachineBasicBlock &MBB = *Inst.getParent();
4674   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4675 
4676   unsigned FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
4677   unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4678   unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4679 
4680   unsigned CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
4681   unsigned DeadCarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
4682 
4683   MachineOperand &Dest = Inst.getOperand(0);
4684   MachineOperand &Src0 = Inst.getOperand(1);
4685   MachineOperand &Src1 = Inst.getOperand(2);
4686   const DebugLoc &DL = Inst.getDebugLoc();
4687   MachineBasicBlock::iterator MII = Inst;
4688 
4689   const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
4690   const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
4691   const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
4692   const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);
4693 
4694   MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
4695                                                        AMDGPU::sub0, Src0SubRC);
4696   MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
4697                                                        AMDGPU::sub0, Src1SubRC);
4698 
4699 
4700   MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
4701                                                        AMDGPU::sub1, Src0SubRC);
4702   MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
4703                                                        AMDGPU::sub1, Src1SubRC);
4704 
4705   unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64;
4706   MachineInstr *LoHalf =
4707     BuildMI(MBB, MII, DL, get(LoOpc), DestSub0)
4708     .addReg(CarryReg, RegState::Define)
4709     .add(SrcReg0Sub0)
4710     .add(SrcReg1Sub0);
4711 
4712   unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
4713   MachineInstr *HiHalf =
4714     BuildMI(MBB, MII, DL, get(HiOpc), DestSub1)
4715     .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
4716     .add(SrcReg0Sub1)
4717     .add(SrcReg1Sub1)
4718     .addReg(CarryReg, RegState::Kill);
4719 
4720   BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
4721     .addReg(DestSub0)
4722     .addImm(AMDGPU::sub0)
4723     .addReg(DestSub1)
4724     .addImm(AMDGPU::sub1);
4725 
4726   MRI.replaceRegWith(Dest.getReg(), FullDestReg);
4727 
4728   // Try to legalize the operands in case we need to swap the order to keep it
4729   // valid.
4730   legalizeOperands(*LoHalf, MDT);
4731   legalizeOperands(*HiHalf, MDT);
4732 
4733   // Move all users of this moved value.
4734   addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
4735 }
4736 
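// Split a 64-bit scalar binary operation into two 32-bit operations on the
// low and high halves of the sources and recombine the results with a
// REG_SEQUENCE.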
4737 void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist,
4738                                            MachineInstr &Inst, unsigned Opcode,
4739                                            MachineDominatorTree *MDT) const {
4740   MachineBasicBlock &MBB = *Inst.getParent();
4741   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4742 
4743   MachineOperand &Dest = Inst.getOperand(0);
4744   MachineOperand &Src0 = Inst.getOperand(1);
4745   MachineOperand &Src1 = Inst.getOperand(2);
4746   DebugLoc DL = Inst.getDebugLoc();
4747 
4748   MachineBasicBlock::iterator MII = Inst;
4749 
4750   const MCInstrDesc &InstDesc = get(Opcode);
4751   const TargetRegisterClass *Src0RC = Src0.isReg() ?
4752     MRI.getRegClass(Src0.getReg()) :
4753     &AMDGPU::SGPR_32RegClass;
4754 
4755   const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
4756   const TargetRegisterClass *Src1RC = Src1.isReg() ?
4757     MRI.getRegClass(Src1.getReg()) :
4758     &AMDGPU::SGPR_32RegClass;
4759 
4760   const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);
4761 
4762   MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
4763                                                        AMDGPU::sub0, Src0SubRC);
4764   MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
4765                                                        AMDGPU::sub0, Src1SubRC);
4766   MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
4767                                                        AMDGPU::sub1, Src0SubRC);
4768   MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
4769                                                        AMDGPU::sub1, Src1SubRC);
4770 
4771   const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
4772   const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
4773   const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
4774 
4775   unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
4776   MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
4777                               .add(SrcReg0Sub0)
4778                               .add(SrcReg1Sub0);
4779 
4780   unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
4781   MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
4782                               .add(SrcReg0Sub1)
4783                               .add(SrcReg1Sub1);
4784 
4785   unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
4786   BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
4787     .addReg(DestSub0)
4788     .addImm(AMDGPU::sub0)
4789     .addReg(DestSub1)
4790     .addImm(AMDGPU::sub1);
4791 
4792   MRI.replaceRegWith(Dest.getReg(), FullDestReg);
4793 
4794   Worklist.insert(&LoHalf);
4795   Worklist.insert(&HiHalf);
4796 
4797   // Move all users of this moved value.
4798   addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
4799 }
4800 
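// Lower S_XNOR_B64 on subtargets without V_XNOR: invert one source with
// S_NOT_B64, XOR it with the other source using S_XOR_B64, and queue the XOR
// for further lowering to the VALU.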
4801 void SIInstrInfo::splitScalar64BitXnor(SetVectorType &Worklist,
4802                                        MachineInstr &Inst,
4803                                        MachineDominatorTree *MDT) const {
4804   MachineBasicBlock &MBB = *Inst.getParent();
4805   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4806 
4807   MachineOperand &Dest = Inst.getOperand(0);
4808   MachineOperand &Src0 = Inst.getOperand(1);
4809   MachineOperand &Src1 = Inst.getOperand(2);
4810   const DebugLoc &DL = Inst.getDebugLoc();
4811 
4812   MachineBasicBlock::iterator MII = Inst;
4813 
4814   const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
4815 
4816   unsigned Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
4817 
4818   MachineOperand* Op0;
4819   MachineOperand* Op1;
4820 
4821   if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
4822     Op0 = &Src0;
4823     Op1 = &Src1;
4824   } else {
4825     Op0 = &Src1;
4826     Op1 = &Src0;
4827   }
4828 
4829   BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
4830     .add(*Op0);
4831 
4832   unsigned NewDest = MRI.createVirtualRegister(DestRC);
4833 
4834   MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
4835     .addReg(Interm)
4836     .add(*Op1);
4837 
4838   MRI.replaceRegWith(Dest.getReg(), NewDest);
4839 
4840   Worklist.insert(&Xor);
4841 }
4842 
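// Lower S_BCNT1_I32_B64 as two V_BCNT_U32_B32 instructions: count the bits of
// the low half, then count the bits of the high half while accumulating the
// previous result.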
4843 void SIInstrInfo::splitScalar64BitBCNT(
4844     SetVectorType &Worklist, MachineInstr &Inst) const {
4845   MachineBasicBlock &MBB = *Inst.getParent();
4846   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4847 
4848   MachineBasicBlock::iterator MII = Inst;
4849   const DebugLoc &DL = Inst.getDebugLoc();
4850 
4851   MachineOperand &Dest = Inst.getOperand(0);
4852   MachineOperand &Src = Inst.getOperand(1);
4853 
4854   const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
4855   const TargetRegisterClass *SrcRC = Src.isReg() ?
4856     MRI.getRegClass(Src.getReg()) :
4857     &AMDGPU::SGPR_32RegClass;
4858 
4859   unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4860   unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4861 
4862   const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0);
4863 
4864   MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
4865                                                       AMDGPU::sub0, SrcSubRC);
4866   MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
4867                                                       AMDGPU::sub1, SrcSubRC);
4868 
4869   BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
4870 
4871   BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
4872 
4873   MRI.replaceRegWith(Dest.getReg(), ResultReg);
4874 
4875   // We don't need to legalize operands here. src0 for either instruction can be
4876   // an SGPR, and the second input is unused or determined here.
4877   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
4878 }
4879 
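// Lower the sext_inreg forms of S_BFE_I64: sign-extend the requested field of
// the low half with V_BFE_I32 and fill the high half with the sign bit via an
// arithmetic shift right by 31. A width of exactly 32 only needs the
// high-half fill.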
4880 void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist,
4881                                       MachineInstr &Inst) const {
4882   MachineBasicBlock &MBB = *Inst.getParent();
4883   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
4884   MachineBasicBlock::iterator MII = Inst;
4885   const DebugLoc &DL = Inst.getDebugLoc();
4886 
4887   MachineOperand &Dest = Inst.getOperand(0);
4888   uint32_t Imm = Inst.getOperand(2).getImm();
4889   uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
4890   uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
4891 
4892   (void) Offset;
4893 
4894   // Only sext_inreg cases handled.
4895   assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
4896          Offset == 0 && "Not implemented");
4897 
4898   if (BitWidth < 32) {
4899     unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4900     unsigned MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4901     unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
4902 
4903     BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo)
4904         .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
4905         .addImm(0)
4906         .addImm(BitWidth);
4907 
4908     BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
4909       .addImm(31)
4910       .addReg(MidRegLo);
4911 
4912     BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
4913       .addReg(MidRegLo)
4914       .addImm(AMDGPU::sub0)
4915       .addReg(MidRegHi)
4916       .addImm(AMDGPU::sub1);
4917 
4918     MRI.replaceRegWith(Dest.getReg(), ResultReg);
4919     addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
4920     return;
4921   }
4922 
4923   MachineOperand &Src = Inst.getOperand(1);
4924   unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4925   unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
4926 
4927   BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
4928     .addImm(31)
4929     .addReg(Src.getReg(), 0, AMDGPU::sub0);
4930 
4931   BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
4932     .addReg(Src.getReg(), 0, AMDGPU::sub0)
4933     .addImm(AMDGPU::sub0)
4934     .addReg(TmpReg)
4935     .addImm(AMDGPU::sub1);
4936 
4937   MRI.replaceRegWith(Dest.getReg(), ResultReg);
4938   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
4939 }
4940 
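// Queue every user of DstReg whose relevant operand is still constrained to a
// non-VGPR register class so that it is also moved to the VALU.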
4941 void SIInstrInfo::addUsersToMoveToVALUWorklist(
4942   unsigned DstReg,
4943   MachineRegisterInfo &MRI,
4944   SetVectorType &Worklist) const {
4945   for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
4946          E = MRI.use_end(); I != E;) {
4947     MachineInstr &UseMI = *I->getParent();
4948 
4949     unsigned OpNo = 0;
4950 
4951     switch (UseMI.getOpcode()) {
4952     case AMDGPU::COPY:
4953     case AMDGPU::WQM:
4954     case AMDGPU::WWM:
4955     case AMDGPU::REG_SEQUENCE:
4956     case AMDGPU::PHI:
4957     case AMDGPU::INSERT_SUBREG:
4958       break;
4959     default:
4960       OpNo = I.getOperandNo();
4961       break;
4962     }
4963 
4964     if (!RI.hasVGPRs(getOpRegClass(UseMI, OpNo))) {
4965       Worklist.insert(&UseMI);
4966 
4967       do {
4968         ++I;
4969       } while (I != E && I->getParent() == &UseMI);
4970     } else {
4971       ++I;
4972     }
4973   }
4974 }
4975 
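// Expand the S_PACK_{LL,LH,HH}_B32_B16 pseudos into VALU bit-manipulation
// sequences that pack the selected 16-bit halves of the two sources into a
// single 32-bit result.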
4976 void SIInstrInfo::movePackToVALU(SetVectorType &Worklist,
4977                                  MachineRegisterInfo &MRI,
4978                                  MachineInstr &Inst) const {
4979   unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4980   MachineBasicBlock *MBB = Inst.getParent();
4981   MachineOperand &Src0 = Inst.getOperand(1);
4982   MachineOperand &Src1 = Inst.getOperand(2);
4983   const DebugLoc &DL = Inst.getDebugLoc();
4984 
4985   switch (Inst.getOpcode()) {
4986   case AMDGPU::S_PACK_LL_B32_B16: {
4987     unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4988     unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4989 
4990     // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
4991     // 0.
4992     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
4993       .addImm(0xffff);
4994 
4995     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
4996       .addReg(ImmReg, RegState::Kill)
4997       .add(Src0);
4998 
4999     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32), ResultReg)
5000       .add(Src1)
5001       .addImm(16)
5002       .addReg(TmpReg, RegState::Kill);
5003     break;
5004   }
5005   case AMDGPU::S_PACK_LH_B32_B16: {
5006     unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5007     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
5008       .addImm(0xffff);
5009     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32), ResultReg)
5010       .addReg(ImmReg, RegState::Kill)
5011       .add(Src0)
5012       .add(Src1);
5013     break;
5014   }
5015   case AMDGPU::S_PACK_HH_B32_B16: {
5016     unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5017     unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5018     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
5019       .addImm(16)
5020       .add(Src0);
5021     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
5022       .addImm(0xffff0000);
5023     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32), ResultReg)
5024       .add(Src1)
5025       .addReg(ImmReg, RegState::Kill)
5026       .addReg(TmpReg, RegState::Kill);
5027     break;
5028   }
5029   default:
5030     llvm_unreachable("unhandled s_pack_* instruction");
5031   }
5032 
5033   MachineOperand &Dest = Inst.getOperand(0);
5034   MRI.replaceRegWith(Dest.getReg(), ResultReg);
5035   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
5036 }
5037 
5038 void SIInstrInfo::addSCCDefUsersToVALUWorklist(
5039     MachineInstr &SCCDefInst, SetVectorType &Worklist) const {
5040   // This assumes that all the users of SCC are in the same block
5041   // as the SCC def.
5042   for (MachineInstr &MI :
5043        make_range(MachineBasicBlock::iterator(SCCDefInst),
5044                       SCCDefInst.getParent()->end())) {
5045     // Exit if we find another SCC def.
5046     if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) != -1)
5047       return;
5048 
5049     if (MI.findRegisterUseOperandIdx(AMDGPU::SCC, false, &RI) != -1)
5050       Worklist.insert(&MI);
5051   }
5052 }
5053 
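// Return the register class the destination of Inst should use once the
// instruction has been moved to the VALU, or null if no VGPR-equivalent
// replacement is needed or available.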
5054 const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
5055   const MachineInstr &Inst) const {
5056   const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
5057 
5058   switch (Inst.getOpcode()) {
5059   // For target instructions, getOpRegClass just returns the virtual register
5060   // class associated with the operand, so we need to find an equivalent VGPR
5061   // register class in order to move the instruction to the VALU.
5062   case AMDGPU::COPY:
5063   case AMDGPU::PHI:
5064   case AMDGPU::REG_SEQUENCE:
5065   case AMDGPU::INSERT_SUBREG:
5066   case AMDGPU::WQM:
5067   case AMDGPU::WWM:
5068     if (RI.hasVGPRs(NewDstRC))
5069       return nullptr;
5070 
5071     NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
5072     if (!NewDstRC)
5073       return nullptr;
5074     return NewDstRC;
5075   default:
5076     return NewDstRC;
5077   }
5078 }
5079 
5080 // Find the one SGPR operand we are allowed to use.
5081 unsigned SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
5082                                    int OpIndices[3]) const {
5083   const MCInstrDesc &Desc = MI.getDesc();
5084 
5085   // Find the one SGPR operand we are allowed to use.
5086   //
5087   // First we need to consider the instruction's operand requirements before
5088   // legalizing. Some operands are required to be SGPRs, such as implicit uses
5089   // of VCC, but we are still bound by the constant bus requirement to only use
5090   // one.
5091   //
5092   // If the operand's class is an SGPR, we can never move it.
5093 
5094   unsigned SGPRReg = findImplicitSGPRRead(MI);
5095   if (SGPRReg != AMDGPU::NoRegister)
5096     return SGPRReg;
5097 
5098   unsigned UsedSGPRs[3] = { AMDGPU::NoRegister };
5099   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
5100 
5101   for (unsigned i = 0; i < 3; ++i) {
5102     int Idx = OpIndices[i];
5103     if (Idx == -1)
5104       break;
5105 
5106     const MachineOperand &MO = MI.getOperand(Idx);
5107     if (!MO.isReg())
5108       continue;
5109 
5110     // Is this operand statically required to be an SGPR based on the operand
5111     // constraints?
5112     const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass);
5113     bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
5114     if (IsRequiredSGPR)
5115       return MO.getReg();
5116 
5117     // If this could be a VGPR or an SGPR, check the dynamic register class.
5118     unsigned Reg = MO.getReg();
5119     const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
5120     if (RI.isSGPRClass(RegRC))
5121       UsedSGPRs[i] = Reg;
5122   }
5123 
5124   // We don't have a required SGPR operand, so we have a bit more freedom in
5125   // selecting operands to move.
5126 
5127   // Try to select the most used SGPR. If an SGPR is equal to one of the
5128   // others, we choose that.
5129   //
5130   // e.g.
5131   // V_FMA_F32 v0, s0, s0, s0 -> No moves
5132   // V_FMA_F32 v0, s0, s1, s0 -> Move s1
5133 
5134   // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
5135   // prefer those.
5136 
5137   if (UsedSGPRs[0] != AMDGPU::NoRegister) {
5138     if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
5139       SGPRReg = UsedSGPRs[0];
5140   }
5141 
5142   if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) {
5143     if (UsedSGPRs[1] == UsedSGPRs[2])
5144       SGPRReg = UsedSGPRs[1];
5145   }
5146 
5147   return SGPRReg;
5148 }
5149 
5150 MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
5151                                              unsigned OperandName) const {
5152   int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
5153   if (Idx == -1)
5154     return nullptr;
5155 
5156   return &MI.getOperand(Idx);
5157 }
5158 
5159 uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
5160   uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
5161   if (ST.isAmdHsaOS()) {
5162     // Set ATC = 1. GFX9 doesn't have this bit.
5163     if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS)
5164       RsrcDataFormat |= (1ULL << 56);
5165 
5166     // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
5167     // Note that this disables TC L2 and therefore decreases performance.
5168     if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS)
5169       RsrcDataFormat |= (2ULL << 59);
5170   }
5171 
5172   return RsrcDataFormat;
5173 }
5174 
5175 uint64_t SIInstrInfo::getScratchRsrcWords23() const {
5176   uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
5177                     AMDGPU::RSRC_TID_ENABLE |
5178                     0xffffffff; // Size;
5179 
5180   // GFX9 doesn't have ELEMENT_SIZE.
5181   if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
5182     uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1;
5183     Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
5184   }
5185 
5186   // IndexStride = 64.
5187   Rsrc23 |= UINT64_C(3) << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
5188 
5189   // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [17:14].
5190   // Clear them unless we want a huge stride.
5191   if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
5192     Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
5193 
5194   return Rsrc23;
5195 }
5196 
5197 bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
5198   unsigned Opc = MI.getOpcode();
5199 
5200   return isSMRD(Opc);
5201 }
5202 
5203 bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr &MI) const {
5204   unsigned Opc = MI.getOpcode();
5205 
5206   return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc);
5207 }
5208 
5209 unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI,
5210                                     int &FrameIndex) const {
5211   const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
5212   if (!Addr || !Addr->isFI())
5213     return AMDGPU::NoRegister;
5214 
5215   assert(!MI.memoperands_empty() &&
5216          (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
5217 
5218   FrameIndex = Addr->getIndex();
5219   return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
5220 }
5221 
5222 unsigned SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
5223                                         int &FrameIndex) const {
5224   const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
5225   assert(Addr && Addr->isFI());
5226   FrameIndex = Addr->getIndex();
5227   return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
5228 }
5229 
5230 unsigned SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
5231                                           int &FrameIndex) const {
5232   if (!MI.mayLoad())
5233     return AMDGPU::NoRegister;
5234 
5235   if (isMUBUF(MI) || isVGPRSpill(MI))
5236     return isStackAccess(MI, FrameIndex);
5237 
5238   if (isSGPRSpill(MI))
5239     return isSGPRStackAccess(MI, FrameIndex);
5240 
5241   return AMDGPU::NoRegister;
5242 }
5243 
5244 unsigned SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
5245                                          int &FrameIndex) const {
5246   if (!MI.mayStore())
5247     return AMDGPU::NoRegister;
5248 
5249   if (isMUBUF(MI) || isVGPRSpill(MI))
5250     return isStackAccess(MI, FrameIndex);
5251 
5252   if (isSGPRSpill(MI))
5253     return isSGPRStackAccess(MI, FrameIndex);
5254 
5255   return AMDGPU::NoRegister;
5256 }
5257 
5258 unsigned SIInstrInfo::getInstBundleSize(const MachineInstr &MI) const {
5259   unsigned Size = 0;
5260   MachineBasicBlock::const_instr_iterator I = MI.getIterator();
5261   MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
5262   while (++I != E && I->isInsideBundle()) {
5263     assert(!I->isBundle() && "No nested bundle!");
5264     Size += getInstSizeInBytes(*I);
5265   }
5266 
5267   return Size;
5268 }
5269 
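// Return the size of the instruction's encoding in bytes, accounting for a
// possible trailing 32-bit literal on VALU/SALU instructions and for
// meta-instructions, bundles and inline assembly.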
5270 unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
5271   unsigned Opc = MI.getOpcode();
5272   const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);
5273   unsigned DescSize = Desc.getSize();
5274 
5275   // If we have a definitive size, we can use it. Otherwise we need to inspect
5276   // the operands to know the size.
5277   if (isFixedSize(MI))
5278     return DescSize;
5279 
5280   // 4-byte instructions may have a 32-bit literal encoded after them. Check
5281   // operands that could ever be literals.
5282   if (isVALU(MI) || isSALU(MI)) {
5283     int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
5284     if (Src0Idx == -1)
5285       return DescSize; // No operands.
5286 
5287     if (isLiteralConstantLike(MI.getOperand(Src0Idx), Desc.OpInfo[Src0Idx]))
5288       return DescSize + 4;
5289 
5290     int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
5291     if (Src1Idx == -1)
5292       return DescSize;
5293 
5294     if (isLiteralConstantLike(MI.getOperand(Src1Idx), Desc.OpInfo[Src1Idx]))
5295       return DescSize + 4;
5296 
5297     int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
5298     if (Src2Idx == -1)
5299       return DescSize;
5300 
5301     if (isLiteralConstantLike(MI.getOperand(Src2Idx), Desc.OpInfo[Src2Idx]))
5302       return DescSize + 4;
5303 
5304     return DescSize;
5305   }
5306 
5307   switch (Opc) {
5308   case TargetOpcode::IMPLICIT_DEF:
5309   case TargetOpcode::KILL:
5310   case TargetOpcode::DBG_VALUE:
5311   case TargetOpcode::EH_LABEL:
5312     return 0;
5313   case TargetOpcode::BUNDLE:
5314     return getInstBundleSize(MI);
5315   case TargetOpcode::INLINEASM: {
5316     const MachineFunction *MF = MI.getParent()->getParent();
5317     const char *AsmStr = MI.getOperand(0).getSymbolName();
5318     return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
5319   }
5320   default:
5321     return DescSize;
5322   }
5323 }
5324 
5325 bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {
5326   if (!isFLAT(MI))
5327     return false;
5328 
5329   if (MI.memoperands_empty())
5330     return true;
5331 
5332   for (const MachineMemOperand *MMO : MI.memoperands()) {
5333     if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
5334       return true;
5335   }
5336   return false;
5337 }
5338 
5339 bool SIInstrInfo::isNonUniformBranchInstr(MachineInstr &Branch) const {
5340   return Branch.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO;
5341 }
5342 
5343 void SIInstrInfo::convertNonUniformIfRegion(MachineBasicBlock *IfEntry,
5344                                             MachineBasicBlock *IfEnd) const {
5345   MachineBasicBlock::iterator TI = IfEntry->getFirstTerminator();
5346   assert(TI != IfEntry->end());
5347 
5348   MachineInstr *Branch = &(*TI);
5349   MachineFunction *MF = IfEntry->getParent();
5350   MachineRegisterInfo &MRI = IfEntry->getParent()->getRegInfo();
5351 
5352   if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
5353     unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5354     MachineInstr *SIIF =
5355         BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg)
5356             .add(Branch->getOperand(0))
5357             .add(Branch->getOperand(1));
5358     MachineInstr *SIEND =
5359         BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_END_CF))
5360             .addReg(DstReg);
5361 
5362     IfEntry->erase(TI);
5363     IfEntry->insert(IfEntry->end(), SIIF);
5364     IfEnd->insert(IfEnd->getFirstNonPHI(), SIEND);
5365   }
5366 }
5367 
5368 void SIInstrInfo::convertNonUniformLoopRegion(
5369     MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const {
5370   MachineBasicBlock::iterator TI = LoopEnd->getFirstTerminator();
5371   // We expect 2 terminators, one conditional and one unconditional.
5372   assert(TI != LoopEnd->end());
5373 
5374   MachineInstr *Branch = &(*TI);
5375   MachineFunction *MF = LoopEnd->getParent();
5376   MachineRegisterInfo &MRI = LoopEnd->getParent()->getRegInfo();
5377 
5378   if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
5379 
5380     unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5381     unsigned BackEdgeReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5382     MachineInstrBuilder HeaderPHIBuilder =
5383         BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg);
5384     for (MachineBasicBlock::pred_iterator PI = LoopEntry->pred_begin(),
5385                                           E = LoopEntry->pred_end();
5386          PI != E; ++PI) {
5387       if (*PI == LoopEnd) {
5388         HeaderPHIBuilder.addReg(BackEdgeReg);
5389       } else {
5390         MachineBasicBlock *PMBB = *PI;
5391         unsigned ZeroReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5392         materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(),
5393                              ZeroReg, 0);
5394         HeaderPHIBuilder.addReg(ZeroReg);
5395       }
5396       HeaderPHIBuilder.addMBB(*PI);
5397     }
5398     MachineInstr *HeaderPhi = HeaderPHIBuilder;
5399     MachineInstr *SIIFBREAK = BuildMI(*(MF), Branch->getDebugLoc(),
5400                                       get(AMDGPU::SI_IF_BREAK), BackEdgeReg)
5401                                   .addReg(DstReg)
5402                                   .add(Branch->getOperand(0));
5403     MachineInstr *SILOOP =
5404         BuildMI(*(MF), Branch->getDebugLoc(), get(AMDGPU::SI_LOOP))
5405             .addReg(BackEdgeReg)
5406             .addMBB(LoopEntry);
5407 
5408     LoopEntry->insert(LoopEntry->begin(), HeaderPhi);
5409     LoopEnd->erase(TI);
5410     LoopEnd->insert(LoopEnd->end(), SIIFBREAK);
5411     LoopEnd->insert(LoopEnd->end(), SILOOP);
5412   }
5413 }
5414 
5415 ArrayRef<std::pair<int, const char *>>
5416 SIInstrInfo::getSerializableTargetIndices() const {
5417   static const std::pair<int, const char *> TargetIndices[] = {
5418       {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
5419       {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
5420       {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
5421       {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
5422       {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
5423   return makeArrayRef(TargetIndices);
5424 }
5425 
5426 /// This is used by the post-RA scheduler (PostRASchedulerList.cpp). The
5427 /// post-RA version of misched uses CreateTargetMIHazardRecognizer.
5428 ScheduleHazardRecognizer *
5429 SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
5430                                             const ScheduleDAG *DAG) const {
5431   return new GCNHazardRecognizer(DAG->MF);
5432 }
5433 
5434 /// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
5435 /// pass.
5436 ScheduleHazardRecognizer *
5437 SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const {
5438   return new GCNHazardRecognizer(MF);
5439 }
5440 
5441 std::pair<unsigned, unsigned>
5442 SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
5443   return std::make_pair(TF & MO_MASK, TF & ~MO_MASK);
5444 }
5445 
5446 ArrayRef<std::pair<unsigned, const char *>>
5447 SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
5448   static const std::pair<unsigned, const char *> TargetFlags[] = {
5449     { MO_GOTPCREL, "amdgpu-gotprel" },
5450     { MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" },
5451     { MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" },
5452     { MO_REL32_LO, "amdgpu-rel32-lo" },
5453     { MO_REL32_HI, "amdgpu-rel32-hi" }
5454   };
5455 
5456   return makeArrayRef(TargetFlags);
5457 }
5458 
5459 bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const {
5460   return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
5461          MI.modifiesRegister(AMDGPU::EXEC, &RI);
5462 }
5463 
5464 MachineInstrBuilder
5465 SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
5466                            MachineBasicBlock::iterator I,
5467                            const DebugLoc &DL,
5468                            unsigned DestReg) const {
5469   if (ST.hasAddNoCarry())
5470     return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
5471 
5472   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
5473   unsigned UnusedCarry = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5474   MRI.setRegAllocationHint(UnusedCarry, 0, AMDGPU::VCC);
5475 
5476   return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg)
5477            .addReg(UnusedCarry, RegState::Define | RegState::Dead);
5478 }
5479 
5480 bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
5481   switch (Opcode) {
5482   case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
5483   case AMDGPU::SI_KILL_I1_TERMINATOR:
5484     return true;
5485   default:
5486     return false;
5487   }
5488 }
5489 
5490 const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) const {
5491   switch (Opcode) {
5492   case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
5493     return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
5494   case AMDGPU::SI_KILL_I1_PSEUDO:
5495     return get(AMDGPU::SI_KILL_I1_TERMINATOR);
5496   default:
5497     llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
5498   }
5499 }
5500 
5501 bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
5502   if (!isSMRD(MI))
5503     return false;
5504 
5505   // Check that it is using a buffer resource.
5506   int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
5507   if (Idx == -1) // e.g. s_memtime
5508     return false;
5509 
5510   const auto RCID = MI.getDesc().OpInfo[Idx].RegClass;
5511   return RCID == AMDGPU::SReg_128RegClassID;
5512 }
5513 
5514 // This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td
5515 enum SIEncodingFamily {
5516   SI = 0,
5517   VI = 1,
5518   SDWA = 2,
5519   SDWA9 = 3,
5520   GFX80 = 4,
5521   GFX9 = 5
5522 };
5523 
5524 static SIEncodingFamily subtargetEncodingFamily(const GCNSubtarget &ST) {
5525   switch (ST.getGeneration()) {
5526   default:
5527     break;
5528   case AMDGPUSubtarget::SOUTHERN_ISLANDS:
5529   case AMDGPUSubtarget::SEA_ISLANDS:
5530     return SIEncodingFamily::SI;
5531   case AMDGPUSubtarget::VOLCANIC_ISLANDS:
5532   case AMDGPUSubtarget::GFX9:
5533     return SIEncodingFamily::VI;
5534   }
5535   llvm_unreachable("Unknown subtarget generation!");
5536 }
5537 
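// Map a pseudo instruction to the MC opcode for the subtarget's encoding
// family. Returns the opcode unchanged if it is already a native instruction,
// or -1 if the pseudo has no encoding on this generation.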
5538 int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
5539   SIEncodingFamily Gen = subtargetEncodingFamily(ST);
5540 
5541   if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 &&
5542     ST.getGeneration() >= AMDGPUSubtarget::GFX9)
5543     Gen = SIEncodingFamily::GFX9;
5544 
5545   if (get(Opcode).TSFlags & SIInstrFlags::SDWA)
5546     Gen = ST.getGeneration() == AMDGPUSubtarget::GFX9 ? SIEncodingFamily::SDWA9
5547                                                       : SIEncodingFamily::SDWA;
5548   // Adjust the encoding family to GFX80 for D16 buffer instructions when the
5549   // subtarget has the UnpackedD16VMem feature.
5550   // TODO: remove this when we discard GFX80 encoding.
5551   if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
5552     Gen = SIEncodingFamily::GFX80;
5553 
5554   int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
5555 
5556   // -1 means that Opcode is already a native instruction.
5557   if (MCOp == -1)
5558     return Opcode;
5559 
5560   // (uint16_t)-1 means that Opcode is a pseudo instruction that has
5561   // no encoding in the given subtarget generation.
5562   if (MCOp == (uint16_t)-1)
5563     return -1;
5564 
5565   return MCOp;
5566 }
5567 
5568 static
5569 TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd) {
5570   assert(RegOpnd.isReg());
5571   return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
5572                              getRegSubRegPair(RegOpnd);
5573 }
5574 
5575 TargetInstrInfo::RegSubRegPair
5576 llvm::getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg) {
5577   assert(MI.isRegSequence());
5578   for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
5579     if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
5580       auto &RegOp = MI.getOperand(1 + 2 * I);
5581       return getRegOrUndef(RegOp);
5582     }
5583   return TargetInstrInfo::RegSubRegPair();
5584 }
5585 
5586 // Try to find the definition of reg:subreg in subreg-manipulation pseudos.
5587 // Following a subreg of reg:subreg isn't supported.
5588 static bool followSubRegDef(MachineInstr &MI,
5589                             TargetInstrInfo::RegSubRegPair &RSR) {
5590   if (!RSR.SubReg)
5591     return false;
5592   switch (MI.getOpcode()) {
5593   default: break;
5594   case AMDGPU::REG_SEQUENCE:
5595     RSR = getRegSequenceSubReg(MI, RSR.SubReg);
5596     return true;
5597   // EXTRACT_SUBREG isn't supported as this would follow a subreg of subreg.
5598   case AMDGPU::INSERT_SUBREG:
5599     if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
5600       // inserted the subreg we're looking for
5601       RSR = getRegOrUndef(MI.getOperand(2));
5602     else { // the subreg in the rest of the reg
5603       auto R1 = getRegOrUndef(MI.getOperand(1));
5604       if (R1.SubReg) // subreg of subreg isn't supported
5605         return false;
5606       RSR.Reg = R1.Reg;
5607     }
5608     return true;
5609   }
5610   return false;
5611 }
5612 
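// Walk through COPY, V_MOV_B32 and subreg-manipulation pseudos in SSA form to
// find the instruction that ultimately defines P.Reg:P.SubReg, returning null
// if the definition cannot be determined.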
5613 MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
5614                                      MachineRegisterInfo &MRI) {
5615   assert(MRI.isSSA());
5616   if (!TargetRegisterInfo::isVirtualRegister(P.Reg))
5617     return nullptr;
5618 
5619   auto RSR = P;
5620   auto *DefInst = MRI.getVRegDef(RSR.Reg);
5621   while (auto *MI = DefInst) {
5622     DefInst = nullptr;
5623     switch (MI->getOpcode()) {
5624     case AMDGPU::COPY:
5625     case AMDGPU::V_MOV_B32_e32: {
5626       auto &Op1 = MI->getOperand(1);
5627       if (Op1.isReg() &&
5628         TargetRegisterInfo::isVirtualRegister(Op1.getReg())) {
5629         if (Op1.isUndef())
5630           return nullptr;
5631         RSR = getRegSubRegPair(Op1);
5632         DefInst = MRI.getVRegDef(RSR.Reg);
5633       }
5634       break;
5635     }
5636     default:
5637       if (followSubRegDef(*MI, RSR)) {
5638         if (!RSR.Reg)
5639           return nullptr;
5640         DefInst = MRI.getVRegDef(RSR.Reg);
5641       }
5642     }
5643     if (!DefInst)
5644       return MI;
5645   }
5646   return nullptr;
5647 }
5648