1 //===- SIInstrInfo.cpp - SI Instruction Information  ----------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// SI Implementation of TargetInstrInfo.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "SIInstrInfo.h"
15 #include "AMDGPU.h"
16 #include "AMDGPUInstrInfo.h"
17 #include "GCNHazardRecognizer.h"
18 #include "GCNSubtarget.h"
19 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "llvm/Analysis/ValueTracking.h"
22 #include "llvm/CodeGen/LiveVariables.h"
23 #include "llvm/CodeGen/MachineDominators.h"
24 #include "llvm/CodeGen/RegisterScavenging.h"
25 #include "llvm/CodeGen/ScheduleDAG.h"
26 #include "llvm/IR/DiagnosticInfo.h"
27 #include "llvm/IR/IntrinsicsAMDGPU.h"
28 #include "llvm/Support/CommandLine.h"
29 #include "llvm/Target/TargetMachine.h"
30 
31 using namespace llvm;
32 
33 #define DEBUG_TYPE "si-instr-info"
34 
35 #define GET_INSTRINFO_CTOR_DTOR
36 #include "AMDGPUGenInstrInfo.inc"
37 
38 namespace llvm {
39 
40 class AAResults;
41 
42 namespace AMDGPU {
43 #define GET_D16ImageDimIntrinsics_IMPL
44 #define GET_ImageDimIntrinsicTable_IMPL
45 #define GET_RsrcIntrinsics_IMPL
46 #include "AMDGPUGenSearchableTables.inc"
47 }
48 }
49 
50 
51 // Must be at least 4 to be able to branch over minimum unconditional branch
52 // code. This is only for making it possible to write reasonably small tests for
53 // long branches.
54 static cl::opt<unsigned>
55 BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
56                  cl::desc("Restrict range of branch instructions (DEBUG)"));
57 
58 static cl::opt<bool> Fix16BitCopies(
59   "amdgpu-fix-16-bit-physreg-copies",
60   cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
61   cl::init(true),
62   cl::ReallyHidden);
63 
64 SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
65   : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
66     RI(ST), ST(ST) {
67   SchedModel.init(&ST);
68 }
69 
70 //===----------------------------------------------------------------------===//
71 // TargetInstrInfo callbacks
72 //===----------------------------------------------------------------------===//
73 
74 static unsigned getNumOperandsNoGlue(SDNode *Node) {
75   unsigned N = Node->getNumOperands();
76   while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
77     --N;
78   return N;
79 }
80 
81 /// Returns true if both nodes have the same value for the given
82 ///        operand \p Op, or if both nodes do not have this operand.
83 static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
84   unsigned Opc0 = N0->getMachineOpcode();
85   unsigned Opc1 = N1->getMachineOpcode();
86 
87   int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
88   int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
89 
90   if (Op0Idx == -1 && Op1Idx == -1)
91     return true;
92 
93 
94   if ((Op0Idx == -1 && Op1Idx != -1) ||
95       (Op1Idx == -1 && Op0Idx != -1))
96     return false;
97 
98   // getNamedOperandIdx returns the index for the MachineInstr's operands,
99   // which includes the result as the first operand. We are indexing into the
100   // MachineSDNode's operands, so we need to skip the result operand to get
101   // the real index.
102   --Op0Idx;
103   --Op1Idx;
104 
105   return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
106 }
107 
108 bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
109                                                     AAResults *AA) const {
110   // TODO: The generic check fails for VALU instructions that should be
111   // rematerializable due to implicit reads of exec. We really want all of the
112   // generic logic for this except for this.
113   switch (MI.getOpcode()) {
114   case AMDGPU::V_MOV_B32_e32:
115   case AMDGPU::V_MOV_B32_e64:
116   case AMDGPU::V_MOV_B64_PSEUDO:
117   case AMDGPU::V_ACCVGPR_READ_B32_e64:
118   case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
119     // No non-standard implicit operands.
120     assert(MI.getDesc().getNumOperands() == 2);
121     assert(MI.getDesc().getNumImplicitDefs() == 0);
122     assert(MI.getDesc().getNumImplicitUses() == 1);
123     return MI.getNumOperands() == 3;
124   default:
125     return false;
126   }
127 }
128 
129 bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
130                                           int64_t &Offset0,
131                                           int64_t &Offset1) const {
132   if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
133     return false;
134 
135   unsigned Opc0 = Load0->getMachineOpcode();
136   unsigned Opc1 = Load1->getMachineOpcode();
137 
138   // Make sure both are actually loads.
139   if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
140     return false;
141 
142   if (isDS(Opc0) && isDS(Opc1)) {
143 
144     // FIXME: Handle this case:
145     if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
146       return false;
147 
148     // Check base reg.
149     if (Load0->getOperand(0) != Load1->getOperand(0))
150       return false;
151 
152     // Skip read2 / write2 variants for simplicity.
153     // TODO: We should report true if the used offsets are adjacent (excluded
154     // st64 versions).
155     int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
156     int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
157     if (Offset0Idx == -1 || Offset1Idx == -1)
158       return false;
159 
160     // XXX - be careful of datalesss loads
161     // getNamedOperandIdx returns the index for MachineInstrs.  Since they
162     // include the output in the operand list, but SDNodes don't, we need to
163     // subtract the index by one.
164     Offset0Idx -= get(Opc0).NumDefs;
165     Offset1Idx -= get(Opc1).NumDefs;
166     Offset0 = cast<ConstantSDNode>(Load0->getOperand(Offset0Idx))->getZExtValue();
167     Offset1 = cast<ConstantSDNode>(Load1->getOperand(Offset1Idx))->getZExtValue();
168     return true;
169   }
170 
171   if (isSMRD(Opc0) && isSMRD(Opc1)) {
172     // Skip time and cache invalidation instructions.
173     if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::sbase) == -1 ||
174         AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::sbase) == -1)
175       return false;
176 
177     assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1));
178 
179     // Check base reg.
180     if (Load0->getOperand(0) != Load1->getOperand(0))
181       return false;
182 
183     const ConstantSDNode *Load0Offset =
184         dyn_cast<ConstantSDNode>(Load0->getOperand(1));
185     const ConstantSDNode *Load1Offset =
186         dyn_cast<ConstantSDNode>(Load1->getOperand(1));
187 
188     if (!Load0Offset || !Load1Offset)
189       return false;
190 
191     Offset0 = Load0Offset->getZExtValue();
192     Offset1 = Load1Offset->getZExtValue();
193     return true;
194   }
195 
196   // MUBUF and MTBUF can access the same addresses.
197   if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
198 
199     // MUBUF and MTBUF have vaddr at different indices.
200     if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
201         !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
202         !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
203       return false;
204 
205     int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
206     int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
207 
208     if (OffIdx0 == -1 || OffIdx1 == -1)
209       return false;
210 
211     // getNamedOperandIdx returns the index for MachineInstrs.  Since they
212     // include the output in the operand list, but SDNodes don't, we need to
213     // subtract the index by one.
214     OffIdx0 -= get(Opc0).NumDefs;
215     OffIdx1 -= get(Opc1).NumDefs;
216 
217     SDValue Off0 = Load0->getOperand(OffIdx0);
218     SDValue Off1 = Load1->getOperand(OffIdx1);
219 
220     // The offset might be a FrameIndexSDNode.
221     if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
222       return false;
223 
224     Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue();
225     Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue();
226     return true;
227   }
228 
229   return false;
230 }
231 
232 static bool isStride64(unsigned Opc) {
233   switch (Opc) {
234   case AMDGPU::DS_READ2ST64_B32:
235   case AMDGPU::DS_READ2ST64_B64:
236   case AMDGPU::DS_WRITE2ST64_B32:
237   case AMDGPU::DS_WRITE2ST64_B64:
238     return true;
239   default:
240     return false;
241   }
242 }
243 
244 bool SIInstrInfo::getMemOperandsWithOffsetWidth(
245     const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
246     int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
247     const TargetRegisterInfo *TRI) const {
248   if (!LdSt.mayLoadOrStore())
249     return false;
250 
251   unsigned Opc = LdSt.getOpcode();
252   OffsetIsScalable = false;
253   const MachineOperand *BaseOp, *OffsetOp;
254   int DataOpIdx;
255 
256   if (isDS(LdSt)) {
257     BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
258     OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
259     if (OffsetOp) {
260       // Normal, single offset LDS instruction.
261       if (!BaseOp) {
262         // DS_CONSUME/DS_APPEND use M0 for the base address.
263         // TODO: find the implicit use operand for M0 and use that as BaseOp?
264         return false;
265       }
266       BaseOps.push_back(BaseOp);
267       Offset = OffsetOp->getImm();
268       // Get appropriate operand, and compute width accordingly.
269       DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
270       if (DataOpIdx == -1)
271         DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
272       Width = getOpSize(LdSt, DataOpIdx);
273     } else {
274       // The 2 offset instructions use offset0 and offset1 instead. We can treat
275       // these as a load with a single offset if the 2 offsets are consecutive.
276       // We will use this for some partially aligned loads.
277       const MachineOperand *Offset0Op =
278           getNamedOperand(LdSt, AMDGPU::OpName::offset0);
279       const MachineOperand *Offset1Op =
280           getNamedOperand(LdSt, AMDGPU::OpName::offset1);
281 
282       unsigned Offset0 = Offset0Op->getImm();
283       unsigned Offset1 = Offset1Op->getImm();
284       if (Offset0 + 1 != Offset1)
285         return false;
286 
287       // Each of these offsets is in element sized units, so we need to convert
288       // to bytes of the individual reads.
289 
290       unsigned EltSize;
291       if (LdSt.mayLoad())
292         EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
293       else {
294         assert(LdSt.mayStore());
295         int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
296         EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
297       }
298 
299       if (isStride64(Opc))
300         EltSize *= 64;
301 
302       BaseOps.push_back(BaseOp);
303       Offset = EltSize * Offset0;
304       // Get appropriate operand(s), and compute width accordingly.
305       DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
306       if (DataOpIdx == -1) {
307         DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
308         Width = getOpSize(LdSt, DataOpIdx);
309         DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
310         Width += getOpSize(LdSt, DataOpIdx);
311       } else {
312         Width = getOpSize(LdSt, DataOpIdx);
313       }
314     }
315     return true;
316   }
317 
318   if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
319     const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
320     if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
321       return false;
322     BaseOps.push_back(RSrc);
323     BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
324     if (BaseOp && !BaseOp->isFI())
325       BaseOps.push_back(BaseOp);
326     const MachineOperand *OffsetImm =
327         getNamedOperand(LdSt, AMDGPU::OpName::offset);
328     Offset = OffsetImm->getImm();
329     const MachineOperand *SOffset =
330         getNamedOperand(LdSt, AMDGPU::OpName::soffset);
331     if (SOffset) {
332       if (SOffset->isReg())
333         BaseOps.push_back(SOffset);
334       else
335         Offset += SOffset->getImm();
336     }
337     // Get appropriate operand, and compute width accordingly.
338     DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
339     if (DataOpIdx == -1)
340       DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
341     Width = getOpSize(LdSt, DataOpIdx);
342     return true;
343   }
344 
345   if (isMIMG(LdSt)) {
346     int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
347     BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
348     int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
349     if (VAddr0Idx >= 0) {
350       // GFX10 possible NSA encoding.
351       for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
352         BaseOps.push_back(&LdSt.getOperand(I));
353     } else {
354       BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
355     }
356     Offset = 0;
357     // Get appropriate operand, and compute width accordingly.
358     DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
359     Width = getOpSize(LdSt, DataOpIdx);
360     return true;
361   }
362 
363   if (isSMRD(LdSt)) {
364     BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
365     if (!BaseOp) // e.g. S_MEMTIME
366       return false;
367     BaseOps.push_back(BaseOp);
368     OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
369     Offset = OffsetOp ? OffsetOp->getImm() : 0;
370     // Get appropriate operand, and compute width accordingly.
371     DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
372     Width = getOpSize(LdSt, DataOpIdx);
373     return true;
374   }
375 
376   if (isFLAT(LdSt)) {
377     // Instructions have either vaddr or saddr or both or none.
378     BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
379     if (BaseOp)
380       BaseOps.push_back(BaseOp);
381     BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
382     if (BaseOp)
383       BaseOps.push_back(BaseOp);
384     Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
385     // Get appropriate operand, and compute width accordingly.
386     DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
387     if (DataOpIdx == -1)
388       DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
389     Width = getOpSize(LdSt, DataOpIdx);
390     return true;
391   }
392 
393   return false;
394 }
395 
396 static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
397                                   ArrayRef<const MachineOperand *> BaseOps1,
398                                   const MachineInstr &MI2,
399                                   ArrayRef<const MachineOperand *> BaseOps2) {
400   // Only examine the first "base" operand of each instruction, on the
401   // assumption that it represents the real base address of the memory access.
402   // Other operands are typically offsets or indices from this base address.
403   if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
404     return true;
405 
406   if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
407     return false;
408 
409   auto MO1 = *MI1.memoperands_begin();
410   auto MO2 = *MI2.memoperands_begin();
411   if (MO1->getAddrSpace() != MO2->getAddrSpace())
412     return false;
413 
414   auto Base1 = MO1->getValue();
415   auto Base2 = MO2->getValue();
416   if (!Base1 || !Base2)
417     return false;
418   Base1 = getUnderlyingObject(Base1);
419   Base2 = getUnderlyingObject(Base2);
420 
421   if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
422     return false;
423 
424   return Base1 == Base2;
425 }
426 
427 bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
428                                       ArrayRef<const MachineOperand *> BaseOps2,
429                                       unsigned NumLoads,
430                                       unsigned NumBytes) const {
431   // If the mem ops (to be clustered) do not have the same base ptr, then they
432   // should not be clustered
433   if (!BaseOps1.empty() && !BaseOps2.empty()) {
434     const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
435     const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
436     if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
437       return false;
438   } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
439     // If only one base op is empty, they do not have the same base ptr
440     return false;
441   }
442 
443   // In order to avoid regester pressure, on an average, the number of DWORDS
444   // loaded together by all clustered mem ops should not exceed 8. This is an
445   // empirical value based on certain observations and performance related
446   // experiments.
447   // The good thing about this heuristic is - it avoids clustering of too many
448   // sub-word loads, and also avoids clustering of wide loads. Below is the
449   // brief summary of how the heuristic behaves for various `LoadSize`.
450   // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
451   // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
452   // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
453   // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
454   // (5) LoadSize >= 17: do not cluster
455   const unsigned LoadSize = NumBytes / NumLoads;
456   const unsigned NumDWORDs = ((LoadSize + 3) / 4) * NumLoads;
457   return NumDWORDs <= 8;
458 }
459 
460 // FIXME: This behaves strangely. If, for example, you have 32 load + stores,
461 // the first 16 loads will be interleaved with the stores, and the next 16 will
462 // be clustered as expected. It should really split into 2 16 store batches.
463 //
464 // Loads are clustered until this returns false, rather than trying to schedule
465 // groups of stores. This also means we have to deal with saying different
466 // address space loads should be clustered, and ones which might cause bank
467 // conflicts.
468 //
469 // This might be deprecated so it might not be worth that much effort to fix.
470 bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
471                                           int64_t Offset0, int64_t Offset1,
472                                           unsigned NumLoads) const {
473   assert(Offset1 > Offset0 &&
474          "Second offset should be larger than first offset!");
475   // If we have less than 16 loads in a row, and the offsets are within 64
476   // bytes, then schedule together.
477 
478   // A cacheline is 64 bytes (for global memory).
479   return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
480 }
481 
482 static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
483                               MachineBasicBlock::iterator MI,
484                               const DebugLoc &DL, MCRegister DestReg,
485                               MCRegister SrcReg, bool KillSrc,
486                               const char *Msg = "illegal SGPR to VGPR copy") {
487   MachineFunction *MF = MBB.getParent();
488   DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(), Msg, DL, DS_Error);
489   LLVMContext &C = MF->getFunction().getContext();
490   C.diagnose(IllegalCopy);
491 
492   BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
493     .addReg(SrcReg, getKillRegState(KillSrc));
494 }
495 
496 /// Handle copying from SGPR to AGPR, or from AGPR to AGPR. It is not possible
497 /// to directly copy, so an intermediate VGPR needs to be used.
498 static void indirectCopyToAGPR(const SIInstrInfo &TII,
499                                MachineBasicBlock &MBB,
500                                MachineBasicBlock::iterator MI,
501                                const DebugLoc &DL, MCRegister DestReg,
502                                MCRegister SrcReg, bool KillSrc,
503                                RegScavenger &RS,
504                                Register ImpDefSuperReg = Register(),
505                                Register ImpUseSuperReg = Register()) {
506   const SIRegisterInfo &RI = TII.getRegisterInfo();
507 
508   assert(AMDGPU::SReg_32RegClass.contains(SrcReg) ||
509          AMDGPU::AGPR_32RegClass.contains(SrcReg));
510 
511   // First try to find defining accvgpr_write to avoid temporary registers.
512   for (auto Def = MI, E = MBB.begin(); Def != E; ) {
513     --Def;
514     if (!Def->definesRegister(SrcReg, &RI))
515       continue;
516     if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
517       break;
518 
519     MachineOperand &DefOp = Def->getOperand(1);
520     assert(DefOp.isReg() || DefOp.isImm());
521 
522     if (DefOp.isReg()) {
523       // Check that register source operand if not clobbered before MI.
524       // Immediate operands are always safe to propagate.
525       bool SafeToPropagate = true;
526       for (auto I = Def; I != MI && SafeToPropagate; ++I)
527         if (I->modifiesRegister(DefOp.getReg(), &RI))
528           SafeToPropagate = false;
529 
530       if (!SafeToPropagate)
531         break;
532 
533       DefOp.setIsKill(false);
534     }
535 
536     MachineInstrBuilder Builder =
537       BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
538       .add(DefOp);
539     if (ImpDefSuperReg)
540       Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
541 
542     if (ImpUseSuperReg) {
543       Builder.addReg(ImpUseSuperReg,
544                      getKillRegState(KillSrc) | RegState::Implicit);
545     }
546 
547     return;
548   }
549 
550   RS.enterBasicBlock(MBB);
551   RS.forward(MI);
552 
553   // Ideally we want to have three registers for a long reg_sequence copy
554   // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
555   unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
556                                              *MBB.getParent());
557 
558   // Registers in the sequence are allocated contiguously so we can just
559   // use register number to pick one of three round-robin temps.
560   unsigned RegNo = DestReg % 3;
561   Register Tmp = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0);
562   if (!Tmp)
563     report_fatal_error("Cannot scavenge VGPR to copy to AGPR");
564   RS.setRegUsed(Tmp);
565 
566   if (!TII.getSubtarget().hasGFX90AInsts()) {
567     // Only loop through if there are any free registers left, otherwise
568     // scavenger may report a fatal error without emergency spill slot
569     // or spill with the slot.
570     while (RegNo-- && RS.FindUnusedReg(&AMDGPU::VGPR_32RegClass)) {
571       Register Tmp2 = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0);
572       if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
573         break;
574       Tmp = Tmp2;
575       RS.setRegUsed(Tmp);
576     }
577   }
578 
579   // Insert copy to temporary VGPR.
580   unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
581   if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {
582     TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
583   } else {
584     assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
585   }
586 
587   MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
588     .addReg(SrcReg, getKillRegState(KillSrc));
589   if (ImpUseSuperReg) {
590     UseBuilder.addReg(ImpUseSuperReg,
591                       getKillRegState(KillSrc) | RegState::Implicit);
592   }
593 
594   MachineInstrBuilder DefBuilder
595     = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
596     .addReg(Tmp, RegState::Kill);
597 
598   if (ImpDefSuperReg)
599     DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
600 }
601 
602 static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
603                            MachineBasicBlock::iterator MI, const DebugLoc &DL,
604                            MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
605                            const TargetRegisterClass *RC, bool Forward) {
606   const SIRegisterInfo &RI = TII.getRegisterInfo();
607   ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
608   MachineBasicBlock::iterator I = MI;
609   MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
610 
611   for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
612     int16_t SubIdx = BaseIndices[Idx];
613     Register Reg = RI.getSubReg(DestReg, SubIdx);
614     unsigned Opcode = AMDGPU::S_MOV_B32;
615 
616     // Is SGPR aligned? If so try to combine with next.
617     Register Src = RI.getSubReg(SrcReg, SubIdx);
618     bool AlignedDest = ((Reg - AMDGPU::SGPR0) % 2) == 0;
619     bool AlignedSrc = ((Src - AMDGPU::SGPR0) % 2) == 0;
620     if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
621       // Can use SGPR64 copy
622       unsigned Channel = RI.getChannelFromSubReg(SubIdx);
623       SubIdx = RI.getSubRegFromChannel(Channel, 2);
624       Opcode = AMDGPU::S_MOV_B64;
625       Idx++;
626     }
627 
628     LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), RI.getSubReg(DestReg, SubIdx))
629                  .addReg(RI.getSubReg(SrcReg, SubIdx))
630                  .addReg(SrcReg, RegState::Implicit);
631 
632     if (!FirstMI)
633       FirstMI = LastMI;
634 
635     if (!Forward)
636       I--;
637   }
638 
639   assert(FirstMI && LastMI);
640   if (!Forward)
641     std::swap(FirstMI, LastMI);
642 
643   FirstMI->addOperand(
644       MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));
645 
646   if (KillSrc)
647     LastMI->addRegisterKilled(SrcReg, &RI);
648 }
649 
650 void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
651                               MachineBasicBlock::iterator MI,
652                               const DebugLoc &DL, MCRegister DestReg,
653                               MCRegister SrcReg, bool KillSrc) const {
654   const TargetRegisterClass *RC = RI.getPhysRegClass(DestReg);
655 
656   // FIXME: This is hack to resolve copies between 16 bit and 32 bit
657   // registers until all patterns are fixed.
658   if (Fix16BitCopies &&
659       ((RI.getRegSizeInBits(*RC) == 16) ^
660        (RI.getRegSizeInBits(*RI.getPhysRegClass(SrcReg)) == 16))) {
661     MCRegister &RegToFix = (RI.getRegSizeInBits(*RC) == 16) ? DestReg : SrcReg;
662     MCRegister Super = RI.get32BitRegister(RegToFix);
663     assert(RI.getSubReg(Super, AMDGPU::lo16) == RegToFix);
664     RegToFix = Super;
665 
666     if (DestReg == SrcReg) {
667       // Insert empty bundle since ExpandPostRA expects an instruction here.
668       BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
669       return;
670     }
671 
672     RC = RI.getPhysRegClass(DestReg);
673   }
674 
675   if (RC == &AMDGPU::VGPR_32RegClass) {
676     assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
677            AMDGPU::SReg_32RegClass.contains(SrcReg) ||
678            AMDGPU::AGPR_32RegClass.contains(SrcReg));
679     unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
680                      AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
681     BuildMI(MBB, MI, DL, get(Opc), DestReg)
682       .addReg(SrcReg, getKillRegState(KillSrc));
683     return;
684   }
685 
686   if (RC == &AMDGPU::SReg_32_XM0RegClass ||
687       RC == &AMDGPU::SReg_32RegClass) {
688     if (SrcReg == AMDGPU::SCC) {
689       BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
690           .addImm(1)
691           .addImm(0);
692       return;
693     }
694 
695     if (DestReg == AMDGPU::VCC_LO) {
696       if (AMDGPU::SReg_32RegClass.contains(SrcReg)) {
697         BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::VCC_LO)
698           .addReg(SrcReg, getKillRegState(KillSrc));
699       } else {
700         // FIXME: Hack until VReg_1 removed.
701         assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
702         BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
703           .addImm(0)
704           .addReg(SrcReg, getKillRegState(KillSrc));
705       }
706 
707       return;
708     }
709 
710     if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
711       reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
712       return;
713     }
714 
715     BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
716             .addReg(SrcReg, getKillRegState(KillSrc));
717     return;
718   }
719 
720   if (RC == &AMDGPU::SReg_64RegClass) {
721     if (SrcReg == AMDGPU::SCC) {
722       BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
723           .addImm(1)
724           .addImm(0);
725       return;
726     }
727 
728     if (DestReg == AMDGPU::VCC) {
729       if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
730         BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
731           .addReg(SrcReg, getKillRegState(KillSrc));
732       } else {
733         // FIXME: Hack until VReg_1 removed.
734         assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
735         BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
736           .addImm(0)
737           .addReg(SrcReg, getKillRegState(KillSrc));
738       }
739 
740       return;
741     }
742 
743     if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
744       reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
745       return;
746     }
747 
748     BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
749             .addReg(SrcReg, getKillRegState(KillSrc));
750     return;
751   }
752 
753   if (DestReg == AMDGPU::SCC) {
754     // Copying 64-bit or 32-bit sources to SCC barely makes sense,
755     // but SelectionDAG emits such copies for i1 sources.
756     if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
757       // This copy can only be produced by patterns
758       // with explicit SCC, which are known to be enabled
759       // only for subtargets with S_CMP_LG_U64 present.
760       assert(ST.hasScalarCompareEq64());
761       BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))
762           .addReg(SrcReg, getKillRegState(KillSrc))
763           .addImm(0);
764     } else {
765       assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
766       BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
767           .addReg(SrcReg, getKillRegState(KillSrc))
768           .addImm(0);
769     }
770 
771     return;
772   }
773 
774   if (RC == &AMDGPU::AGPR_32RegClass) {
775     if (AMDGPU::VGPR_32RegClass.contains(SrcReg)) {
776       BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
777         .addReg(SrcReg, getKillRegState(KillSrc));
778       return;
779     }
780 
781     if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
782       BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
783         .addReg(SrcReg, getKillRegState(KillSrc));
784       return;
785     }
786 
787     // FIXME: Pass should maintain scavenger to avoid scan through the block on
788     // every AGPR spill.
789     RegScavenger RS;
790     indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS);
791     return;
792   }
793 
794   const unsigned Size = RI.getRegSizeInBits(*RC);
795   if (Size == 16) {
796     assert(AMDGPU::VGPR_LO16RegClass.contains(SrcReg) ||
797            AMDGPU::VGPR_HI16RegClass.contains(SrcReg) ||
798            AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
799            AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
800 
801     bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
802     bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
803     bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
804     bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
805     bool DstLow = AMDGPU::VGPR_LO16RegClass.contains(DestReg) ||
806                   AMDGPU::SReg_LO16RegClass.contains(DestReg) ||
807                   AMDGPU::AGPR_LO16RegClass.contains(DestReg);
808     bool SrcLow = AMDGPU::VGPR_LO16RegClass.contains(SrcReg) ||
809                   AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
810                   AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
811     MCRegister NewDestReg = RI.get32BitRegister(DestReg);
812     MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);
813 
814     if (IsSGPRDst) {
815       if (!IsSGPRSrc) {
816         reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
817         return;
818       }
819 
820       BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
821         .addReg(NewSrcReg, getKillRegState(KillSrc));
822       return;
823     }
824 
825     if (IsAGPRDst || IsAGPRSrc) {
826       if (!DstLow || !SrcLow) {
827         reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
828                           "Cannot use hi16 subreg with an AGPR!");
829       }
830 
831       copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
832       return;
833     }
834 
835     if (IsSGPRSrc && !ST.hasSDWAScalar()) {
836       if (!DstLow || !SrcLow) {
837         reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
838                           "Cannot use hi16 subreg on VI!");
839       }
840 
841       BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
842         .addReg(NewSrcReg, getKillRegState(KillSrc));
843       return;
844     }
845 
846     auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
847       .addImm(0) // src0_modifiers
848       .addReg(NewSrcReg)
849       .addImm(0) // clamp
850       .addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0
851                      : AMDGPU::SDWA::SdwaSel::WORD_1)
852       .addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE)
853       .addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0
854                      : AMDGPU::SDWA::SdwaSel::WORD_1)
855       .addReg(NewDestReg, RegState::Implicit | RegState::Undef);
856     // First implicit operand is $exec.
857     MIB->tieOperands(0, MIB->getNumOperands() - 1);
858     return;
859   }
860 
861   const TargetRegisterClass *SrcRC = RI.getPhysRegClass(SrcReg);
862   if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
863     if (ST.hasPackedFP32Ops()) {
864       BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
865         .addImm(SISrcMods::OP_SEL_1)
866         .addReg(SrcReg)
867         .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
868         .addReg(SrcReg)
869         .addImm(0) // op_sel_lo
870         .addImm(0) // op_sel_hi
871         .addImm(0) // neg_lo
872         .addImm(0) // neg_hi
873         .addImm(0) // clamp
874         .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
875       return;
876     }
877   }
878 
879   const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
880   if (RI.isSGPRClass(RC)) {
881     if (!RI.isSGPRClass(SrcRC)) {
882       reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
883       return;
884     }
885     expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RC, Forward);
886     return;
887   }
888 
889   unsigned EltSize = 4;
890   unsigned Opcode = AMDGPU::V_MOV_B32_e32;
891   if (RI.hasAGPRs(RC)) {
892     Opcode = (RI.hasVGPRs(SrcRC)) ?
893       AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
894   } else if (RI.hasVGPRs(RC) && RI.hasAGPRs(SrcRC)) {
895     Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
896   } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
897              (RI.isProperlyAlignedRC(*RC) &&
898               (SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
899     // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
900     if (ST.hasPackedFP32Ops()) {
901       Opcode = AMDGPU::V_PK_MOV_B32;
902       EltSize = 8;
903     }
904   }
905 
906   // For the cases where we need an intermediate instruction/temporary register
907   // (destination is an AGPR), we need a scavenger.
908   //
909   // FIXME: The pass should maintain this for us so we don't have to re-scan the
910   // whole block for every handled copy.
911   std::unique_ptr<RegScavenger> RS;
912   if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
913     RS.reset(new RegScavenger());
914 
915   ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
916 
917   // If there is an overlap, we can't kill the super-register on the last
918   // instruction, since it will also kill the components made live by this def.
919   const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
920 
921   for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
922     unsigned SubIdx;
923     if (Forward)
924       SubIdx = SubIndices[Idx];
925     else
926       SubIdx = SubIndices[SubIndices.size() - Idx - 1];
927 
928     bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
929 
930     if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
931       Register ImpDefSuper = Idx == 0 ? Register(DestReg) : Register();
932       Register ImpUseSuper = SrcReg;
933       indirectCopyToAGPR(*this, MBB, MI, DL, RI.getSubReg(DestReg, SubIdx),
934                          RI.getSubReg(SrcReg, SubIdx), UseKill, *RS,
935                          ImpDefSuper, ImpUseSuper);
936     } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
937       Register DstSubReg = RI.getSubReg(DestReg, SubIdx);
938       Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
939       MachineInstrBuilder MIB =
940         BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DstSubReg)
941         .addImm(SISrcMods::OP_SEL_1)
942         .addReg(SrcSubReg)
943         .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
944         .addReg(SrcSubReg)
945         .addImm(0) // op_sel_lo
946         .addImm(0) // op_sel_hi
947         .addImm(0) // neg_lo
948         .addImm(0) // neg_hi
949         .addImm(0) // clamp
950         .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
951       if (Idx == 0)
952         MIB.addReg(DestReg, RegState::Define | RegState::Implicit);
953     } else {
954       MachineInstrBuilder Builder =
955         BuildMI(MBB, MI, DL, get(Opcode), RI.getSubReg(DestReg, SubIdx))
956         .addReg(RI.getSubReg(SrcReg, SubIdx));
957       if (Idx == 0)
958         Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
959 
960       Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
961     }
962   }
963 }
964 
965 int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
966   int NewOpc;
967 
968   // Try to map original to commuted opcode
969   NewOpc = AMDGPU::getCommuteRev(Opcode);
970   if (NewOpc != -1)
971     // Check if the commuted (REV) opcode exists on the target.
972     return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
973 
974   // Try to map commuted to original opcode
975   NewOpc = AMDGPU::getCommuteOrig(Opcode);
976   if (NewOpc != -1)
977     // Check if the original (non-REV) opcode exists on the target.
978     return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
979 
980   return Opcode;
981 }
982 
983 void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB,
984                                        MachineBasicBlock::iterator MI,
985                                        const DebugLoc &DL, unsigned DestReg,
986                                        int64_t Value) const {
987   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
988   const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg);
989   if (RegClass == &AMDGPU::SReg_32RegClass ||
990       RegClass == &AMDGPU::SGPR_32RegClass ||
991       RegClass == &AMDGPU::SReg_32_XM0RegClass ||
992       RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) {
993     BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
994       .addImm(Value);
995     return;
996   }
997 
998   if (RegClass == &AMDGPU::SReg_64RegClass ||
999       RegClass == &AMDGPU::SGPR_64RegClass ||
1000       RegClass == &AMDGPU::SReg_64_XEXECRegClass) {
1001     BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
1002       .addImm(Value);
1003     return;
1004   }
1005 
1006   if (RegClass == &AMDGPU::VGPR_32RegClass) {
1007     BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
1008       .addImm(Value);
1009     return;
1010   }
1011   if (RegClass->hasSuperClassEq(&AMDGPU::VReg_64RegClass)) {
1012     BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg)
1013       .addImm(Value);
1014     return;
1015   }
1016 
1017   unsigned EltSize = 4;
1018   unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1019   if (RI.isSGPRClass(RegClass)) {
1020     if (RI.getRegSizeInBits(*RegClass) > 32) {
1021       Opcode =  AMDGPU::S_MOV_B64;
1022       EltSize = 8;
1023     } else {
1024       Opcode = AMDGPU::S_MOV_B32;
1025       EltSize = 4;
1026     }
1027   }
1028 
1029   ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize);
1030   for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1031     int64_t IdxValue = Idx == 0 ? Value : 0;
1032 
1033     MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
1034       get(Opcode), RI.getSubReg(DestReg, SubIndices[Idx]));
1035     Builder.addImm(IdxValue);
1036   }
1037 }
1038 
1039 const TargetRegisterClass *
1040 SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
1041   return &AMDGPU::VGPR_32RegClass;
1042 }
1043 
1044 void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
1045                                      MachineBasicBlock::iterator I,
1046                                      const DebugLoc &DL, Register DstReg,
1047                                      ArrayRef<MachineOperand> Cond,
1048                                      Register TrueReg,
1049                                      Register FalseReg) const {
1050   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1051   const TargetRegisterClass *BoolXExecRC =
1052     RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
1053   assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
1054          "Not a VGPR32 reg");
1055 
1056   if (Cond.size() == 1) {
1057     Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1058     BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1059       .add(Cond[0]);
1060     BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1061       .addImm(0)
1062       .addReg(FalseReg)
1063       .addImm(0)
1064       .addReg(TrueReg)
1065       .addReg(SReg);
1066   } else if (Cond.size() == 2) {
1067     assert(Cond[0].isImm() && "Cond[0] is not an immediate");
1068     switch (Cond[0].getImm()) {
1069     case SIInstrInfo::SCC_TRUE: {
1070       Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1071       BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1072                                             : AMDGPU::S_CSELECT_B64), SReg)
1073         .addImm(1)
1074         .addImm(0);
1075       BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1076         .addImm(0)
1077         .addReg(FalseReg)
1078         .addImm(0)
1079         .addReg(TrueReg)
1080         .addReg(SReg);
1081       break;
1082     }
1083     case SIInstrInfo::SCC_FALSE: {
1084       Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1085       BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1086                                             : AMDGPU::S_CSELECT_B64), SReg)
1087         .addImm(0)
1088         .addImm(1);
1089       BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1090         .addImm(0)
1091         .addReg(FalseReg)
1092         .addImm(0)
1093         .addReg(TrueReg)
1094         .addReg(SReg);
1095       break;
1096     }
1097     case SIInstrInfo::VCCNZ: {
1098       MachineOperand RegOp = Cond[1];
1099       RegOp.setImplicit(false);
1100       Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1101       BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1102         .add(RegOp);
1103       BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1104           .addImm(0)
1105           .addReg(FalseReg)
1106           .addImm(0)
1107           .addReg(TrueReg)
1108           .addReg(SReg);
1109       break;
1110     }
1111     case SIInstrInfo::VCCZ: {
1112       MachineOperand RegOp = Cond[1];
1113       RegOp.setImplicit(false);
1114       Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1115       BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1116         .add(RegOp);
1117       BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1118           .addImm(0)
1119           .addReg(TrueReg)
1120           .addImm(0)
1121           .addReg(FalseReg)
1122           .addReg(SReg);
1123       break;
1124     }
1125     case SIInstrInfo::EXECNZ: {
1126       Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1127       Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1128       BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
1129                                             : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
1130         .addImm(0);
1131       BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1132                                             : AMDGPU::S_CSELECT_B64), SReg)
1133         .addImm(1)
1134         .addImm(0);
1135       BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1136         .addImm(0)
1137         .addReg(FalseReg)
1138         .addImm(0)
1139         .addReg(TrueReg)
1140         .addReg(SReg);
1141       break;
1142     }
1143     case SIInstrInfo::EXECZ: {
1144       Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1145       Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1146       BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
1147                                             : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
1148         .addImm(0);
1149       BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1150                                             : AMDGPU::S_CSELECT_B64), SReg)
1151         .addImm(0)
1152         .addImm(1);
1153       BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1154         .addImm(0)
1155         .addReg(FalseReg)
1156         .addImm(0)
1157         .addReg(TrueReg)
1158         .addReg(SReg);
1159       llvm_unreachable("Unhandled branch predicate EXECZ");
1160       break;
1161     }
1162     default:
1163       llvm_unreachable("invalid branch predicate");
1164     }
1165   } else {
1166     llvm_unreachable("Can only handle Cond size 1 or 2");
1167   }
1168 }
1169 
1170 Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
1171                                MachineBasicBlock::iterator I,
1172                                const DebugLoc &DL,
1173                                Register SrcReg, int Value) const {
1174   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1175   Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1176   BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
1177     .addImm(Value)
1178     .addReg(SrcReg);
1179 
1180   return Reg;
1181 }
1182 
1183 Register SIInstrInfo::insertNE(MachineBasicBlock *MBB,
1184                                MachineBasicBlock::iterator I,
1185                                const DebugLoc &DL,
1186                                Register SrcReg, int Value) const {
1187   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1188   Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1189   BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
1190     .addImm(Value)
1191     .addReg(SrcReg);
1192 
1193   return Reg;
1194 }
1195 
1196 unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
1197 
1198   if (RI.hasAGPRs(DstRC))
1199     return AMDGPU::COPY;
1200   if (RI.getRegSizeInBits(*DstRC) == 32) {
1201     return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1202   } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) {
1203     return AMDGPU::S_MOV_B64;
1204   } else if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC)) {
1205     return  AMDGPU::V_MOV_B64_PSEUDO;
1206   }
1207   return AMDGPU::COPY;
1208 }
1209 
1210 const MCInstrDesc &
1211 SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize,
1212                                      bool IsIndirectSrc) const {
1213   if (IsIndirectSrc) {
1214     if (VecSize <= 32) // 4 bytes
1215       return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
1216     if (VecSize <= 64) // 8 bytes
1217       return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
1218     if (VecSize <= 96) // 12 bytes
1219       return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
1220     if (VecSize <= 128) // 16 bytes
1221       return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
1222     if (VecSize <= 160) // 20 bytes
1223       return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
1224     if (VecSize <= 256) // 32 bytes
1225       return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
1226     if (VecSize <= 512) // 64 bytes
1227       return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
1228     if (VecSize <= 1024) // 128 bytes
1229       return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);
1230 
1231     llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
1232   }
1233 
1234   if (VecSize <= 32) // 4 bytes
1235     return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
1236   if (VecSize <= 64) // 8 bytes
1237     return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
1238   if (VecSize <= 96) // 12 bytes
1239     return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
1240   if (VecSize <= 128) // 16 bytes
1241     return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
1242   if (VecSize <= 160) // 20 bytes
1243     return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
1244   if (VecSize <= 256) // 32 bytes
1245     return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
1246   if (VecSize <= 512) // 64 bytes
1247     return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
1248   if (VecSize <= 1024) // 128 bytes
1249     return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);
1250 
1251   llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
1252 }
1253 
1254 static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
1255   if (VecSize <= 32) // 4 bytes
1256     return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1257   if (VecSize <= 64) // 8 bytes
1258     return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1259   if (VecSize <= 96) // 12 bytes
1260     return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1261   if (VecSize <= 128) // 16 bytes
1262     return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1263   if (VecSize <= 160) // 20 bytes
1264     return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1265   if (VecSize <= 256) // 32 bytes
1266     return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1267   if (VecSize <= 512) // 64 bytes
1268     return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1269   if (VecSize <= 1024) // 128 bytes
1270     return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1271 
1272   llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1273 }
1274 
1275 static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
1276   if (VecSize <= 32) // 4 bytes
1277     return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1278   if (VecSize <= 64) // 8 bytes
1279     return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1280   if (VecSize <= 96) // 12 bytes
1281     return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1282   if (VecSize <= 128) // 16 bytes
1283     return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1284   if (VecSize <= 160) // 20 bytes
1285     return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1286   if (VecSize <= 256) // 32 bytes
1287     return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1288   if (VecSize <= 512) // 64 bytes
1289     return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1290   if (VecSize <= 1024) // 128 bytes
1291     return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1292 
1293   llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1294 }
1295 
1296 static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
1297   if (VecSize <= 64) // 8 bytes
1298     return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
1299   if (VecSize <= 128) // 16 bytes
1300     return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
1301   if (VecSize <= 256) // 32 bytes
1302     return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
1303   if (VecSize <= 512) // 64 bytes
1304     return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
1305   if (VecSize <= 1024) // 128 bytes
1306     return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
1307 
1308   llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1309 }
1310 
1311 const MCInstrDesc &
1312 SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
1313                                              bool IsSGPR) const {
1314   if (IsSGPR) {
1315     switch (EltSize) {
1316     case 32:
1317       return get(getIndirectSGPRWriteMovRelPseudo32(VecSize));
1318     case 64:
1319       return get(getIndirectSGPRWriteMovRelPseudo64(VecSize));
1320     default:
1321       llvm_unreachable("invalid reg indexing elt size");
1322     }
1323   }
1324 
1325   assert(EltSize == 32 && "invalid reg indexing elt size");
1326   return get(getIndirectVGPRWriteMovRelPseudoOpc(VecSize));
1327 }
1328 
1329 static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
1330   switch (Size) {
1331   case 4:
1332     return AMDGPU::SI_SPILL_S32_SAVE;
1333   case 8:
1334     return AMDGPU::SI_SPILL_S64_SAVE;
1335   case 12:
1336     return AMDGPU::SI_SPILL_S96_SAVE;
1337   case 16:
1338     return AMDGPU::SI_SPILL_S128_SAVE;
1339   case 20:
1340     return AMDGPU::SI_SPILL_S160_SAVE;
1341   case 24:
1342     return AMDGPU::SI_SPILL_S192_SAVE;
1343   case 28:
1344     return AMDGPU::SI_SPILL_S224_SAVE;
1345   case 32:
1346     return AMDGPU::SI_SPILL_S256_SAVE;
1347   case 64:
1348     return AMDGPU::SI_SPILL_S512_SAVE;
1349   case 128:
1350     return AMDGPU::SI_SPILL_S1024_SAVE;
1351   default:
1352     llvm_unreachable("unknown register size");
1353   }
1354 }
1355 
1356 static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
1357   switch (Size) {
1358   case 4:
1359     return AMDGPU::SI_SPILL_V32_SAVE;
1360   case 8:
1361     return AMDGPU::SI_SPILL_V64_SAVE;
1362   case 12:
1363     return AMDGPU::SI_SPILL_V96_SAVE;
1364   case 16:
1365     return AMDGPU::SI_SPILL_V128_SAVE;
1366   case 20:
1367     return AMDGPU::SI_SPILL_V160_SAVE;
1368   case 24:
1369     return AMDGPU::SI_SPILL_V192_SAVE;
1370   case 28:
1371     return AMDGPU::SI_SPILL_V224_SAVE;
1372   case 32:
1373     return AMDGPU::SI_SPILL_V256_SAVE;
1374   case 64:
1375     return AMDGPU::SI_SPILL_V512_SAVE;
1376   case 128:
1377     return AMDGPU::SI_SPILL_V1024_SAVE;
1378   default:
1379     llvm_unreachable("unknown register size");
1380   }
1381 }
1382 
1383 static unsigned getAGPRSpillSaveOpcode(unsigned Size) {
1384   switch (Size) {
1385   case 4:
1386     return AMDGPU::SI_SPILL_A32_SAVE;
1387   case 8:
1388     return AMDGPU::SI_SPILL_A64_SAVE;
1389   case 12:
1390     return AMDGPU::SI_SPILL_A96_SAVE;
1391   case 16:
1392     return AMDGPU::SI_SPILL_A128_SAVE;
1393   case 20:
1394     return AMDGPU::SI_SPILL_A160_SAVE;
1395   case 24:
1396     return AMDGPU::SI_SPILL_A192_SAVE;
1397   case 28:
1398     return AMDGPU::SI_SPILL_A224_SAVE;
1399   case 32:
1400     return AMDGPU::SI_SPILL_A256_SAVE;
1401   case 64:
1402     return AMDGPU::SI_SPILL_A512_SAVE;
1403   case 128:
1404     return AMDGPU::SI_SPILL_A1024_SAVE;
1405   default:
1406     llvm_unreachable("unknown register size");
1407   }
1408 }
1409 
1410 void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
1411                                       MachineBasicBlock::iterator MI,
1412                                       Register SrcReg, bool isKill,
1413                                       int FrameIndex,
1414                                       const TargetRegisterClass *RC,
1415                                       const TargetRegisterInfo *TRI) const {
1416   MachineFunction *MF = MBB.getParent();
1417   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1418   MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1419   const DebugLoc &DL = MBB.findDebugLoc(MI);
1420 
1421   MachinePointerInfo PtrInfo
1422     = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1423   MachineMemOperand *MMO = MF->getMachineMemOperand(
1424       PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
1425       FrameInfo.getObjectAlign(FrameIndex));
1426   unsigned SpillSize = TRI->getSpillSize(*RC);
1427 
1428   if (RI.isSGPRClass(RC)) {
1429     MFI->setHasSpilledSGPRs();
1430     assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
1431     assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
1432            SrcReg != AMDGPU::EXEC && "exec should not be spilled");
1433 
1434     // We are only allowed to create one new instruction when spilling
1435     // registers, so we need to use a pseudo instruction for spilling SGPRs.
1436     const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
1437 
1438     // The SGPR spill/restore instructions only work on numbered SGPRs, so we
1439     // need to make sure we are using the correct register class.
1440     if (SrcReg.isVirtual() && SpillSize == 4) {
1441       MachineRegisterInfo &MRI = MF->getRegInfo();
1442       MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1443     }
1444 
1445     BuildMI(MBB, MI, DL, OpDesc)
1446       .addReg(SrcReg, getKillRegState(isKill)) // data
1447       .addFrameIndex(FrameIndex)               // addr
1448       .addMemOperand(MMO)
1449       .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1450 
1451     if (RI.spillSGPRToVGPR())
1452       FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1453     return;
1454   }
1455 
1456   unsigned Opcode = RI.hasAGPRs(RC) ? getAGPRSpillSaveOpcode(SpillSize)
1457                                     : getVGPRSpillSaveOpcode(SpillSize);
1458   MFI->setHasSpilledVGPRs();
1459 
1460   BuildMI(MBB, MI, DL, get(Opcode))
1461     .addReg(SrcReg, getKillRegState(isKill)) // data
1462     .addFrameIndex(FrameIndex)               // addr
1463     .addReg(MFI->getStackPtrOffsetReg())     // scratch_offset
1464     .addImm(0)                               // offset
1465     .addMemOperand(MMO);
1466 }
1467 
1468 static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
1469   switch (Size) {
1470   case 4:
1471     return AMDGPU::SI_SPILL_S32_RESTORE;
1472   case 8:
1473     return AMDGPU::SI_SPILL_S64_RESTORE;
1474   case 12:
1475     return AMDGPU::SI_SPILL_S96_RESTORE;
1476   case 16:
1477     return AMDGPU::SI_SPILL_S128_RESTORE;
1478   case 20:
1479     return AMDGPU::SI_SPILL_S160_RESTORE;
1480   case 24:
1481     return AMDGPU::SI_SPILL_S192_RESTORE;
1482   case 28:
1483     return AMDGPU::SI_SPILL_S224_RESTORE;
1484   case 32:
1485     return AMDGPU::SI_SPILL_S256_RESTORE;
1486   case 64:
1487     return AMDGPU::SI_SPILL_S512_RESTORE;
1488   case 128:
1489     return AMDGPU::SI_SPILL_S1024_RESTORE;
1490   default:
1491     llvm_unreachable("unknown register size");
1492   }
1493 }
1494 
1495 static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
1496   switch (Size) {
1497   case 4:
1498     return AMDGPU::SI_SPILL_V32_RESTORE;
1499   case 8:
1500     return AMDGPU::SI_SPILL_V64_RESTORE;
1501   case 12:
1502     return AMDGPU::SI_SPILL_V96_RESTORE;
1503   case 16:
1504     return AMDGPU::SI_SPILL_V128_RESTORE;
1505   case 20:
1506     return AMDGPU::SI_SPILL_V160_RESTORE;
1507   case 24:
1508     return AMDGPU::SI_SPILL_V192_RESTORE;
1509   case 28:
1510     return AMDGPU::SI_SPILL_V224_RESTORE;
1511   case 32:
1512     return AMDGPU::SI_SPILL_V256_RESTORE;
1513   case 64:
1514     return AMDGPU::SI_SPILL_V512_RESTORE;
1515   case 128:
1516     return AMDGPU::SI_SPILL_V1024_RESTORE;
1517   default:
1518     llvm_unreachable("unknown register size");
1519   }
1520 }
1521 
1522 static unsigned getAGPRSpillRestoreOpcode(unsigned Size) {
1523   switch (Size) {
1524   case 4:
1525     return AMDGPU::SI_SPILL_A32_RESTORE;
1526   case 8:
1527     return AMDGPU::SI_SPILL_A64_RESTORE;
1528   case 12:
1529     return AMDGPU::SI_SPILL_A96_RESTORE;
1530   case 16:
1531     return AMDGPU::SI_SPILL_A128_RESTORE;
1532   case 20:
1533     return AMDGPU::SI_SPILL_A160_RESTORE;
1534   case 24:
1535     return AMDGPU::SI_SPILL_A192_RESTORE;
1536   case 28:
1537     return AMDGPU::SI_SPILL_A224_RESTORE;
1538   case 32:
1539     return AMDGPU::SI_SPILL_A256_RESTORE;
1540   case 64:
1541     return AMDGPU::SI_SPILL_A512_RESTORE;
1542   case 128:
1543     return AMDGPU::SI_SPILL_A1024_RESTORE;
1544   default:
1545     llvm_unreachable("unknown register size");
1546   }
1547 }
1548 
1549 void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
1550                                        MachineBasicBlock::iterator MI,
1551                                        Register DestReg, int FrameIndex,
1552                                        const TargetRegisterClass *RC,
1553                                        const TargetRegisterInfo *TRI) const {
1554   MachineFunction *MF = MBB.getParent();
1555   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1556   MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1557   const DebugLoc &DL = MBB.findDebugLoc(MI);
1558   unsigned SpillSize = TRI->getSpillSize(*RC);
1559 
1560   MachinePointerInfo PtrInfo
1561     = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1562 
1563   MachineMemOperand *MMO = MF->getMachineMemOperand(
1564       PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
1565       FrameInfo.getObjectAlign(FrameIndex));
1566 
1567   if (RI.isSGPRClass(RC)) {
1568     MFI->setHasSpilledSGPRs();
1569     assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
1570     assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
1571            DestReg != AMDGPU::EXEC && "exec should not be spilled");
1572 
1573     // FIXME: Maybe this should not include a memoperand because it will be
1574     // lowered to non-memory instructions.
1575     const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
1576     if (DestReg.isVirtual() && SpillSize == 4) {
1577       MachineRegisterInfo &MRI = MF->getRegInfo();
1578       MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1579     }
1580 
1581     if (RI.spillSGPRToVGPR())
1582       FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1583     BuildMI(MBB, MI, DL, OpDesc, DestReg)
1584       .addFrameIndex(FrameIndex) // addr
1585       .addMemOperand(MMO)
1586       .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1587 
1588     return;
1589   }
1590 
1591   unsigned Opcode = RI.hasAGPRs(RC) ? getAGPRSpillRestoreOpcode(SpillSize)
1592                                     : getVGPRSpillRestoreOpcode(SpillSize);
1593   BuildMI(MBB, MI, DL, get(Opcode), DestReg)
1594     .addFrameIndex(FrameIndex)        // vaddr
1595     .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1596     .addImm(0)                           // offset
1597     .addMemOperand(MMO);
1598 }
1599 
1600 void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
1601                              MachineBasicBlock::iterator MI) const {
1602   insertNoops(MBB, MI, 1);
1603 }
1604 
1605 void SIInstrInfo::insertNoops(MachineBasicBlock &MBB,
1606                               MachineBasicBlock::iterator MI,
1607                               unsigned Quantity) const {
1608   DebugLoc DL = MBB.findDebugLoc(MI);
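  // S_NOP's immediate encodes (Imm + 1) wait states, so a single S_NOP covers
  // at most 8; emit as many S_NOPs as needed in chunks of up to 8.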
1609   while (Quantity > 0) {
1610     unsigned Arg = std::min(Quantity, 8u);
1611     Quantity -= Arg;
1612     BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
1613   }
1614 }
1615 
1616 void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
1617   auto MF = MBB.getParent();
1618   SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
1619 
1620   assert(Info->isEntryFunction());
1621 
1622   if (MBB.succ_empty()) {
1623     bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
1624     if (HasNoTerminator) {
1625       if (Info->returnsVoid()) {
1626         BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
1627       } else {
1628         BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
1629       }
1630     }
1631   }
1632 }
1633 
1634 unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
1635   switch (MI.getOpcode()) {
1636   default: return 1; // FIXME: Do wait states equal cycles?
1637 
1638   case AMDGPU::S_NOP:
1639     return MI.getOperand(0).getImm() + 1;
1640   }
1641 }
1642 
1643 bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
1644   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1645   MachineBasicBlock &MBB = *MI.getParent();
1646   DebugLoc DL = MBB.findDebugLoc(MI);
1647   switch (MI.getOpcode()) {
1648   default: return TargetInstrInfo::expandPostRAPseudo(MI);
1649   case AMDGPU::S_MOV_B64_term:
1650     // This is only a terminator to get the correct spill code placement during
1651     // register allocation.
1652     MI.setDesc(get(AMDGPU::S_MOV_B64));
1653     break;
1654 
1655   case AMDGPU::S_MOV_B32_term:
1656     // This is only a terminator to get the correct spill code placement during
1657     // register allocation.
1658     MI.setDesc(get(AMDGPU::S_MOV_B32));
1659     break;
1660 
1661   case AMDGPU::S_XOR_B64_term:
1662     // This is only a terminator to get the correct spill code placement during
1663     // register allocation.
1664     MI.setDesc(get(AMDGPU::S_XOR_B64));
1665     break;
1666 
1667   case AMDGPU::S_XOR_B32_term:
1668     // This is only a terminator to get the correct spill code placement during
1669     // register allocation.
1670     MI.setDesc(get(AMDGPU::S_XOR_B32));
1671     break;
1672   case AMDGPU::S_OR_B64_term:
1673     // This is only a terminator to get the correct spill code placement during
1674     // register allocation.
1675     MI.setDesc(get(AMDGPU::S_OR_B64));
1676     break;
1677   case AMDGPU::S_OR_B32_term:
1678     // This is only a terminator to get the correct spill code placement during
1679     // register allocation.
1680     MI.setDesc(get(AMDGPU::S_OR_B32));
1681     break;
1682 
1683   case AMDGPU::S_ANDN2_B64_term:
1684     // This is only a terminator to get the correct spill code placement during
1685     // register allocation.
1686     MI.setDesc(get(AMDGPU::S_ANDN2_B64));
1687     break;
1688 
1689   case AMDGPU::S_ANDN2_B32_term:
1690     // This is only a terminator to get the correct spill code placement during
1691     // register allocation.
1692     MI.setDesc(get(AMDGPU::S_ANDN2_B32));
1693     break;
1694 
1695   case AMDGPU::S_AND_B64_term:
1696     // This is only a terminator to get the correct spill code placement during
1697     // register allocation.
1698     MI.setDesc(get(AMDGPU::S_AND_B64));
1699     break;
1700 
1701   case AMDGPU::S_AND_B32_term:
1702     // This is only a terminator to get the correct spill code placement during
1703     // register allocation.
1704     MI.setDesc(get(AMDGPU::S_AND_B32));
1705     break;
1706 
1707   case AMDGPU::V_MOV_B64_PSEUDO: {
1708     Register Dst = MI.getOperand(0).getReg();
1709     Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
1710     Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
1711 
1712     const MachineOperand &SrcOp = MI.getOperand(1);
1713     // FIXME: Will this work for 64-bit floating point immediates?
1714     assert(!SrcOp.isFPImm());
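    // If packed FP32 ops are available and both 32-bit halves are the same
    // inline constant (or the source is a non-AGPR register), a single
    // V_PK_MOV_B32 writes the whole 64 bits; otherwise emit two V_MOV_B32_e32
    // writes to the sub0/sub1 halves.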
1715     if (SrcOp.isImm()) {
1716       APInt Imm(64, SrcOp.getImm());
1717       APInt Lo(32, Imm.getLoBits(32).getZExtValue());
1718       APInt Hi(32, Imm.getHiBits(32).getZExtValue());
1719       if (ST.hasPackedFP32Ops() && Lo == Hi && isInlineConstant(Lo)) {
1720         BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
1721           .addImm(SISrcMods::OP_SEL_1)
1722           .addImm(Lo.getSExtValue())
1723           .addImm(SISrcMods::OP_SEL_1)
1724           .addImm(Lo.getSExtValue())
1725           .addImm(0)  // op_sel_lo
1726           .addImm(0)  // op_sel_hi
1727           .addImm(0)  // neg_lo
1728           .addImm(0)  // neg_hi
1729           .addImm(0); // clamp
1730       } else {
1731         BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
1732           .addImm(Lo.getZExtValue())
1733           .addReg(Dst, RegState::Implicit | RegState::Define);
1734         BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
1735           .addImm(Hi.getZExtValue())
1736           .addReg(Dst, RegState::Implicit | RegState::Define);
1737       }
1738     } else {
1739       assert(SrcOp.isReg());
1740       if (ST.hasPackedFP32Ops() &&
1741           !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
1742         BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
1743           .addImm(SISrcMods::OP_SEL_1) // src0_mod
1744           .addReg(SrcOp.getReg())
1745           .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) // src1_mod
1746           .addReg(SrcOp.getReg())
1747           .addImm(0)  // op_sel_lo
1748           .addImm(0)  // op_sel_hi
1749           .addImm(0)  // neg_lo
1750           .addImm(0)  // neg_hi
1751           .addImm(0); // clamp
1752       } else {
1753         BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
1754           .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
1755           .addReg(Dst, RegState::Implicit | RegState::Define);
1756         BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
1757           .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
1758           .addReg(Dst, RegState::Implicit | RegState::Define);
1759       }
1760     }
1761     MI.eraseFromParent();
1762     break;
1763   }
1764   case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
1765     expandMovDPP64(MI);
1766     break;
1767   }
1768   case AMDGPU::V_SET_INACTIVE_B32: {
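    // Write operand 2 into the lanes that are currently inactive: temporarily
    // invert EXEC, perform the move, then invert EXEC back. SCC is clobbered
    // by the S_NOT.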
1769     unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
1770     unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
1771     auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
1772     FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
1773     BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
1774       .add(MI.getOperand(2));
1775     BuildMI(MBB, MI, DL, get(NotOpc), Exec)
1776       .addReg(Exec);
1777     MI.eraseFromParent();
1778     break;
1779   }
1780   case AMDGPU::V_SET_INACTIVE_B64: {
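    // Same as V_SET_INACTIVE_B32, but the copy is a V_MOV_B64_PSEUDO that is
    // expanded recursively.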
1781     unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
1782     unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
1783     auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
1784     FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
1785     MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
1786                                  MI.getOperand(0).getReg())
1787       .add(MI.getOperand(2));
1788     expandPostRAPseudo(*Copy);
1789     BuildMI(MBB, MI, DL, get(NotOpc), Exec)
1790       .addReg(Exec);
1791     MI.eraseFromParent();
1792     break;
1793   }
1794   case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
1795   case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
1796   case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
1797   case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
1798   case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
1799   case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
1800   case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
1801   case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
1802   case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
1803   case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
1804   case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
1805   case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
1806   case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
1807   case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
1808   case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
1809   case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
1810   case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
1811   case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
1812   case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
1813   case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
1814   case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
1815     const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);
1816 
1817     unsigned Opc;
1818     if (RI.hasVGPRs(EltRC)) {
1819       Opc = AMDGPU::V_MOVRELD_B32_e32;
1820     } else {
1821       Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
1822                                               : AMDGPU::S_MOVRELD_B32;
1823     }
1824 
1825     const MCInstrDesc &OpDesc = get(Opc);
1826     Register VecReg = MI.getOperand(0).getReg();
1827     bool IsUndef = MI.getOperand(1).isUndef();
1828     unsigned SubReg = MI.getOperand(3).getImm();
1829     assert(VecReg == MI.getOperand(1).getReg());
1830 
1831     MachineInstrBuilder MIB =
1832       BuildMI(MBB, MI, DL, OpDesc)
1833         .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
1834         .add(MI.getOperand(2))
1835         .addReg(VecReg, RegState::ImplicitDefine)
1836         .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
1837 
1838     const int ImpDefIdx =
1839       OpDesc.getNumOperands() + OpDesc.getNumImplicitUses();
1840     const int ImpUseIdx = ImpDefIdx + 1;
1841     MIB->tieOperands(ImpDefIdx, ImpUseIdx);
1842     MI.eraseFromParent();
1843     break;
1844   }
1845   case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
1846   case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
1847   case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
1848   case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
1849   case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
1850   case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
1851   case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
1852   case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
1853     assert(ST.useVGPRIndexMode());
1854     Register VecReg = MI.getOperand(0).getReg();
1855     bool IsUndef = MI.getOperand(1).isUndef();
1856     Register Idx = MI.getOperand(3).getReg();
1857     unsigned SubReg = MI.getOperand(4).getImm();
1858 
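    // Emit an S_SET_GPR_IDX_ON / indirect V_MOV_B32 / S_SET_GPR_IDX_OFF
    // sequence and bundle it so later passes cannot separate the index-mode
    // window.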
1859     MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
1860                               .addReg(Idx)
1861                               .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE);
1862     SetOn->getOperand(3).setIsUndef();
1863 
1864     const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect);
1865     MachineInstrBuilder MIB =
1866         BuildMI(MBB, MI, DL, OpDesc)
1867             .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
1868             .add(MI.getOperand(2))
1869             .addReg(VecReg, RegState::ImplicitDefine)
1870             .addReg(VecReg,
1871                     RegState::Implicit | (IsUndef ? RegState::Undef : 0));
1872 
1873     const int ImpDefIdx = OpDesc.getNumOperands() + OpDesc.getNumImplicitUses();
1874     const int ImpUseIdx = ImpDefIdx + 1;
1875     MIB->tieOperands(ImpDefIdx, ImpUseIdx);
1876 
1877     MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
1878 
1879     finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
1880 
1881     MI.eraseFromParent();
1882     break;
1883   }
1884   case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
1885   case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
1886   case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
1887   case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
1888   case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
1889   case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
1890   case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
1891   case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
1892     assert(ST.useVGPRIndexMode());
1893     Register Dst = MI.getOperand(0).getReg();
1894     Register VecReg = MI.getOperand(1).getReg();
1895     bool IsUndef = MI.getOperand(1).isUndef();
1896     Register Idx = MI.getOperand(2).getReg();
1897     unsigned SubReg = MI.getOperand(3).getImm();
1898 
1899     MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
1900                               .addReg(Idx)
1901                               .addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE);
1902     SetOn->getOperand(3).setIsUndef();
1903 
1904     BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32))
1905         .addDef(Dst)
1906         .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
1907         .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0))
1908         .addReg(AMDGPU::M0, RegState::Implicit);
1909 
1910     MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
1911 
1912     finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
1913 
1914     MI.eraseFromParent();
1915     break;
1916   }
1917   case AMDGPU::SI_PC_ADD_REL_OFFSET: {
1918     MachineFunction &MF = *MBB.getParent();
1919     Register Reg = MI.getOperand(0).getReg();
1920     Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
1921     Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
1922 
1923     // Create a bundle so these instructions won't be re-ordered by the
1924     // post-RA scheduler.
1925     MIBundleBuilder Bundler(MBB, MI);
1926     Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
1927 
1928     // Add 32-bit offset from this instruction to the start of the
1929     // constant data.
1930     Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo)
1931                        .addReg(RegLo)
1932                        .add(MI.getOperand(1)));
1933 
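    // Add the high half of the offset, plus the carry from the low-half add,
    // into the high 32 bits of the PC.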
1934     MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
1935                                   .addReg(RegHi);
1936     MIB.add(MI.getOperand(2));
1937 
1938     Bundler.append(MIB);
1939     finalizeBundle(MBB, Bundler.begin());
1940 
1941     MI.eraseFromParent();
1942     break;
1943   }
1944   case AMDGPU::ENTER_STRICT_WWM: {
1945     // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
1946     // Whole Wave Mode is entered.
1947     MI.setDesc(get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
1948                                  : AMDGPU::S_OR_SAVEEXEC_B64));
1949     break;
1950   }
1951   case AMDGPU::ENTER_STRICT_WQM: {
1952     // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
1953     // STRICT_WQM is entered.
1954     const unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
1955     const unsigned WQMOp = ST.isWave32() ? AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64;
1956     const unsigned MovOp = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
1957     BuildMI(MBB, MI, DL, get(MovOp), MI.getOperand(0).getReg()).addReg(Exec);
1958     BuildMI(MBB, MI, DL, get(WQMOp), Exec).addReg(Exec);
1959 
1960     MI.eraseFromParent();
1961     break;
1962   }
1963   case AMDGPU::EXIT_STRICT_WWM:
1964   case AMDGPU::EXIT_STRICT_WQM: {
1965     // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
1966     // WWM/STRICT_WQM is exited.
1967     MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
1968     break;
1969   }
1970   }
1971   return true;
1972 }
1973 
1974 std::pair<MachineInstr*, MachineInstr*>
1975 SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {
1976   assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
1977 
1978   MachineBasicBlock &MBB = *MI.getParent();
1979   DebugLoc DL = MBB.findDebugLoc(MI);
1980   MachineFunction *MF = MBB.getParent();
1981   MachineRegisterInfo &MRI = MF->getRegInfo();
1982   Register Dst = MI.getOperand(0).getReg();
1983   unsigned Part = 0;
1984   MachineInstr *Split[2];
1985 
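  // Split the 64-bit DPP move into two 32-bit V_MOV_B32_dpp instructions, one
  // per 32-bit half; for a virtual destination the halves are recombined with
  // a REG_SEQUENCE below.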
1986   for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
1987     auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
1988     if (Dst.isPhysical()) {
1989       MovDPP.addDef(RI.getSubReg(Dst, Sub));
1990     } else {
1991       assert(MRI.isSSA());
1992       auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1993       MovDPP.addDef(Tmp);
1994     }
1995 
1996     for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
1997       const MachineOperand &SrcOp = MI.getOperand(I);
1998       assert(!SrcOp.isFPImm());
1999       if (SrcOp.isImm()) {
2000         APInt Imm(64, SrcOp.getImm());
2001         Imm.ashrInPlace(Part * 32);
2002         MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
2003       } else {
2004         assert(SrcOp.isReg());
2005         Register Src = SrcOp.getReg();
2006         if (Src.isPhysical())
2007           MovDPP.addReg(RI.getSubReg(Src, Sub));
2008         else
2009           MovDPP.addReg(Src, SrcOp.isUndef() ? RegState::Undef : 0, Sub);
2010       }
2011     }
2012 
2013     for (unsigned I = 3; I < MI.getNumExplicitOperands(); ++I)
2014       MovDPP.addImm(MI.getOperand(I).getImm());
2015 
2016     Split[Part] = MovDPP;
2017     ++Part;
2018   }
2019 
2020   if (Dst.isVirtual())
2021     BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
2022       .addReg(Split[0]->getOperand(0).getReg())
2023       .addImm(AMDGPU::sub0)
2024       .addReg(Split[1]->getOperand(0).getReg())
2025       .addImm(AMDGPU::sub1);
2026 
2027   MI.eraseFromParent();
2028   return std::make_pair(Split[0], Split[1]);
2029 }
2030 
2031 bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,
2032                                       MachineOperand &Src0,
2033                                       unsigned Src0OpName,
2034                                       MachineOperand &Src1,
2035                                       unsigned Src1OpName) const {
2036   MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
2037   if (!Src0Mods)
2038     return false;
2039 
2040   MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
2041   assert(Src1Mods &&
2042          "All commutable instructions have both src0 and src1 modifiers");
2043 
2044   int Src0ModsVal = Src0Mods->getImm();
2045   int Src1ModsVal = Src1Mods->getImm();
2046 
2047   Src1Mods->setImm(Src0ModsVal);
2048   Src0Mods->setImm(Src1ModsVal);
2049   return true;
2050 }
2051 
2052 static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
2053                                              MachineOperand &RegOp,
2054                                              MachineOperand &NonRegOp) {
2055   Register Reg = RegOp.getReg();
2056   unsigned SubReg = RegOp.getSubReg();
2057   bool IsKill = RegOp.isKill();
2058   bool IsDead = RegOp.isDead();
2059   bool IsUndef = RegOp.isUndef();
2060   bool IsDebug = RegOp.isDebug();
2061 
2062   if (NonRegOp.isImm())
2063     RegOp.ChangeToImmediate(NonRegOp.getImm());
2064   else if (NonRegOp.isFI())
2065     RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
2066   else if (NonRegOp.isGlobal()) {
2067     RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(),
2068                      NonRegOp.getTargetFlags());
2069   } else
2070     return nullptr;
2071 
2072   // Make sure we don't reinterpret a subreg index in the target flags.
2073   RegOp.setTargetFlags(NonRegOp.getTargetFlags());
2074 
2075   NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
2076   NonRegOp.setSubReg(SubReg);
2077 
2078   return &MI;
2079 }
2080 
2081 MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
2082                                                   unsigned Src0Idx,
2083                                                   unsigned Src1Idx) const {
2084   assert(!NewMI && "this should never be used");
2085 
2086   unsigned Opc = MI.getOpcode();
2087   int CommutedOpcode = commuteOpcode(Opc);
2088   if (CommutedOpcode == -1)
2089     return nullptr;
2090 
2091   assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
2092            static_cast<int>(Src0Idx) &&
2093          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
2094            static_cast<int>(Src1Idx) &&
2095          "inconsistency with findCommutedOpIndices");
2096 
2097   MachineOperand &Src0 = MI.getOperand(Src0Idx);
2098   MachineOperand &Src1 = MI.getOperand(Src1Idx);
2099 
2100   MachineInstr *CommutedMI = nullptr;
2101   if (Src0.isReg() && Src1.isReg()) {
2102     if (isOperandLegal(MI, Src1Idx, &Src0)) {
2103       // Be sure to copy the source modifiers to the right place.
2104       CommutedMI
2105         = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
2106     }
2107 
2108   } else if (Src0.isReg() && !Src1.isReg()) {
2109     // src0 should always be able to support any operand type, so no need to
2110     // check operand legality.
2111     CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
2112   } else if (!Src0.isReg() && Src1.isReg()) {
2113     if (isOperandLegal(MI, Src1Idx, &Src0))
2114       CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
2115   } else {
2116     // FIXME: Found two non-register operands to commute. This does happen.
2117     return nullptr;
2118   }
2119 
2120   if (CommutedMI) {
2121     swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
2122                         Src1, AMDGPU::OpName::src1_modifiers);
2123 
2124     CommutedMI->setDesc(get(CommutedOpcode));
2125   }
2126 
2127   return CommutedMI;
2128 }
2129 
2130 // This needs to be implemented because the source modifiers may be inserted
2131 // between the true commutable operands, and the base
2132 // TargetInstrInfo::commuteInstruction uses it.
2133 bool SIInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
2134                                         unsigned &SrcOpIdx0,
2135                                         unsigned &SrcOpIdx1) const {
2136   return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
2137 }
2138 
2139 bool SIInstrInfo::findCommutedOpIndices(MCInstrDesc Desc, unsigned &SrcOpIdx0,
2140                                         unsigned &SrcOpIdx1) const {
2141   if (!Desc.isCommutable())
2142     return false;
2143 
2144   unsigned Opc = Desc.getOpcode();
2145   int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2146   if (Src0Idx == -1)
2147     return false;
2148 
2149   int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
2150   if (Src1Idx == -1)
2151     return false;
2152 
2153   return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
2154 }
2155 
2156 bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
2157                                         int64_t BrOffset) const {
2158   // BranchRelaxation should never have to check s_setpc_b64 because its dest
2159   // block is unanalyzable.
2160   assert(BranchOp != AMDGPU::S_SETPC_B64);
2161 
2162   // Convert to dwords.
2163   BrOffset /= 4;
2164 
2165   // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
2166   // from the next instruction.
2167   BrOffset -= 1;
2168 
2169   return isIntN(BranchOffsetBits, BrOffset);
2170 }
2171 
2172 MachineBasicBlock *SIInstrInfo::getBranchDestBlock(
2173   const MachineInstr &MI) const {
2174   if (MI.getOpcode() == AMDGPU::S_SETPC_B64) {
2175     // This would be a difficult analysis to perform, but it is always legal, so
2176     // there's no need to analyze it.
2177     return nullptr;
2178   }
2179 
2180   return MI.getOperand(0).getMBB();
2181 }
2182 
2183 unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
2184                                            MachineBasicBlock &DestBB,
2185                                            const DebugLoc &DL,
2186                                            int64_t BrOffset,
2187                                            RegScavenger *RS) const {
2188   assert(RS && "RegScavenger required for long branching");
2189   assert(MBB.empty() &&
2190          "new block should be inserted for expanding unconditional branch");
2191   assert(MBB.pred_size() == 1);
2192 
2193   MachineFunction *MF = MBB.getParent();
2194   MachineRegisterInfo &MRI = MF->getRegInfo();
2195 
2196   // FIXME: Virtual register workaround for RegScavenger not working with empty
2197   // blocks.
2198   Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2199 
2200   auto I = MBB.end();
2201 
2202   // We need to compute the offset relative to the instruction immediately after
2203   // s_getpc_b64. Insert pc arithmetic code before last terminator.
2204   MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
2205 
2206   // TODO: Handle > 32-bit block address.
2207   if (BrOffset >= 0) {
2208     BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
2209       .addReg(PCReg, RegState::Define, AMDGPU::sub0)
2210       .addReg(PCReg, 0, AMDGPU::sub0)
2211       .addMBB(&DestBB, MO_LONG_BRANCH_FORWARD);
2212     BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
2213       .addReg(PCReg, RegState::Define, AMDGPU::sub1)
2214       .addReg(PCReg, 0, AMDGPU::sub1)
2215       .addImm(0);
2216   } else {
2217     // Backwards branch.
2218     BuildMI(MBB, I, DL, get(AMDGPU::S_SUB_U32))
2219       .addReg(PCReg, RegState::Define, AMDGPU::sub0)
2220       .addReg(PCReg, 0, AMDGPU::sub0)
2221       .addMBB(&DestBB, MO_LONG_BRANCH_BACKWARD);
2222     BuildMI(MBB, I, DL, get(AMDGPU::S_SUBB_U32))
2223       .addReg(PCReg, RegState::Define, AMDGPU::sub1)
2224       .addReg(PCReg, 0, AMDGPU::sub1)
2225       .addImm(0);
2226   }
2227 
2228   // Insert the indirect branch after the other terminator.
2229   BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
2230     .addReg(PCReg);
2231 
2232   // FIXME: If spilling is necessary, this will fail because this scavenger has
2233   // no emergency stack slots. It is non-trivial to spill in this situation,
2234   // because the restore code needs to be specially placed after the
2235   // jump. BranchRelaxation then needs to be made aware of the newly inserted
2236   // block.
2237   //
2238   // If a spill is needed for the pc register pair, we need to insert a spill
2239   // restore block right before the destination block, and insert a short branch
2240   // into the old destination block's fallthrough predecessor.
2241   // e.g.:
2242   //
2243   // s_cbranch_scc0 skip_long_branch:
2244   //
2245   // long_branch_bb:
2246   //   spill s[8:9]
2247   //   s_getpc_b64 s[8:9]
2248   //   s_add_u32 s8, s8, restore_bb
2249   //   s_addc_u32 s9, s9, 0
2250   //   s_setpc_b64 s[8:9]
2251   //
2252   // skip_long_branch:
2253   //   foo;
2254   //
2255   // .....
2256   //
2257   // dest_bb_fallthrough_predecessor:
2258   //   bar;
2259   //   s_branch dest_bb
2260   //
2261   // restore_bb:
2262   //   restore s[8:9]
2263   //   fallthrough dest_bb
2264   //
2265   // dest_bb:
2266   //   buzz;
2267 
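  // Now that the block is non-empty, scavenge a real SGPR pair for the PC and
  // replace the temporary virtual register with it.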
2268   RS->enterBasicBlockEnd(MBB);
2269   Register Scav = RS->scavengeRegisterBackwards(
2270     AMDGPU::SReg_64RegClass,
2271     MachineBasicBlock::iterator(GetPC), false, 0);
2272   MRI.replaceRegWith(PCReg, Scav);
2273   MRI.clearVirtRegs();
2274   RS->setRegUsed(Scav);
2275 
2276   return 4 + 8 + 4 + 4;
2277 }
2278 
2279 unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
2280   switch (Cond) {
2281   case SIInstrInfo::SCC_TRUE:
2282     return AMDGPU::S_CBRANCH_SCC1;
2283   case SIInstrInfo::SCC_FALSE:
2284     return AMDGPU::S_CBRANCH_SCC0;
2285   case SIInstrInfo::VCCNZ:
2286     return AMDGPU::S_CBRANCH_VCCNZ;
2287   case SIInstrInfo::VCCZ:
2288     return AMDGPU::S_CBRANCH_VCCZ;
2289   case SIInstrInfo::EXECNZ:
2290     return AMDGPU::S_CBRANCH_EXECNZ;
2291   case SIInstrInfo::EXECZ:
2292     return AMDGPU::S_CBRANCH_EXECZ;
2293   default:
2294     llvm_unreachable("invalid branch predicate");
2295   }
2296 }
2297 
2298 SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
2299   switch (Opcode) {
2300   case AMDGPU::S_CBRANCH_SCC0:
2301     return SCC_FALSE;
2302   case AMDGPU::S_CBRANCH_SCC1:
2303     return SCC_TRUE;
2304   case AMDGPU::S_CBRANCH_VCCNZ:
2305     return VCCNZ;
2306   case AMDGPU::S_CBRANCH_VCCZ:
2307     return VCCZ;
2308   case AMDGPU::S_CBRANCH_EXECNZ:
2309     return EXECNZ;
2310   case AMDGPU::S_CBRANCH_EXECZ:
2311     return EXECZ;
2312   default:
2313     return INVALID_BR;
2314   }
2315 }
2316 
2317 bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
2318                                     MachineBasicBlock::iterator I,
2319                                     MachineBasicBlock *&TBB,
2320                                     MachineBasicBlock *&FBB,
2321                                     SmallVectorImpl<MachineOperand> &Cond,
2322                                     bool AllowModify) const {
2323   if (I->getOpcode() == AMDGPU::S_BRANCH) {
2324     // Unconditional Branch
2325     TBB = I->getOperand(0).getMBB();
2326     return false;
2327   }
2328 
2329   MachineBasicBlock *CondBB = nullptr;
2330 
2331   if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
2332     CondBB = I->getOperand(1).getMBB();
2333     Cond.push_back(I->getOperand(0));
2334   } else {
2335     BranchPredicate Pred = getBranchPredicate(I->getOpcode());
2336     if (Pred == INVALID_BR)
2337       return true;
2338 
2339     CondBB = I->getOperand(0).getMBB();
2340     Cond.push_back(MachineOperand::CreateImm(Pred));
2341     Cond.push_back(I->getOperand(1)); // Save the branch register.
2342   }
2343   ++I;
2344 
2345   if (I == MBB.end()) {
2346     // Conditional branch followed by fall-through.
2347     TBB = CondBB;
2348     return false;
2349   }
2350 
2351   if (I->getOpcode() == AMDGPU::S_BRANCH) {
2352     TBB = CondBB;
2353     FBB = I->getOperand(0).getMBB();
2354     return false;
2355   }
2356 
2357   return true;
2358 }
2359 
2360 bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
2361                                 MachineBasicBlock *&FBB,
2362                                 SmallVectorImpl<MachineOperand> &Cond,
2363                                 bool AllowModify) const {
2364   MachineBasicBlock::iterator I = MBB.getFirstTerminator();
2365   auto E = MBB.end();
2366   if (I == E)
2367     return false;
2368 
2369   // Skip over the instructions that are artificial terminators for special
2370   // exec management.
2371   while (I != E && !I->isBranch() && !I->isReturn()) {
2372     switch (I->getOpcode()) {
2373     case AMDGPU::S_MOV_B64_term:
2374     case AMDGPU::S_XOR_B64_term:
2375     case AMDGPU::S_OR_B64_term:
2376     case AMDGPU::S_ANDN2_B64_term:
2377     case AMDGPU::S_AND_B64_term:
2378     case AMDGPU::S_MOV_B32_term:
2379     case AMDGPU::S_XOR_B32_term:
2380     case AMDGPU::S_OR_B32_term:
2381     case AMDGPU::S_ANDN2_B32_term:
2382     case AMDGPU::S_AND_B32_term:
2383       break;
2384     case AMDGPU::SI_IF:
2385     case AMDGPU::SI_ELSE:
2386     case AMDGPU::SI_KILL_I1_TERMINATOR:
2387     case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
2388       // FIXME: It's messy that these need to be considered here at all.
2389       return true;
2390     default:
2391       llvm_unreachable("unexpected non-branch terminator inst");
2392     }
2393 
2394     ++I;
2395   }
2396 
2397   if (I == E)
2398     return false;
2399 
2400   return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
2401 }
2402 
2403 unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
2404                                    int *BytesRemoved) const {
2405   MachineBasicBlock::iterator I = MBB.getFirstTerminator();
2406 
2407   unsigned Count = 0;
2408   unsigned RemovedSize = 0;
2409   while (I != MBB.end()) {
2410     MachineBasicBlock::iterator Next = std::next(I);
2411     RemovedSize += getInstSizeInBytes(*I);
2412     I->eraseFromParent();
2413     ++Count;
2414     I = Next;
2415   }
2416 
2417   if (BytesRemoved)
2418     *BytesRemoved = RemovedSize;
2419 
2420   return Count;
2421 }
2422 
2423 // Copy the flags onto the implicit condition register operand.
2424 static void preserveCondRegFlags(MachineOperand &CondReg,
2425                                  const MachineOperand &OrigCond) {
2426   CondReg.setIsUndef(OrigCond.isUndef());
2427   CondReg.setIsKill(OrigCond.isKill());
2428 }
2429 
2430 unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
2431                                    MachineBasicBlock *TBB,
2432                                    MachineBasicBlock *FBB,
2433                                    ArrayRef<MachineOperand> Cond,
2434                                    const DebugLoc &DL,
2435                                    int *BytesAdded) const {
2436   if (!FBB && Cond.empty()) {
2437     BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
2438       .addMBB(TBB);
2439     if (BytesAdded)
2440       *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
2441     return 1;
2442   }
2443 
2444   if (Cond.size() == 1 && Cond[0].isReg()) {
2445     BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO))
2446         .add(Cond[0])
2447         .addMBB(TBB);
2448     return 1;
2449   }
2450 
2451   assert(TBB && Cond[0].isImm());
2452 
2453   unsigned Opcode
2454     = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
2455 
2456   if (!FBB) {
2458     MachineInstr *CondBr =
2459       BuildMI(&MBB, DL, get(Opcode))
2460       .addMBB(TBB);
2461 
2462     // Copy the flags onto the implicit condition register operand.
2463     preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
2464     fixImplicitOperands(*CondBr);
2465 
2466     if (BytesAdded)
2467       *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
2468     return 1;
2469   }
2470 
2471   assert(TBB && FBB);
2472 
2473   MachineInstr *CondBr =
2474     BuildMI(&MBB, DL, get(Opcode))
2475     .addMBB(TBB);
2476   fixImplicitOperands(*CondBr);
2477   BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
2478     .addMBB(FBB);
2479 
2480   preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
2483 
2484   if (BytesAdded)
2485     *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
2486 
2487   return 2;
2488 }
2489 
2490 bool SIInstrInfo::reverseBranchCondition(
2491   SmallVectorImpl<MachineOperand> &Cond) const {
2492   if (Cond.size() != 2) {
2493     return true;
2494   }
2495 
2496   if (Cond[0].isImm()) {
2497     Cond[0].setImm(-Cond[0].getImm());
2498     return false;
2499   }
2500 
2501   return true;
2502 }
2503 
2504 bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
2505                                   ArrayRef<MachineOperand> Cond,
2506                                   Register DstReg, Register TrueReg,
2507                                   Register FalseReg, int &CondCycles,
2508                                   int &TrueCycles, int &FalseCycles) const {
2509   switch (Cond[0].getImm()) {
2510   case VCCNZ:
2511   case VCCZ: {
2512     const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
2513     const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
2514     if (MRI.getRegClass(FalseReg) != RC)
2515       return false;
2516 
2517     int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;
2518     CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
2519 
2520     // Limit to equal cost for branch vs. N v_cndmask_b32s.
2521     return RI.hasVGPRs(RC) && NumInsts <= 6;
2522   }
2523   case SCC_TRUE:
2524   case SCC_FALSE: {
2525     // FIXME: We could insert for VGPRs if we could replace the original compare
2526     // with a vector one.
2527     const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
2528     const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
2529     if (MRI.getRegClass(FalseReg) != RC)
2530       return false;
2531 
2532     int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;
2533 
2534     // Even numbers of 32-bit halves (multiples of 8 bytes) can use s_cselect_b64.
2535     if (NumInsts % 2 == 0)
2536       NumInsts /= 2;
2537 
2538     CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
2539     return RI.isSGPRClass(RC);
2540   }
2541   default:
2542     return false;
2543   }
2544 }
2545 
2546 void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
2547                                MachineBasicBlock::iterator I, const DebugLoc &DL,
2548                                Register DstReg, ArrayRef<MachineOperand> Cond,
2549                                Register TrueReg, Register FalseReg) const {
2550   BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
2551   if (Pred == VCCZ || Pred == SCC_FALSE) {
2552     Pred = static_cast<BranchPredicate>(-Pred);
2553     std::swap(TrueReg, FalseReg);
2554   }
2555 
2556   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
2557   const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
2558   unsigned DstSize = RI.getRegSizeInBits(*DstRC);
2559 
2560   if (DstSize == 32) {
2561     MachineInstr *Select;
2562     if (Pred == SCC_TRUE) {
2563       Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg)
2564         .addReg(TrueReg)
2565         .addReg(FalseReg);
2566     } else {
2567       // Instruction's operands are backwards from what is expected.
2568       Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg)
2569         .addReg(FalseReg)
2570         .addReg(TrueReg);
2571     }
2572 
2573     preserveCondRegFlags(Select->getOperand(3), Cond[1]);
2574     return;
2575   }
2576 
2577   if (DstSize == 64 && Pred == SCC_TRUE) {
2578     MachineInstr *Select =
2579       BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
2580       .addReg(TrueReg)
2581       .addReg(FalseReg);
2582 
2583     preserveCondRegFlags(Select->getOperand(3), Cond[1]);
2584     return;
2585   }
2586 
2587   static const int16_t Sub0_15[] = {
2588     AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
2589     AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
2590     AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
2591     AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
2592   };
2593 
2594   static const int16_t Sub0_15_64[] = {
2595     AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
2596     AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
2597     AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
2598     AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
2599   };
2600 
2601   unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
2602   const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
2603   const int16_t *SubIndices = Sub0_15;
2604   int NElts = DstSize / 32;
2605 
2606   // 64-bit select is only available for SALU.
2607   // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
2608   if (Pred == SCC_TRUE) {
2609     if (NElts % 2) {
2610       SelOp = AMDGPU::S_CSELECT_B32;
2611       EltRC = &AMDGPU::SGPR_32RegClass;
2612     } else {
2613       SelOp = AMDGPU::S_CSELECT_B64;
2614       EltRC = &AMDGPU::SGPR_64RegClass;
2615       SubIndices = Sub0_15_64;
2616       NElts /= 2;
2617     }
2618   }
2619 
2620   MachineInstrBuilder MIB = BuildMI(
2621     MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
2622 
2623   I = MIB->getIterator();
2624 
2625   SmallVector<Register, 8> Regs;
2626   for (int Idx = 0; Idx != NElts; ++Idx) {
2627     Register DstElt = MRI.createVirtualRegister(EltRC);
2628     Regs.push_back(DstElt);
2629 
2630     unsigned SubIdx = SubIndices[Idx];
2631 
2632     MachineInstr *Select;
2633     if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
2634       Select =
2635         BuildMI(MBB, I, DL, get(SelOp), DstElt)
2636         .addReg(FalseReg, 0, SubIdx)
2637         .addReg(TrueReg, 0, SubIdx);
2638     } else {
2639       Select =
2640         BuildMI(MBB, I, DL, get(SelOp), DstElt)
2641         .addReg(TrueReg, 0, SubIdx)
2642         .addReg(FalseReg, 0, SubIdx);
2643     }
2644 
2645     preserveCondRegFlags(Select->getOperand(3), Cond[1]);
2646     fixImplicitOperands(*Select);
2647 
2648     MIB.addReg(DstElt)
2649        .addImm(SubIdx);
2650   }
2651 }
2652 
2653 bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) const {
2654   switch (MI.getOpcode()) {
2655   case AMDGPU::V_MOV_B32_e32:
2656   case AMDGPU::V_MOV_B32_e64:
2657   case AMDGPU::V_MOV_B64_PSEUDO: {
2658     // If there are additional implicit register operands, this may be used for
2659     // register indexing so the source register operand isn't simply copied.
2660     unsigned NumOps = MI.getDesc().getNumOperands() +
2661       MI.getDesc().getNumImplicitUses();
2662 
2663     return MI.getNumOperands() == NumOps;
2664   }
2665   case AMDGPU::S_MOV_B32:
2666   case AMDGPU::S_MOV_B64:
2667   case AMDGPU::COPY:
2668   case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
2669   case AMDGPU::V_ACCVGPR_READ_B32_e64:
2670   case AMDGPU::V_ACCVGPR_MOV_B32:
2671     return true;
2672   default:
2673     return false;
2674   }
2675 }
2676 
2677 unsigned SIInstrInfo::getAddressSpaceForPseudoSourceKind(
2678     unsigned Kind) const {
2679   switch(Kind) {
2680   case PseudoSourceValue::Stack:
2681   case PseudoSourceValue::FixedStack:
2682     return AMDGPUAS::PRIVATE_ADDRESS;
2683   case PseudoSourceValue::ConstantPool:
2684   case PseudoSourceValue::GOT:
2685   case PseudoSourceValue::JumpTable:
2686   case PseudoSourceValue::GlobalValueCallEntry:
2687   case PseudoSourceValue::ExternalSymbolCallEntry:
2688   case PseudoSourceValue::TargetCustom:
2689     return AMDGPUAS::CONSTANT_ADDRESS;
2690   }
2691   return AMDGPUAS::FLAT_ADDRESS;
2692 }
2693 
2694 static void removeModOperands(MachineInstr &MI) {
2695   unsigned Opc = MI.getOpcode();
2696   int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc,
2697                                               AMDGPU::OpName::src0_modifiers);
2698   int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc,
2699                                               AMDGPU::OpName::src1_modifiers);
2700   int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc,
2701                                               AMDGPU::OpName::src2_modifiers);
2702 
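  // Remove the highest-indexed operand first so the remaining indices stay
  // valid.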
2703   MI.RemoveOperand(Src2ModIdx);
2704   MI.RemoveOperand(Src1ModIdx);
2705   MI.RemoveOperand(Src0ModIdx);
2706 }
2707 
2708 bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
2709                                 Register Reg, MachineRegisterInfo *MRI) const {
2710   if (!MRI->hasOneNonDBGUse(Reg))
2711     return false;
2712 
2713   switch (DefMI.getOpcode()) {
2714   default:
2715     return false;
2716   case AMDGPU::S_MOV_B64:
2717     // TODO: We could fold 64-bit immediates, but this gets complicated
2718     // when there are sub-registers.
2719     return false;
2720 
2721   case AMDGPU::V_MOV_B32_e32:
2722   case AMDGPU::S_MOV_B32:
2723   case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
2724     break;
2725   }
2726 
2727   const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0);
2728   assert(ImmOp);
2729   // FIXME: We could handle FrameIndex values here.
2730   if (!ImmOp->isImm())
2731     return false;
2732 
2733   unsigned Opc = UseMI.getOpcode();
2734   if (Opc == AMDGPU::COPY) {
2735     Register DstReg = UseMI.getOperand(0).getReg();
2736     bool Is16Bit = getOpSize(UseMI, 0) == 2;
2737     bool isVGPRCopy = RI.isVGPR(*MRI, DstReg);
2738     unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
2739     APInt Imm(32, ImmOp->getImm());
2740 
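    // A copy from the high 16-bit half folds the high half of the immediate.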
2741     if (UseMI.getOperand(1).getSubReg() == AMDGPU::hi16)
2742       Imm = Imm.ashr(16);
2743 
2744     if (RI.isAGPR(*MRI, DstReg)) {
2745       if (!isInlineConstant(Imm))
2746         return false;
2747       NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
2748     }
2749 
2750     if (Is16Bit) {
2751       if (isVGPRCopy)
2752         return false; // Do not clobber vgpr_hi16
2753 
2754       if (DstReg.isVirtual() &&
2755           UseMI.getOperand(0).getSubReg() != AMDGPU::lo16)
2756         return false;
2757 
2758       UseMI.getOperand(0).setSubReg(0);
2759       if (DstReg.isPhysical()) {
2760         DstReg = RI.get32BitRegister(DstReg);
2761         UseMI.getOperand(0).setReg(DstReg);
2762       }
2763       assert(UseMI.getOperand(1).getReg().isVirtual());
2764     }
2765 
2766     UseMI.setDesc(get(NewOpc));
2767     UseMI.getOperand(1).ChangeToImmediate(Imm.getSExtValue());
2768     UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent());
2769     return true;
2770   }
2771 
2772   if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
2773       Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
2774       Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
2775       Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64) {
2776     // Don't fold if we are using source or output modifiers. The new VOP2
2777     // instructions don't have them.
2778     if (hasAnyModifiersSet(UseMI))
2779       return false;
2780 
2781     // If this is a free constant, there's no reason to do this.
2782     // TODO: We could fold this here instead of letting SIFoldOperands do it
2783     // later.
2784     MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0);
2785 
2786     // Any src operand can be used for the legality check.
2787     if (isInlineConstant(UseMI, *Src0, *ImmOp))
2788       return false;
2789 
2790     bool IsF32 = Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
2791                  Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64;
2792     bool IsFMA = Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
2793                  Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64;
2794     MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
2795     MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
2796 
2797     // Multiplied part is the constant: Use v_madmk_{f16, f32}.
2798     // We should only expect these to be on src0 due to canonicalizations.
2799     if (Src0->isReg() && Src0->getReg() == Reg) {
2800       if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
2801         return false;
2802 
2803       if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
2804         return false;
2805 
2806       unsigned NewOpc =
2807         IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32 : AMDGPU::V_FMAMK_F16)
2808               : (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16);
2809       if (pseudoToMCOpcode(NewOpc) == -1)
2810         return false;
2811 
2812       // We need to swap operands 0 and 1 since madmk constant is at operand 1.
2813 
2814       const int64_t Imm = ImmOp->getImm();
2815 
2816       // FIXME: This would be a lot easier if we could return a new instruction
2817       // instead of having to modify in place.
2818 
2819       // Remove these first since they are at the end.
2820       UseMI.RemoveOperand(
2821           AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
2822       UseMI.RemoveOperand(
2823           AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
2824 
2825       Register Src1Reg = Src1->getReg();
2826       unsigned Src1SubReg = Src1->getSubReg();
2827       Src0->setReg(Src1Reg);
2828       Src0->setSubReg(Src1SubReg);
2829       Src0->setIsKill(Src1->isKill());
2830 
2831       if (Opc == AMDGPU::V_MAC_F32_e64 ||
2832           Opc == AMDGPU::V_MAC_F16_e64 ||
2833           Opc == AMDGPU::V_FMAC_F32_e64 ||
2834           Opc == AMDGPU::V_FMAC_F16_e64)
2835         UseMI.untieRegOperand(
2836             AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
2837 
2838       Src1->ChangeToImmediate(Imm);
2839 
2840       removeModOperands(UseMI);
2841       UseMI.setDesc(get(NewOpc));
2842 
2843       bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
2844       if (DeleteDef)
2845         DefMI.eraseFromParent();
2846 
2847       return true;
2848     }
2849 
2850     // Added part is the constant: Use v_madak_{f16, f32}.
2851     if (Src2->isReg() && Src2->getReg() == Reg) {
2852       // Not allowed to use constant bus for another operand.
2853       // We can however allow an inline immediate as src0.
2854       bool Src0Inlined = false;
2855       if (Src0->isReg()) {
2856         // Try to inline the constant if possible.
2857         // If the def is a move-immediate with a single use, folding it here
2858         // saves a VGPR.
2859         MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
2860         if (Def && Def->isMoveImmediate() &&
2861           isInlineConstant(Def->getOperand(1)) &&
2862           MRI->hasOneUse(Src0->getReg())) {
2863           Src0->ChangeToImmediate(Def->getOperand(1).getImm());
2864           Src0Inlined = true;
2865         } else if ((Src0->getReg().isPhysical() &&
2866                     (ST.getConstantBusLimit(Opc) <= 1 &&
2867                      RI.isSGPRClass(RI.getPhysRegClass(Src0->getReg())))) ||
2868                    (Src0->getReg().isVirtual() &&
2869                     (ST.getConstantBusLimit(Opc) <= 1 &&
2870                      RI.isSGPRClass(MRI->getRegClass(Src0->getReg())))))
2871           return false;
        // VGPR is okay as Src0 - fallthrough
2873       }
2874 
      if (Src1->isReg() && !Src0Inlined) {
        // We have one slot for an inlinable constant so far - try to fill it.
2877         MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
2878         if (Def && Def->isMoveImmediate() &&
2879             isInlineConstant(Def->getOperand(1)) &&
2880             MRI->hasOneUse(Src1->getReg()) &&
2881             commuteInstruction(UseMI)) {
          Src0->ChangeToImmediate(Def->getOperand(1).getImm());
2883         } else if ((Src1->getReg().isPhysical() &&
2884                     RI.isSGPRClass(RI.getPhysRegClass(Src1->getReg()))) ||
2885                    (Src1->getReg().isVirtual() &&
2886                     RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))))
2887           return false;
        // VGPR is okay as Src1 - fallthrough
2889       }
2890 
2891       unsigned NewOpc =
2892         IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32 : AMDGPU::V_FMAAK_F16)
2893               : (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16);
2894       if (pseudoToMCOpcode(NewOpc) == -1)
2895         return false;
2896 
2897       const int64_t Imm = ImmOp->getImm();
2898 
2899       // FIXME: This would be a lot easier if we could return a new instruction
2900       // instead of having to modify in place.
2901 
2902       // Remove these first since they are at the end.
2903       UseMI.RemoveOperand(
2904           AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
2905       UseMI.RemoveOperand(
2906           AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
2907 
2908       if (Opc == AMDGPU::V_MAC_F32_e64 ||
2909           Opc == AMDGPU::V_MAC_F16_e64 ||
2910           Opc == AMDGPU::V_FMAC_F32_e64 ||
2911           Opc == AMDGPU::V_FMAC_F16_e64)
2912         UseMI.untieRegOperand(
2913             AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
2914 
      // ChangeToImmediate adds Src2 back to the instruction.
2916       Src2->ChangeToImmediate(Imm);
2917 
2918       // These come before src2.
2919       removeModOperands(UseMI);
2920       UseMI.setDesc(get(NewOpc));
      // UseMI may have been commuted, in which case src1 is now an SGPR.
      // Having both an inline constant and an SGPR would be illegal, so
      // legalize the operands.
2924       legalizeOperands(UseMI);
2925 
2926       bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
2927       if (DeleteDef)
2928         DefMI.eraseFromParent();
2929 
2930       return true;
2931     }
2932   }
2933 
2934   return false;
2935 }
2936 
2937 static bool
2938 memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1,
2939                            ArrayRef<const MachineOperand *> BaseOps2) {
2940   if (BaseOps1.size() != BaseOps2.size())
2941     return false;
2942   for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
2943     if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
2944       return false;
2945   }
2946   return true;
2947 }
2948 
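// Returns true if the access [OffsetA, OffsetA + WidthA) is disjoint from
// [OffsetB, OffsetB + WidthB). For example, a 4-byte access at offset 0 and
// an 8-byte access at offset 4 do not overlap (0 + 4 <= 4), while the same
// accesses at offsets 0 and 2 do overlap (0 + 4 > 2).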
2949 static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
2950                                 int WidthB, int OffsetB) {
2951   int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
2952   int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
2953   int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
2954   return LowOffset + LowWidth <= HighOffset;
2955 }
2956 
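// Returns true if \p MIa and \p MIb have identical base operands and their
// single memory accesses cover provably disjoint byte ranges.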
2957 bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
2958                                                const MachineInstr &MIb) const {
2959   SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
2960   int64_t Offset0, Offset1;
2961   unsigned Dummy0, Dummy1;
2962   bool Offset0IsScalable, Offset1IsScalable;
2963   if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
2964                                      Dummy0, &RI) ||
2965       !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
2966                                      Dummy1, &RI))
2967     return false;
2968 
2969   if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
2970     return false;
2971 
2972   if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
2973     // FIXME: Handle ds_read2 / ds_write2.
2974     return false;
2975   }
2976   unsigned Width0 = MIa.memoperands().front()->getSize();
2977   unsigned Width1 = MIb.memoperands().front()->getSize();
2978   return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
2979 }
2980 
2981 bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
2982                                                   const MachineInstr &MIb) const {
2983   assert(MIa.mayLoadOrStore() &&
2984          "MIa must load from or modify a memory location");
2985   assert(MIb.mayLoadOrStore() &&
2986          "MIb must load from or modify a memory location");
2987 
2988   if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
2989     return false;
2990 
2991   // XXX - Can we relax this between address spaces?
2992   if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
2993     return false;
2994 
2995   // TODO: Should we check the address space from the MachineMemOperand? That
2996   // would allow us to distinguish objects we know don't alias based on the
2997   // underlying address space, even if it was lowered to a different one,
2998   // e.g. private accesses lowered to use MUBUF instructions on a scratch
2999   // buffer.
3000   if (isDS(MIa)) {
3001     if (isDS(MIb))
3002       return checkInstOffsetsDoNotOverlap(MIa, MIb);
3003 
3004     return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
3005   }
3006 
3007   if (isMUBUF(MIa) || isMTBUF(MIa)) {
3008     if (isMUBUF(MIb) || isMTBUF(MIb))
3009       return checkInstOffsetsDoNotOverlap(MIa, MIb);
3010 
3011     return !isFLAT(MIb) && !isSMRD(MIb);
3012   }
3013 
3014   if (isSMRD(MIa)) {
3015     if (isSMRD(MIb))
3016       return checkInstOffsetsDoNotOverlap(MIa, MIb);
3017 
3018     return !isFLAT(MIb) && !isMUBUF(MIb) && !isMTBUF(MIb);
3019   }
3020 
3021   if (isFLAT(MIa)) {
3022     if (isFLAT(MIb))
3023       return checkInstOffsetsDoNotOverlap(MIa, MIb);
3024 
3025     return false;
3026   }
3027 
3028   return false;
3029 }
3030 
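// Returns the immediate if \p MO is a register whose unique def is a
// v_mov_b32 of an immediate, and 0 otherwise. Note that a genuine immediate
// of 0 is indistinguishable from "no foldable immediate" for callers that
// test the returned value for truth.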
static int64_t getFoldableImm(const MachineOperand *MO) {
  if (!MO->isReg())
    return 0;
  const MachineFunction *MF = MO->getParent()->getParent()->getParent();
  const MachineRegisterInfo &MRI = MF->getRegInfo();
  auto Def = MRI.getUniqueVRegDef(MO->getReg());
  if (Def && Def->getOpcode() == AMDGPU::V_MOV_B32_e32 &&
      Def->getOperand(1).isImm())
    return Def->getOperand(1).getImm();
  return 0;
}
3042 
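// Transfer LiveVariables kill information from the register operands of
// \p MI to \p NewMI, which is replacing it.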
3043 static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI,
3044                                 MachineInstr &NewMI) {
3045   if (LV) {
3046     unsigned NumOps = MI.getNumOperands();
3047     for (unsigned I = 1; I < NumOps; ++I) {
3048       MachineOperand &Op = MI.getOperand(I);
3049       if (Op.isReg() && Op.isKill())
3050         LV->replaceKillInstruction(Op.getReg(), MI, NewMI);
3051     }
3052   }
3053 }
3054 
3055 MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
3056                                                  MachineInstr &MI,
3057                                                  LiveVariables *LV) const {
3058   unsigned Opc = MI.getOpcode();
3059   bool IsF16 = false;
3060   bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3061                Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3062                Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
3063   bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
3064 
3065   switch (Opc) {
3066   default:
3067     return nullptr;
3068   case AMDGPU::V_MAC_F16_e64:
3069   case AMDGPU::V_FMAC_F16_e64:
3070     IsF16 = true;
3071     LLVM_FALLTHROUGH;
3072   case AMDGPU::V_MAC_F32_e64:
3073   case AMDGPU::V_FMAC_F32_e64:
3074   case AMDGPU::V_FMAC_F64_e64:
3075     break;
3076   case AMDGPU::V_MAC_F16_e32:
3077   case AMDGPU::V_FMAC_F16_e32:
3078     IsF16 = true;
3079     LLVM_FALLTHROUGH;
3080   case AMDGPU::V_MAC_F32_e32:
3081   case AMDGPU::V_FMAC_F32_e32:
3082   case AMDGPU::V_FMAC_F64_e32: {
3083     int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
3084                                              AMDGPU::OpName::src0);
3085     const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
3086     if (!Src0->isReg() && !Src0->isImm())
3087       return nullptr;
3088 
3089     if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
3090       return nullptr;
3091 
3092     break;
3093   }
3094   }
3095 
3096   const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
3097   const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
3098   const MachineOperand *Src0Mods =
3099     getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
3100   const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
3101   const MachineOperand *Src1Mods =
3102     getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
3103   const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
3104   const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
3105   const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
3106   MachineInstrBuilder MIB;
3107 
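  // With no source modifiers, clamp, or omod (and no f64 FMAC), prefer the
  // two-operand-plus-literal forms: fold an immediate feeding src2 into
  // V_MADAK/V_FMAAK, or one feeding src1 (or src0, if the remaining operand
  // is legal in the src0 slot) into V_MADMK/V_FMAMK. Otherwise fall back to
  // the full VOP3 MAD/FMA built below.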
3108   if (!Src0Mods && !Src1Mods && !Clamp && !Omod && !IsF64 &&
3109       // If we have an SGPR input, we will violate the constant bus restriction.
3110       (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
3111        !RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) {
3112     if (auto Imm = getFoldableImm(Src2)) {
3113       unsigned NewOpc =
3114           IsFMA ? (IsF16 ? AMDGPU::V_FMAAK_F16 : AMDGPU::V_FMAAK_F32)
3115                 : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32);
3116       if (pseudoToMCOpcode(NewOpc) != -1) {
3117         MIB = BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
3118                   .add(*Dst)
3119                   .add(*Src0)
3120                   .add(*Src1)
3121                   .addImm(Imm);
3122         updateLiveVariables(LV, MI, *MIB);
3123         return MIB;
3124       }
3125     }
3126     unsigned NewOpc = IsFMA
3127                           ? (IsF16 ? AMDGPU::V_FMAMK_F16 : AMDGPU::V_FMAMK_F32)
3128                           : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
3129     if (auto Imm = getFoldableImm(Src1)) {
3130       if (pseudoToMCOpcode(NewOpc) != -1) {
3131         MIB = BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
3132                   .add(*Dst)
3133                   .add(*Src0)
3134                   .addImm(Imm)
3135                   .add(*Src2);
3136         updateLiveVariables(LV, MI, *MIB);
3137         return MIB;
3138       }
3139     }
3140     if (auto Imm = getFoldableImm(Src0)) {
3141       if (pseudoToMCOpcode(NewOpc) != -1 &&
3142           isOperandLegal(
3143               MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0),
3144               Src1)) {
3145         MIB = BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
3146                   .add(*Dst)
3147                   .add(*Src1)
3148                   .addImm(Imm)
3149                   .add(*Src2);
3150         updateLiveVariables(LV, MI, *MIB);
3151         return MIB;
3152       }
3153     }
3154   }
3155 
3156   unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMA_F16_e64
3157                                    : IsF64 ? AMDGPU::V_FMA_F64_e64
3158                                            : AMDGPU::V_FMA_F32_e64)
3159                           : (IsF16 ? AMDGPU::V_MAD_F16_e64 : AMDGPU::V_MAD_F32_e64);
3160   if (pseudoToMCOpcode(NewOpc) == -1)
3161     return nullptr;
3162 
3163   MIB = BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
3164             .add(*Dst)
3165             .addImm(Src0Mods ? Src0Mods->getImm() : 0)
3166             .add(*Src0)
3167             .addImm(Src1Mods ? Src1Mods->getImm() : 0)
3168             .add(*Src1)
            .addImm(0) // Src2 mods
3170             .add(*Src2)
3171             .addImm(Clamp ? Clamp->getImm() : 0)
3172             .addImm(Omod ? Omod->getImm() : 0);
3173   updateLiveVariables(LV, MI, *MIB);
3174   return MIB;
3175 }
3176 
3177 // It's not generally safe to move VALU instructions across these since it will
3178 // start using the register as a base index rather than directly.
3179 // XXX - Why isn't hasSideEffects sufficient for these?
3180 static bool changesVGPRIndexingMode(const MachineInstr &MI) {
3181   switch (MI.getOpcode()) {
3182   case AMDGPU::S_SET_GPR_IDX_ON:
3183   case AMDGPU::S_SET_GPR_IDX_MODE:
3184   case AMDGPU::S_SET_GPR_IDX_OFF:
3185     return true;
3186   default:
3187     return false;
3188   }
3189 }
3190 
3191 bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
3192                                        const MachineBasicBlock *MBB,
3193                                        const MachineFunction &MF) const {
  // We skip the check for SP writes done by the base implementation; it was
  // apparently added due to compile-time concerns.
3196   //
3197   // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
3198   // but is probably avoidable.
3199 
3200   // Copied from base implementation.
3201   // Terminators and labels can't be scheduled around.
3202   if (MI.isTerminator() || MI.isPosition())
3203     return true;
3204 
3205   // INLINEASM_BR can jump to another block
3206   if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
3207     return true;
3208 
3209   // Target-independent instructions do not have an implicit-use of EXEC, even
3210   // when they operate on VGPRs. Treating EXEC modifications as scheduling
3211   // boundaries prevents incorrect movements of such instructions.
3212   return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
3213          MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
3214          MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
3215          changesVGPRIndexingMode(MI);
3216 }
3217 
3218 bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
3219   return Opcode == AMDGPU::DS_ORDERED_COUNT ||
3220          Opcode == AMDGPU::DS_GWS_INIT ||
3221          Opcode == AMDGPU::DS_GWS_SEMA_V ||
3222          Opcode == AMDGPU::DS_GWS_SEMA_BR ||
3223          Opcode == AMDGPU::DS_GWS_SEMA_P ||
3224          Opcode == AMDGPU::DS_GWS_SEMA_RELEASE_ALL ||
3225          Opcode == AMDGPU::DS_GWS_BARRIER;
3226 }
3227 
3228 bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
  // Skip the full operand and register alias search that modifiesRegister
  // does. Only a handful of instructions touch MODE, it is only ever an
  // implicit def, and it doesn't alias any other registers.
3232   if (const MCPhysReg *ImpDef = MI.getDesc().getImplicitDefs()) {
3233     for (; ImpDef && *ImpDef; ++ImpDef) {
3234       if (*ImpDef == AMDGPU::MODE)
3235         return true;
3236     }
3237   }
3238 
3239   return false;
3240 }
3241 
3242 bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
3243   unsigned Opcode = MI.getOpcode();
3244 
3245   if (MI.mayStore() && isSMRD(MI))
3246     return true; // scalar store or atomic
3247 
3248   // This will terminate the function when other lanes may need to continue.
3249   if (MI.isReturn())
3250     return true;
3251 
3252   // These instructions cause shader I/O that may cause hardware lockups
3253   // when executed with an empty EXEC mask.
3254   //
3255   // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
3256   //       EXEC = 0, but checking for that case here seems not worth it
3257   //       given the typical code patterns.
3258   if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
3259       isEXP(Opcode) ||
3260       Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::S_TRAP ||
3261       Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_BARRIER)
3262     return true;
3263 
3264   if (MI.isCall() || MI.isInlineAsm())
3265     return true; // conservative assumption
3266 
3267   // A mode change is a scalar operation that influences vector instructions.
3268   if (modifiesModeRegister(MI))
3269     return true;
3270 
3271   // These are like SALU instructions in terms of effects, so it's questionable
3272   // whether we should return true for those.
3273   //
3274   // However, executing them with EXEC = 0 causes them to operate on undefined
3275   // data, which we avoid by returning true here.
3276   if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
3277       Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32)
3278     return true;
3279 
3280   return false;
3281 }
3282 
3283 bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI,
3284                               const MachineInstr &MI) const {
3285   if (MI.isMetaInstruction())
3286     return false;
3287 
3288   // This won't read exec if this is an SGPR->SGPR copy.
3289   if (MI.isCopyLike()) {
3290     if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
3291       return true;
3292 
3293     // Make sure this isn't copying exec as a normal operand
3294     return MI.readsRegister(AMDGPU::EXEC, &RI);
3295   }
3296 
3297   // Make a conservative assumption about the callee.
3298   if (MI.isCall())
3299     return true;
3300 
3301   // Be conservative with any unhandled generic opcodes.
3302   if (!isTargetSpecificOpcode(MI.getOpcode()))
3303     return true;
3304 
3305   return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
3306 }
3307 
3308 bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
3309   switch (Imm.getBitWidth()) {
3310   case 1: // This likely will be a condition code mask.
3311     return true;
3312 
3313   case 32:
3314     return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
3315                                         ST.hasInv2PiInlineImm());
3316   case 64:
3317     return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
3318                                         ST.hasInv2PiInlineImm());
3319   case 16:
3320     return ST.has16BitInsts() &&
3321            AMDGPU::isInlinableLiteral16(Imm.getSExtValue(),
3322                                         ST.hasInv2PiInlineImm());
3323   default:
3324     llvm_unreachable("invalid bitwidth");
3325   }
3326 }
3327 
3328 bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
3329                                    uint8_t OperandType) const {
3330   if (!MO.isImm() ||
3331       OperandType < AMDGPU::OPERAND_SRC_FIRST ||
3332       OperandType > AMDGPU::OPERAND_SRC_LAST)
3333     return false;
3334 
3335   // MachineOperand provides no way to tell the true operand size, since it only
3336   // records a 64-bit value. We need to know the size to determine if a 32-bit
3337   // floating point immediate bit pattern is legal for an integer immediate. It
3338   // would be for any 32-bit integer operand, but would not be for a 64-bit one.
3339 
3340   int64_t Imm = MO.getImm();
3341   switch (OperandType) {
3342   case AMDGPU::OPERAND_REG_IMM_INT32:
3343   case AMDGPU::OPERAND_REG_IMM_FP32:
3344   case AMDGPU::OPERAND_REG_INLINE_C_INT32:
3345   case AMDGPU::OPERAND_REG_INLINE_C_FP32:
3346   case AMDGPU::OPERAND_REG_IMM_V2FP32:
3347   case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
3348   case AMDGPU::OPERAND_REG_IMM_V2INT32:
3349   case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
3350   case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
3351   case AMDGPU::OPERAND_REG_INLINE_AC_FP32: {
3352     int32_t Trunc = static_cast<int32_t>(Imm);
3353     return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
3354   }
3355   case AMDGPU::OPERAND_REG_IMM_INT64:
3356   case AMDGPU::OPERAND_REG_IMM_FP64:
3357   case AMDGPU::OPERAND_REG_INLINE_C_INT64:
3358   case AMDGPU::OPERAND_REG_INLINE_C_FP64:
3359   case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
3360     return AMDGPU::isInlinableLiteral64(MO.getImm(),
3361                                         ST.hasInv2PiInlineImm());
3362   case AMDGPU::OPERAND_REG_IMM_INT16:
3363   case AMDGPU::OPERAND_REG_INLINE_C_INT16:
3364   case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
    // We would expect inline immediates to not be concerned with an integer/fp
    // distinction. However, in the case of 16-bit integer operations, the
    // "floating point" values appear to not work. The hardware seems to read
    // the low 16 bits of 32-bit immediates, which happens to always work for
    // the integer values.
3370     //
3371     // See llvm bugzilla 46302.
3372     //
3373     // TODO: Theoretically we could use op-sel to use the high bits of the
3374     // 32-bit FP values.
3375     return AMDGPU::isInlinableIntLiteral(Imm);
3376   case AMDGPU::OPERAND_REG_IMM_V2INT16:
3377   case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
3378   case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
3379     // This suffers the same problem as the scalar 16-bit cases.
3380     return AMDGPU::isInlinableIntLiteralV216(Imm);
3381   case AMDGPU::OPERAND_REG_IMM_FP16:
3382   case AMDGPU::OPERAND_REG_INLINE_C_FP16:
3383   case AMDGPU::OPERAND_REG_INLINE_AC_FP16: {
3384     if (isInt<16>(Imm) || isUInt<16>(Imm)) {
3385       // A few special case instructions have 16-bit operands on subtargets
3386       // where 16-bit instructions are not legal.
3387       // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
3388       // constants in these cases
3389       int16_t Trunc = static_cast<int16_t>(Imm);
3390       return ST.has16BitInsts() &&
3391              AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm());
3392     }
3393 
3394     return false;
3395   }
3396   case AMDGPU::OPERAND_REG_IMM_V2FP16:
3397   case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
3398   case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: {
3399     uint32_t Trunc = static_cast<uint32_t>(Imm);
3400     return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm());
3401   }
3402   default:
3403     llvm_unreachable("invalid bitwidth");
3404   }
3405 }
3406 
3407 bool SIInstrInfo::isLiteralConstantLike(const MachineOperand &MO,
3408                                         const MCOperandInfo &OpInfo) const {
3409   switch (MO.getType()) {
3410   case MachineOperand::MO_Register:
3411     return false;
3412   case MachineOperand::MO_Immediate:
3413     return !isInlineConstant(MO, OpInfo);
3414   case MachineOperand::MO_FrameIndex:
3415   case MachineOperand::MO_MachineBasicBlock:
3416   case MachineOperand::MO_ExternalSymbol:
3417   case MachineOperand::MO_GlobalAddress:
3418   case MachineOperand::MO_MCSymbol:
3419     return true;
3420   default:
3421     llvm_unreachable("unexpected operand type");
3422   }
3423 }
3424 
3425 static bool compareMachineOp(const MachineOperand &Op0,
3426                              const MachineOperand &Op1) {
3427   if (Op0.getType() != Op1.getType())
3428     return false;
3429 
3430   switch (Op0.getType()) {
3431   case MachineOperand::MO_Register:
3432     return Op0.getReg() == Op1.getReg();
3433   case MachineOperand::MO_Immediate:
3434     return Op0.getImm() == Op1.getImm();
3435   default:
3436     llvm_unreachable("Didn't expect to be comparing these operand types");
3437   }
3438 }
3439 
3440 bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
3441                                     const MachineOperand &MO) const {
3442   const MCInstrDesc &InstDesc = MI.getDesc();
3443   const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpNo];
3444 
3445   assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
3446 
3447   if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
3448     return true;
3449 
3450   if (OpInfo.RegClass < 0)
3451     return false;
3452 
3453   if (MO.isImm() && isInlineConstant(MO, OpInfo)) {
3454     if (isMAI(MI) && ST.hasMFMAInlineLiteralBug() &&
        OpNo == (unsigned)AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                                     AMDGPU::OpName::src2))
3457       return false;
3458     return RI.opCanUseInlineConstant(OpInfo.OperandType);
3459   }
3460 
3461   if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
3462     return false;
3463 
3464   if (!isVOP3(MI) || !AMDGPU::isSISrcOperand(InstDesc, OpNo))
3465     return true;
3466 
3467   return ST.hasVOP3Literal();
3468 }
3469 
3470 bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
3471   // GFX90A does not have V_MUL_LEGACY_F32_e32.
3472   if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
3473     return false;
3474 
3475   int Op32 = AMDGPU::getVOPe32(Opcode);
3476   if (Op32 == -1)
3477     return false;
3478 
3479   return pseudoToMCOpcode(Op32) != -1;
3480 }
3481 
3482 bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
  // The src0_modifiers operand is present on all instructions
  // that have modifiers.
3485 
3486   return AMDGPU::getNamedOperandIdx(Opcode,
3487                                     AMDGPU::OpName::src0_modifiers) != -1;
3488 }
3489 
3490 bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
3491                                   unsigned OpName) const {
3492   const MachineOperand *Mods = getNamedOperand(MI, OpName);
3493   return Mods && Mods->getImm();
3494 }
3495 
3496 bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
3497   return hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
3498          hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
3499          hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers) ||
3500          hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
3501          hasModifiersSet(MI, AMDGPU::OpName::omod);
3502 }
3503 
3504 bool SIInstrInfo::canShrink(const MachineInstr &MI,
3505                             const MachineRegisterInfo &MRI) const {
3506   const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
3507   // Can't shrink instruction with three operands.
3508   // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add
3509   // a special case for it.  It can only be shrunk if the third operand
3510   // is vcc, and src0_modifiers and src1_modifiers are not set.
  // We should handle this the same way we handle vopc, by adding
  // a register allocation hint pre-regalloc and then doing the shrinking
3513   // post-regalloc.
3514   if (Src2) {
3515     switch (MI.getOpcode()) {
3516       default: return false;
3517 
3518       case AMDGPU::V_ADDC_U32_e64:
3519       case AMDGPU::V_SUBB_U32_e64:
3520       case AMDGPU::V_SUBBREV_U32_e64: {
3521         const MachineOperand *Src1
3522           = getNamedOperand(MI, AMDGPU::OpName::src1);
3523         if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
3524           return false;
3525         // Additional verification is needed for sdst/src2.
3526         return true;
3527       }
3528       case AMDGPU::V_MAC_F32_e64:
3529       case AMDGPU::V_MAC_F16_e64:
3530       case AMDGPU::V_FMAC_F32_e64:
3531       case AMDGPU::V_FMAC_F16_e64:
3532       case AMDGPU::V_FMAC_F64_e64:
3533         if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
3534             hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
3535           return false;
3536         break;
3537 
3538       case AMDGPU::V_CNDMASK_B32_e64:
3539         break;
3540     }
3541   }
3542 
3543   const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
3544   if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
3545                hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
3546     return false;
3547 
  // We don't need to check src0; all input types are legal, so just make sure
3549   // src0 isn't using any modifiers.
3550   if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
3551     return false;
3552 
3553   // Can it be shrunk to a valid 32 bit opcode?
3554   if (!hasVALU32BitEncoding(MI.getOpcode()))
3555     return false;
3556 
3557   // Check output modifiers
3558   return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
3559          !hasModifiersSet(MI, AMDGPU::OpName::clamp);
3560 }
3561 
// Copy the undef and kill flags from \p Orig to the implicit VCC use operand
// of \p MI, keeping that operand implicit.
3564 static void copyFlagsToImplicitVCC(MachineInstr &MI,
3565                                    const MachineOperand &Orig) {
3566 
3567   for (MachineOperand &Use : MI.implicit_operands()) {
3568     if (Use.isUse() &&
3569         (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
3570       Use.setIsUndef(Orig.isUndef());
3571       Use.setIsKill(Orig.isKill());
3572       return;
3573     }
3574   }
3575 }
3576 
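// Builds the 32-bit encoding \p Op32 of \p MI in place, copying the explicit
// source operands and, where the e32 form reads vcc implicitly (e.g.
// V_CNDMASK_B32_e32), fixing up and reflagging the implicit vcc/vcc_lo use.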
3577 MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
3578                                            unsigned Op32) const {
  MachineBasicBlock *MBB = MI.getParent();
3580   MachineInstrBuilder Inst32 =
3581     BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32))
3582     .setMIFlags(MI.getFlags());
3583 
3584   // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
3585   // For VOPC instructions, this is replaced by an implicit def of vcc.
3586   int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst);
3587   if (Op32DstIdx != -1) {
3588     // dst
3589     Inst32.add(MI.getOperand(0));
3590   } else {
3591     assert(((MI.getOperand(0).getReg() == AMDGPU::VCC) ||
3592             (MI.getOperand(0).getReg() == AMDGPU::VCC_LO)) &&
3593            "Unexpected case");
3594   }
3595 
3596   Inst32.add(*getNamedOperand(MI, AMDGPU::OpName::src0));
3597 
3598   const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
3599   if (Src1)
3600     Inst32.add(*Src1);
3601 
3602   const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
3603 
3604   if (Src2) {
3605     int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2);
3606     if (Op32Src2Idx != -1) {
3607       Inst32.add(*Src2);
3608     } else {
3609       // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
3610       // replaced with an implicit read of vcc or vcc_lo. The implicit read
3611       // of vcc was already added during the initial BuildMI, but we
3612       // 1) may need to change vcc to vcc_lo to preserve the original register
3613       // 2) have to preserve the original flags.
3614       fixImplicitOperands(*Inst32);
3615       copyFlagsToImplicitVCC(*Inst32, *Src2);
3616     }
3617   }
3618 
3619   return Inst32;
3620 }
3621 
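// Returns true if \p MO occupies a constant bus read slot for an instruction
// with source operand info \p OpInfo: non-inline immediates, non-register
// operands such as frame indices, and SGPR reads (including implicit m0, vcc
// and vcc_lo), but not sgpr_null.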
3622 bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
3623                                   const MachineOperand &MO,
3624                                   const MCOperandInfo &OpInfo) const {
3625   // Literal constants use the constant bus.
3626   //if (isLiteralConstantLike(MO, OpInfo))
3627   // return true;
3628   if (MO.isImm())
3629     return !isInlineConstant(MO, OpInfo);
3630 
3631   if (!MO.isReg())
3632     return true; // Misc other operands like FrameIndex
3633 
3634   if (!MO.isUse())
3635     return false;
3636 
3637   if (MO.getReg().isVirtual())
3638     return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));
3639 
3640   // Null is free
3641   if (MO.getReg() == AMDGPU::SGPR_NULL)
3642     return false;
3643 
3644   // SGPRs use the constant bus
3645   if (MO.isImplicit()) {
3646     return MO.getReg() == AMDGPU::M0 ||
3647            MO.getReg() == AMDGPU::VCC ||
3648            MO.getReg() == AMDGPU::VCC_LO;
3649   } else {
3650     return AMDGPU::SReg_32RegClass.contains(MO.getReg()) ||
3651            AMDGPU::SReg_64RegClass.contains(MO.getReg());
3652   }
3653 }
3654 
3655 static Register findImplicitSGPRRead(const MachineInstr &MI) {
3656   for (const MachineOperand &MO : MI.implicit_operands()) {
3657     // We only care about reads.
3658     if (MO.isDef())
3659       continue;
3660 
3661     switch (MO.getReg()) {
3662     case AMDGPU::VCC:
3663     case AMDGPU::VCC_LO:
3664     case AMDGPU::VCC_HI:
3665     case AMDGPU::M0:
3666     case AMDGPU::FLAT_SCR:
3667       return MO.getReg();
3668 
3669     default:
3670       break;
3671     }
3672   }
3673 
3674   return AMDGPU::NoRegister;
3675 }
3676 
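// All VALU instructions except the lane access opcodes (v_readlane_b32 and
// v_writelane_b32) are expected to read EXEC; SALU, SMRD, generic and
// pre-ISel opcodes are not.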
3677 static bool shouldReadExec(const MachineInstr &MI) {
3678   if (SIInstrInfo::isVALU(MI)) {
3679     switch (MI.getOpcode()) {
3680     case AMDGPU::V_READLANE_B32:
3681     case AMDGPU::V_WRITELANE_B32:
3682       return false;
3683     }
3684 
3685     return true;
3686   }
3687 
3688   if (MI.isPreISelOpcode() ||
3689       SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
3690       SIInstrInfo::isSALU(MI) ||
3691       SIInstrInfo::isSMRD(MI))
3692     return false;
3693 
3694   return true;
3695 }
3696 
3697 static bool isSubRegOf(const SIRegisterInfo &TRI,
3698                        const MachineOperand &SuperVec,
3699                        const MachineOperand &SubReg) {
3700   if (SubReg.getReg().isPhysical())
3701     return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
3702 
3703   return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
3704          SubReg.getReg() == SuperVec.getReg();
3705 }
3706 
3707 bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
3708                                     StringRef &ErrInfo) const {
3709   uint16_t Opcode = MI.getOpcode();
3710   if (SIInstrInfo::isGenericOpcode(MI.getOpcode()))
3711     return true;
3712 
3713   const MachineFunction *MF = MI.getParent()->getParent();
3714   const MachineRegisterInfo &MRI = MF->getRegInfo();
3715 
3716   int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
3717   int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
3718   int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
3719 
3720   // Make sure the number of operands is correct.
3721   const MCInstrDesc &Desc = get(Opcode);
3722   if (!Desc.isVariadic() &&
3723       Desc.getNumOperands() != MI.getNumExplicitOperands()) {
3724     ErrInfo = "Instruction has wrong number of operands.";
3725     return false;
3726   }
3727 
3728   if (MI.isInlineAsm()) {
3729     // Verify register classes for inlineasm constraints.
3730     for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
3731          I != E; ++I) {
3732       const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
3733       if (!RC)
3734         continue;
3735 
3736       const MachineOperand &Op = MI.getOperand(I);
3737       if (!Op.isReg())
3738         continue;
3739 
3740       Register Reg = Op.getReg();
3741       if (!Reg.isVirtual() && !RC->contains(Reg)) {
3742         ErrInfo = "inlineasm operand has incorrect register class.";
3743         return false;
3744       }
3745     }
3746 
3747     return true;
3748   }
3749 
3750   if (isMIMG(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
3751     ErrInfo = "missing memory operand from MIMG instruction.";
3752     return false;
3753   }
3754 
3755   // Make sure the register classes are correct.
3756   for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
3757     const MachineOperand &MO = MI.getOperand(i);
3758     if (MO.isFPImm()) {
3759       ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
3760                 "all fp values to integers.";
3761       return false;
3762     }
3763 
3764     int RegClass = Desc.OpInfo[i].RegClass;
3765 
3766     switch (Desc.OpInfo[i].OperandType) {
3767     case MCOI::OPERAND_REGISTER:
3768       if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
3769         ErrInfo = "Illegal immediate value for operand.";
3770         return false;
3771       }
3772       break;
3773     case AMDGPU::OPERAND_REG_IMM_INT32:
3774     case AMDGPU::OPERAND_REG_IMM_FP32:
3775       break;
3776     case AMDGPU::OPERAND_REG_INLINE_C_INT32:
3777     case AMDGPU::OPERAND_REG_INLINE_C_FP32:
3778     case AMDGPU::OPERAND_REG_INLINE_C_INT64:
3779     case AMDGPU::OPERAND_REG_INLINE_C_FP64:
3780     case AMDGPU::OPERAND_REG_INLINE_C_INT16:
3781     case AMDGPU::OPERAND_REG_INLINE_C_FP16:
3782     case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
3783     case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
3784     case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
3785     case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
3786     case AMDGPU::OPERAND_REG_INLINE_AC_FP64: {
3787       if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
3788         ErrInfo = "Illegal immediate value for operand.";
3789         return false;
3790       }
3791       break;
3792     }
3793     case MCOI::OPERAND_IMMEDIATE:
3794     case AMDGPU::OPERAND_KIMM32:
3795       // Check if this operand is an immediate.
3796       // FrameIndex operands will be replaced by immediates, so they are
3797       // allowed.
3798       if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
3799         ErrInfo = "Expected immediate, but got non-immediate";
3800         return false;
3801       }
3802       LLVM_FALLTHROUGH;
3803     default:
3804       continue;
3805     }
3806 
3807     if (!MO.isReg())
3808       continue;
3809     Register Reg = MO.getReg();
3810     if (!Reg)
3811       continue;
3812 
3813     // FIXME: Ideally we would have separate instruction definitions with the
3814     // aligned register constraint.
3815     // FIXME: We do not verify inline asm operands, but custom inline asm
3816     // verification is broken anyway
3817     if (ST.needsAlignedVGPRs()) {
3818       const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
3819       const bool IsVGPR = RI.hasVGPRs(RC);
3820       const bool IsAGPR = !IsVGPR && RI.hasAGPRs(RC);
3821       if ((IsVGPR || IsAGPR) && MO.getSubReg()) {
3822         const TargetRegisterClass *SubRC =
3823             RI.getSubRegClass(RC, MO.getSubReg());
3824         RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg());
3825         if (RC)
3826           RC = SubRC;
3827       }
3828 
3829       // Check that this is the aligned version of the class.
3830       if (!RC || !RI.isProperlyAlignedRC(*RC)) {
3831         ErrInfo = "Subtarget requires even aligned vector registers";
3832         return false;
3833       }
3834     }
3835 
3836     if (RegClass != -1) {
3837       if (Reg.isVirtual())
3838         continue;
3839 
3840       const TargetRegisterClass *RC = RI.getRegClass(RegClass);
3841       if (!RC->contains(Reg)) {
3842         ErrInfo = "Operand has incorrect register class.";
3843         return false;
3844       }
3845     }
3846   }
3847 
3848   // Verify SDWA
3849   if (isSDWA(MI)) {
3850     if (!ST.hasSDWA()) {
3851       ErrInfo = "SDWA is not supported on this target";
3852       return false;
3853     }
3854 
3855     int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
3856 
    const int OpIndices[] = { DstIdx, Src0Idx, Src1Idx, Src2Idx };

    for (int OpIdx : OpIndices) {
3860       if (OpIdx == -1)
3861         continue;
3862       const MachineOperand &MO = MI.getOperand(OpIdx);
3863 
3864       if (!ST.hasSDWAScalar()) {
        // Only VGPRs on VI
3866         if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
3867           ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
3868           return false;
3869         }
3870       } else {
3871         // No immediates on GFX9
3872         if (!MO.isReg()) {
3873           ErrInfo =
3874             "Only reg allowed as operands in SDWA instructions on GFX9+";
3875           return false;
3876         }
3877       }
3878     }
3879 
3880     if (!ST.hasSDWAOmod()) {
3881       // No omod allowed on VI
3882       const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
3883       if (OMod != nullptr &&
3884         (!OMod->isImm() || OMod->getImm() != 0)) {
3885         ErrInfo = "OMod not allowed in SDWA instructions on VI";
3886         return false;
3887       }
3888     }
3889 
3890     uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
3891     if (isVOPC(BasicOpcode)) {
3892       if (!ST.hasSDWASdst() && DstIdx != -1) {
3893         // Only vcc allowed as dst on VI for VOPC
3894         const MachineOperand &Dst = MI.getOperand(DstIdx);
3895         if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
3896           ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
3897           return false;
3898         }
3899       } else if (!ST.hasSDWAOutModsVOPC()) {
3900         // No clamp allowed on GFX9 for VOPC
3901         const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
3902         if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
3903           ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
3904           return false;
3905         }
3906 
3907         // No omod allowed on GFX9 for VOPC
3908         const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
3909         if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
3910           ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
3911           return false;
3912         }
3913       }
3914     }
3915 
3916     const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
3917     if (DstUnused && DstUnused->isImm() &&
3918         DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
3919       const MachineOperand &Dst = MI.getOperand(DstIdx);
3920       if (!Dst.isReg() || !Dst.isTied()) {
3921         ErrInfo = "Dst register should have tied register";
3922         return false;
3923       }
3924 
3925       const MachineOperand &TiedMO =
3926           MI.getOperand(MI.findTiedOperandIdx(DstIdx));
3927       if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
3928         ErrInfo =
3929             "Dst register should be tied to implicit use of preserved register";
3930         return false;
3931       } else if (TiedMO.getReg().isPhysical() &&
3932                  Dst.getReg() != TiedMO.getReg()) {
3933         ErrInfo = "Dst register should use same physical register as preserved";
3934         return false;
3935       }
3936     }
3937   }
3938 
3939   // Verify MIMG
3940   if (isMIMG(MI.getOpcode()) && !MI.mayStore()) {
3941     // Ensure that the return type used is large enough for all the options
3942     // being used TFE/LWE require an extra result register.
3943     const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
3944     if (DMask) {
3945       uint64_t DMaskImm = DMask->getImm();
3946       uint32_t RegCount =
3947           isGather4(MI.getOpcode()) ? 4 : countPopulation(DMaskImm);
3948       const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
3949       const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
3950       const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
3951 
3952       // Adjust for packed 16 bit values
3953       if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
3954         RegCount >>= 1;
3955 
3956       // Adjust if using LWE or TFE
3957       if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
3958         RegCount += 1;
3959 
3960       const uint32_t DstIdx =
3961           AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
3962       const MachineOperand &Dst = MI.getOperand(DstIdx);
3963       if (Dst.isReg()) {
3964         const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
3965         uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
3966         if (RegCount > DstSize) {
3967           ErrInfo = "MIMG instruction returns too many registers for dst "
3968                     "register class";
3969           return false;
3970         }
3971       }
3972     }
3973   }
3974 
3975   // Verify VOP*. Ignore multiple sgpr operands on writelane.
  if (Desc.getOpcode() != AMDGPU::V_WRITELANE_B32 &&
      (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI))) {
3978     // Only look at the true operands. Only a real operand can use the constant
3979     // bus, and we don't want to check pseudo-operands like the source modifier
3980     // flags.
3981     const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };
3982 
3983     unsigned ConstantBusCount = 0;
3984     bool UsesLiteral = false;
3985     const MachineOperand *LiteralVal = nullptr;
3986 
3987     if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1)
3988       ++ConstantBusCount;
3989 
3990     SmallVector<Register, 2> SGPRsUsed;
3991     Register SGPRUsed;
3992 
3993     for (int OpIdx : OpIndices) {
3994       if (OpIdx == -1)
3995         break;
3996       const MachineOperand &MO = MI.getOperand(OpIdx);
3997       if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) {
3998         if (MO.isReg()) {
3999           SGPRUsed = MO.getReg();
4000           if (llvm::all_of(SGPRsUsed, [SGPRUsed](unsigned SGPR) {
4001                 return SGPRUsed != SGPR;
4002               })) {
4003             ++ConstantBusCount;
4004             SGPRsUsed.push_back(SGPRUsed);
4005           }
4006         } else {
4007           if (!UsesLiteral) {
4008             ++ConstantBusCount;
4009             UsesLiteral = true;
4010             LiteralVal = &MO;
4011           } else if (!MO.isIdenticalTo(*LiteralVal)) {
4012             assert(isVOP3(MI));
4013             ErrInfo = "VOP3 instruction uses more than one literal";
4014             return false;
4015           }
4016         }
4017       }
4018     }
4019 
4020     SGPRUsed = findImplicitSGPRRead(MI);
4021     if (SGPRUsed != AMDGPU::NoRegister) {
      // Implicit uses may safely overlap true operands.
4023       if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
4024             return !RI.regsOverlap(SGPRUsed, SGPR);
4025           })) {
4026         ++ConstantBusCount;
4027         SGPRsUsed.push_back(SGPRUsed);
4028       }
4029     }
4030 
    // v_writelane_b32 is an exception to the constant bus restriction: vsrc0
    // can be an SGPR, a constant, or m0, and the lane select can be an SGPR,
    // m0, or an inline constant.
4033     if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
4034         Opcode != AMDGPU::V_WRITELANE_B32) {
4035       ErrInfo = "VOP* instruction violates constant bus restriction";
4036       return false;
4037     }
4038 
4039     if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
4040       ErrInfo = "VOP3 instruction uses literal";
4041       return false;
4042     }
4043   }
4044 
  // Special case for writelane - this can break the multiple constant bus
  // rule, but it still can't use more than one SGPR register.
4047   if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
4048     unsigned SGPRCount = 0;
4049     Register SGPRUsed = AMDGPU::NoRegister;
4050 
4051     for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx}) {
4052       if (OpIdx == -1)
4053         break;
4054 
4055       const MachineOperand &MO = MI.getOperand(OpIdx);
4056 
4057       if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) {
4058         if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
4059           if (MO.getReg() != SGPRUsed)
4060             ++SGPRCount;
4061           SGPRUsed = MO.getReg();
4062         }
4063       }
4064       if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
4065         ErrInfo = "WRITELANE instruction violates constant bus restriction";
4066         return false;
4067       }
4068     }
4069   }
4070 
4071   // Verify misc. restrictions on specific instructions.
4072   if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||
4073       Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {
4074     const MachineOperand &Src0 = MI.getOperand(Src0Idx);
4075     const MachineOperand &Src1 = MI.getOperand(Src1Idx);
4076     const MachineOperand &Src2 = MI.getOperand(Src2Idx);
4077     if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
4078       if (!compareMachineOp(Src0, Src1) &&
4079           !compareMachineOp(Src0, Src2)) {
4080         ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
4081         return false;
4082       }
4083     }
4084     if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
4085          SISrcMods::ABS) ||
4086         (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() &
4087          SISrcMods::ABS) ||
4088         (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
4089          SISrcMods::ABS)) {
4090       ErrInfo = "ABS not allowed in VOP3B instructions";
4091       return false;
4092     }
4093   }
4094 
4095   if (isSOP2(MI) || isSOPC(MI)) {
4096     const MachineOperand &Src0 = MI.getOperand(Src0Idx);
4097     const MachineOperand &Src1 = MI.getOperand(Src1Idx);
4098     unsigned Immediates = 0;
4099 
4100     if (!Src0.isReg() &&
4101         !isInlineConstant(Src0, Desc.OpInfo[Src0Idx].OperandType))
4102       Immediates++;
4103     if (!Src1.isReg() &&
4104         !isInlineConstant(Src1, Desc.OpInfo[Src1Idx].OperandType))
4105       Immediates++;
4106 
4107     if (Immediates > 1) {
4108       ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
4109       return false;
4110     }
4111   }
4112 
4113   if (isSOPK(MI)) {
4114     auto Op = getNamedOperand(MI, AMDGPU::OpName::simm16);
4115     if (Desc.isBranch()) {
4116       if (!Op->isMBB()) {
4117         ErrInfo = "invalid branch target for SOPK instruction";
4118         return false;
4119       }
4120     } else {
4121       uint64_t Imm = Op->getImm();
4122       if (sopkIsZext(MI)) {
4123         if (!isUInt<16>(Imm)) {
4124           ErrInfo = "invalid immediate for SOPK instruction";
4125           return false;
4126         }
4127       } else {
4128         if (!isInt<16>(Imm)) {
4129           ErrInfo = "invalid immediate for SOPK instruction";
4130           return false;
4131         }
4132       }
4133     }
4134   }
4135 
4136   if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
4137       Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
4138       Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
4139       Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
4140     const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
4141                        Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
4142 
4143     const unsigned StaticNumOps = Desc.getNumOperands() +
4144       Desc.getNumImplicitUses();
4145     const unsigned NumImplicitOps = IsDst ? 2 : 1;
4146 
4147     // Allow additional implicit operands. This allows a fixup done by the post
4148     // RA scheduler where the main implicit operand is killed and implicit-defs
4149     // are added for sub-registers that remain live after this instruction.
4150     if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
4151       ErrInfo = "missing implicit register operands";
4152       return false;
4153     }
4154 
4155     const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
4156     if (IsDst) {
4157       if (!Dst->isUse()) {
4158         ErrInfo = "v_movreld_b32 vdst should be a use operand";
4159         return false;
4160       }
4161 
4162       unsigned UseOpIdx;
4163       if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
4164           UseOpIdx != StaticNumOps + 1) {
4165         ErrInfo = "movrel implicit operands should be tied";
4166         return false;
4167       }
4168     }
4169 
4170     const MachineOperand &Src0 = MI.getOperand(Src0Idx);
4171     const MachineOperand &ImpUse
4172       = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
4173     if (!ImpUse.isReg() || !ImpUse.isUse() ||
4174         !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
4175       ErrInfo = "src0 should be subreg of implicit vector use";
4176       return false;
4177     }
4178   }
4179 
4180   // Make sure we aren't losing exec uses in the td files. This mostly requires
4181   // being careful when using let Uses to try to add other use registers.
4182   if (shouldReadExec(MI)) {
4183     if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
4184       ErrInfo = "VALU instruction does not implicitly read exec mask";
4185       return false;
4186     }
4187   }
4188 
4189   if (isSMRD(MI)) {
4190     if (MI.mayStore()) {
4191       // The register offset form of scalar stores may only use m0 as the
4192       // soffset register.
4193       const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soff);
4194       if (Soff && Soff->getReg() != AMDGPU::M0) {
4195         ErrInfo = "scalar stores must use m0 as offset register";
4196         return false;
4197       }
4198     }
4199   }
4200 
4201   if (isFLAT(MI) && !ST.hasFlatInstOffsets()) {
4202     const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
4203     if (Offset->getImm() != 0) {
4204       ErrInfo = "subtarget does not support offsets in flat instructions";
4205       return false;
4206     }
4207   }
4208 
4209   if (isMIMG(MI)) {
4210     const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
4211     if (DimOp) {
4212       int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
4213                                                  AMDGPU::OpName::vaddr0);
4214       int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
4215       const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode);
4216       const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4217           AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
4218       const AMDGPU::MIMGDimInfo *Dim =
4219           AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm());
4220 
4221       if (!Dim) {
4222         ErrInfo = "dim is out of range";
4223         return false;
4224       }
4225 
4226       bool IsA16 = false;
4227       if (ST.hasR128A16()) {
4228         const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128);
4229         IsA16 = R128A16->getImm() != 0;
4230       } else if (ST.hasGFX10A16()) {
4231         const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16);
4232         IsA16 = A16->getImm() != 0;
4233       }
4234 
4235       bool IsNSA = SRsrcIdx - VAddr0Idx > 1;
4236 
4237       unsigned AddrWords =
4238           AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16());
4239 
4240       unsigned VAddrWords;
4241       if (IsNSA) {
4242         VAddrWords = SRsrcIdx - VAddr0Idx;
4243       } else {
4244         const TargetRegisterClass *RC = getOpRegClass(MI, VAddr0Idx);
4245         VAddrWords = MRI.getTargetRegisterInfo()->getRegSizeInBits(*RC) / 32;
4246         if (AddrWords > 8)
4247           AddrWords = 16;
4248         else if (AddrWords > 5)
4249           AddrWords = 8;
4250       }
4251 
4252       if (VAddrWords != AddrWords) {
4253         LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
4254                           << " but got " << VAddrWords << "\n");
4255         ErrInfo = "bad vaddr size";
4256         return false;
4257       }
4258     }
4259   }
4260 
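  // Verify DPP.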
4261   const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
4262   if (DppCt) {
4263     using namespace AMDGPU::DPP;
4264 
4265     unsigned DC = DppCt->getImm();
4266     if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
4267         DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
4268         (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
4269         (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
4270         (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
4271         (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
4272         (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
4273       ErrInfo = "Invalid dpp_ctrl value";
4274       return false;
4275     }
4276     if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
4277         ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
4278       ErrInfo = "Invalid dpp_ctrl value: "
4279                 "wavefront shifts are not supported on GFX10+";
4280       return false;
4281     }
4282     if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
4283         ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
4284       ErrInfo = "Invalid dpp_ctrl value: "
4285                 "broadcasts are not supported on GFX10+";
4286       return false;
4287     }
4288     if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
4289         ST.getGeneration() < AMDGPUSubtarget::GFX10) {
4290       if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&
4291           DC <= DppCtrl::ROW_NEWBCAST_LAST &&
4292           !ST.hasGFX90AInsts()) {
4293         ErrInfo = "Invalid dpp_ctrl value: "
4294                   "row_newbroadcast/row_share is not supported before "
4295                   "GFX90A/GFX10";
4296         return false;
4297       } else if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
4298         ErrInfo = "Invalid dpp_ctrl value: "
4299                   "row_share and row_xmask are not supported before GFX10";
4300         return false;
4301       }
4302     }
4303 
4304     int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
4305     int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
4306 
4307     if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
4308         ((DstIdx >= 0 &&
4309           (Desc.OpInfo[DstIdx].RegClass == AMDGPU::VReg_64RegClassID ||
4310            Desc.OpInfo[DstIdx].RegClass == AMDGPU::VReg_64_Align2RegClassID)) ||
4311          ((Src0Idx >= 0 &&
4312            (Desc.OpInfo[Src0Idx].RegClass == AMDGPU::VReg_64RegClassID ||
4313             Desc.OpInfo[Src0Idx].RegClass ==
4314                 AMDGPU::VReg_64_Align2RegClassID)))) &&
4315         !AMDGPU::isLegal64BitDPPControl(DC)) {
4316       ErrInfo = "Invalid dpp_ctrl value: "
4317                 "64 bit dpp only support row_newbcast";
4318       return false;
4319     }
4320   }
4321 
4322   if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
4323     const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
4324     uint16_t DataNameIdx = isDS(Opcode) ? AMDGPU::OpName::data0
4325                                         : AMDGPU::OpName::vdata;
4326     const MachineOperand *Data = getNamedOperand(MI, DataNameIdx);
4327     const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1);
4328     if (Data && !Data->isReg())
4329       Data = nullptr;
4330 
4331     if (ST.hasGFX90AInsts()) {
4332       if (Dst && Data &&
4333           (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) {
4334         ErrInfo = "Invalid register class: "
4335                   "vdata and vdst should be both VGPR or AGPR";
4336         return false;
4337       }
4338       if (Data && Data2 &&
4339           (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) {
4340         ErrInfo = "Invalid register class: "
4341                   "both data operands should be VGPR or AGPR";
4342         return false;
4343       }
4344     } else {
4345       if ((Dst && RI.isAGPR(MRI, Dst->getReg())) ||
4346           (Data && RI.isAGPR(MRI, Data->getReg())) ||
4347           (Data2 && RI.isAGPR(MRI, Data2->getReg()))) {
4348         ErrInfo = "Invalid register class: "
4349                   "agpr loads and stores not supported on this GPU";
4350         return false;
4351       }
4352     }
4353   }
4354 
4355   if (ST.needsAlignedVGPRs() &&
4356       (MI.getOpcode() == AMDGPU::DS_GWS_INIT ||
4357        MI.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
4358        MI.getOpcode() == AMDGPU::DS_GWS_BARRIER)) {
4359     const MachineOperand *Op = getNamedOperand(MI, AMDGPU::OpName::data0);
4360     Register Reg = Op->getReg();
4361     bool Aligned = true;
4362     if (Reg.isPhysical()) {
4363       Aligned = !(RI.getHWRegIndex(Reg) & 1);
4364     } else {
4365       const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
4366       Aligned = RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
4367                 !(RI.getChannelFromSubReg(Op->getSubReg()) & 1);
4368     }
4369 
4370     if (!Aligned) {
4371       ErrInfo = "Subtarget requires even aligned vector registers "
4372                 "for DS_GWS instructions";
4373       return false;
4374     }
4375   }
4376 
4377   return true;
4378 }
4379 
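// Return the opcode to use when moving this instruction to the VALU, e.g. the
// VALU equivalent of a SALU operation, or INSTRUCTION_LIST_END if there is no
// direct replacement.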
4380 unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
4381   switch (MI.getOpcode()) {
4382   default: return AMDGPU::INSTRUCTION_LIST_END;
4383   case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
4384   case AMDGPU::COPY: return AMDGPU::COPY;
4385   case AMDGPU::PHI: return AMDGPU::PHI;
4386   case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
4387   case AMDGPU::WQM: return AMDGPU::WQM;
4388   case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
4389   case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
4390   case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
4391   case AMDGPU::S_MOV_B32: {
4392     const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
4393     return MI.getOperand(1).isReg() ||
4394            RI.isAGPR(MRI, MI.getOperand(0).getReg()) ?
4395            AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
4396   }
4397   case AMDGPU::S_ADD_I32:
4398     return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
4399   case AMDGPU::S_ADDC_U32:
4400     return AMDGPU::V_ADDC_U32_e32;
4401   case AMDGPU::S_SUB_I32:
4402     return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
4403     // FIXME: These are not consistently handled, and selected when the carry is
4404     // used.
4405   case AMDGPU::S_ADD_U32:
4406     return AMDGPU::V_ADD_CO_U32_e32;
4407   case AMDGPU::S_SUB_U32:
4408     return AMDGPU::V_SUB_CO_U32_e32;
4409   case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
4410   case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64;
4411   case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64;
4412   case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64;
4413   case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
4414   case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
4415   case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
4416   case AMDGPU::S_XNOR_B32:
4417     return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
4418   case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
4419   case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
4420   case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
4421   case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
4422   case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
4423   case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64;
4424   case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
4425   case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64;
4426   case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
4427   case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64;
4428   case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64;
4429   case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64;
4430   case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64;
4431   case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64;
4432   case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
4433   case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
4434   case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
4435   case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
4436   case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32;
4437   case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32;
4438   case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32;
4439   case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32;
4440   case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32;
4441   case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32;
4442   case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32;
4443   case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32;
4444   case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32;
4445   case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32;
4446   case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32;
4447   case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32;
4448   case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e32;
4449   case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e32;
4450   case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
4451   case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
4452   case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
4453   case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
4454   case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
4455   case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
4456   }
4457   llvm_unreachable(
4458       "Unexpected scalar opcode without corresponding vector one!");
4459 }
4460 
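// For load/store, DS and MIMG instructions, map a combined AGPR/VGPR (AV_*)
// register class ID down to the corresponding VGPR-only class unless the
// subtarget can keep the operand allocatable to AGPRs as well; otherwise
// return RCID unchanged.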
4461 static unsigned adjustAllocatableRegClass(const GCNSubtarget &ST,
4462                                           const MachineRegisterInfo &MRI,
4463                                           const MCInstrDesc &TID,
4464                                           unsigned RCID,
4465                                           bool IsAllocatable) {
4466   if ((IsAllocatable || !ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
4467       (TID.mayLoad() || TID.mayStore() ||
4468       (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::MIMG)))) {
4469     switch (RCID) {
4470     case AMDGPU::AV_32RegClassID: return AMDGPU::VGPR_32RegClassID;
4471     case AMDGPU::AV_64RegClassID: return AMDGPU::VReg_64RegClassID;
4472     case AMDGPU::AV_96RegClassID: return AMDGPU::VReg_96RegClassID;
4473     case AMDGPU::AV_128RegClassID: return AMDGPU::VReg_128RegClassID;
4474     case AMDGPU::AV_160RegClassID: return AMDGPU::VReg_160RegClassID;
4475     default:
4476       break;
4477     }
4478   }
4479   return RCID;
4480 }
4481 
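// Return the register class required for operand \p OpNum of an instruction
// described by \p TID, constraining combined AGPR/VGPR classes for memory and
// DS instructions where necessary.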
4482 const TargetRegisterClass *SIInstrInfo::getRegClass(const MCInstrDesc &TID,
4483     unsigned OpNum, const TargetRegisterInfo *TRI,
4484     const MachineFunction &MF)
4485   const {
4486   if (OpNum >= TID.getNumOperands())
4487     return nullptr;
4488   auto RegClass = TID.OpInfo[OpNum].RegClass;
4489   bool IsAllocatable = false;
4490   if (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::FLAT)) {
4491     // vdst and vdata should both be VGPR or AGPR, same for the DS instructions
4492     // with two data operands. Request a register class constrained to VGPR only
4493     // if both operands are present, as Machine Copy Propagation cannot check
4494     // this constraint, and possibly other passes cannot either.
4495     //
4496     // The check is limited to FLAT and DS because atomics in non-flat encoding
4497     // have their vdst and vdata tied to be the same register.
4498     const int VDstIdx = AMDGPU::getNamedOperandIdx(TID.Opcode,
4499                                                    AMDGPU::OpName::vdst);
4500     const int DataIdx = AMDGPU::getNamedOperandIdx(TID.Opcode,
4501         (TID.TSFlags & SIInstrFlags::DS) ? AMDGPU::OpName::data0
4502                                          : AMDGPU::OpName::vdata);
4503     if (DataIdx != -1) {
4504       IsAllocatable = VDstIdx != -1 ||
4505                       AMDGPU::getNamedOperandIdx(TID.Opcode,
4506                                                  AMDGPU::OpName::data1) != -1;
4507     }
4508   }
4509   RegClass = adjustAllocatableRegClass(ST, MF.getRegInfo(), TID, RegClass,
4510                                        IsAllocatable);
4511   return RI.getRegClass(RegClass);
4512 }
4513 
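// Return the register class of operand \p OpNo of \p MI. For variadic
// instructions and operands without a fixed class this falls back to the
// class of the register currently assigned to the operand.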
4514 const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
4515                                                       unsigned OpNo) const {
4516   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
4517   const MCInstrDesc &Desc = get(MI.getOpcode());
4518   if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
4519       Desc.OpInfo[OpNo].RegClass == -1) {
4520     Register Reg = MI.getOperand(OpNo).getReg();
4521 
4522     if (Reg.isVirtual())
4523       return MRI.getRegClass(Reg);
4524     return RI.getPhysRegClass(Reg);
4525   }
4526 
4527   unsigned RCID = Desc.OpInfo[OpNo].RegClass;
4528   RCID = adjustAllocatableRegClass(ST, MRI, Desc, RCID, true);
4529   return RI.getRegClass(RCID);
4530 }
4531 
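// Legalize the operand at \p OpIdx by copying or moving its current value
// into a newly created VGPR-class virtual register and rewriting the operand
// to use that register.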
4532 void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
4533   MachineBasicBlock::iterator I = MI;
4534   MachineBasicBlock *MBB = MI.getParent();
4535   MachineOperand &MO = MI.getOperand(OpIdx);
4536   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
4537   unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass;
4538   const TargetRegisterClass *RC = RI.getRegClass(RCID);
4539   unsigned Size = RI.getRegSizeInBits(*RC);
4540   unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO : AMDGPU::V_MOV_B32_e32;
4541   if (MO.isReg())
4542     Opcode = AMDGPU::COPY;
4543   else if (RI.isSGPRClass(RC))
4544     Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
4545 
4546   const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
4547   const TargetRegisterClass *VRC64 = RI.getVGPR64Class();
4548   if (RI.getCommonSubClass(VRC64, VRC))
4549     VRC = VRC64;
4550   else
4551     VRC = &AMDGPU::VGPR_32RegClass;
4552 
4553   Register Reg = MRI.createVirtualRegister(VRC);
4554   DebugLoc DL = MBB->findDebugLoc(I);
4555   BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
4556   MO.ChangeToRegister(Reg, false);
4557 }
4558 
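// Extract subregister \p SubIdx of \p SuperReg into a fresh virtual register
// of class \p SubRC by inserting COPY instructions before \p MI. Returns the
// new register.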
4559 unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI,
4560                                          MachineRegisterInfo &MRI,
4561                                          MachineOperand &SuperReg,
4562                                          const TargetRegisterClass *SuperRC,
4563                                          unsigned SubIdx,
4564                                          const TargetRegisterClass *SubRC)
4565                                          const {
4566   MachineBasicBlock *MBB = MI->getParent();
4567   DebugLoc DL = MI->getDebugLoc();
4568   Register SubReg = MRI.createVirtualRegister(SubRC);
4569 
4570   if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) {
4571     BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
4572       .addReg(SuperReg.getReg(), 0, SubIdx);
4573     return SubReg;
4574   }
4575 
4576   // Just in case the super register is itself a sub-register, copy it to a new
4577   // value so we don't need to worry about merging its subreg index with the
4578   // SubIdx passed to this function. The register coalescer should be able to
4579   // eliminate this extra copy.
4580   Register NewSuperReg = MRI.createVirtualRegister(SuperRC);
4581 
4582   BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg)
4583     .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg());
4584 
4585   BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
4586     .addReg(NewSuperReg, 0, SubIdx);
4587 
4588   return SubReg;
4589 }
4590 
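// Same as buildExtractSubReg, except that if \p Op is a 64-bit immediate the
// requested 32-bit half is returned as an immediate operand instead.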
4591 MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
4592   MachineBasicBlock::iterator MII,
4593   MachineRegisterInfo &MRI,
4594   MachineOperand &Op,
4595   const TargetRegisterClass *SuperRC,
4596   unsigned SubIdx,
4597   const TargetRegisterClass *SubRC) const {
4598   if (Op.isImm()) {
4599     if (SubIdx == AMDGPU::sub0)
4600       return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
4601     if (SubIdx == AMDGPU::sub1)
4602       return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
4603 
4604     llvm_unreachable("Unhandled register index for immediate");
4605   }
4606 
4607   unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
4608                                        SubIdx, SubRC);
4609   return MachineOperand::CreateReg(SubReg, false);
4610 }
4611 
4612 // Change the order of operands from (0, 1, 2) to (0, 2, 1)
4613 void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
4614   assert(Inst.getNumExplicitOperands() == 3);
4615   MachineOperand Op1 = Inst.getOperand(1);
4616   Inst.RemoveOperand(1);
4617   Inst.addOperand(Op1);
4618 }
4619 
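// Return true if the register operand \p MO is compatible with the register
// class required by \p OpInfo, taking any subregister index into account.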
4620 bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
4621                                     const MCOperandInfo &OpInfo,
4622                                     const MachineOperand &MO) const {
4623   if (!MO.isReg())
4624     return false;
4625 
4626   Register Reg = MO.getReg();
4627 
4628   const TargetRegisterClass *DRC = RI.getRegClass(OpInfo.RegClass);
4629   if (Reg.isPhysical())
4630     return DRC->contains(Reg);
4631 
4632   const TargetRegisterClass *RC = MRI.getRegClass(Reg);
4633 
4634   if (MO.getSubReg()) {
4635     const MachineFunction *MF = MO.getParent()->getParent()->getParent();
4636     const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
4637     if (!SuperRC)
4638       return false;
4639 
4640     DRC = RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg());
4641     if (!DRC)
4642       return false;
4643   }
4644   return RC->hasSuperClassEq(DRC);
4645 }
4646 
4647 bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
4648                                      const MCOperandInfo &OpInfo,
4649                                      const MachineOperand &MO) const {
4650   if (MO.isReg())
4651     return isLegalRegOperand(MRI, OpInfo, MO);
4652 
4653   // Handle non-register types that are treated like immediates.
4654   assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
4655   return true;
4656 }
4657 
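// Check whether \p MO (or the existing operand at \p OpIdx if \p MO is null)
// would be a legal operand at index \p OpIdx of \p MI, enforcing the constant
// bus, literal and AGPR/VGPR restrictions.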
4658 bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
4659                                  const MachineOperand *MO) const {
4660   const MachineFunction &MF = *MI.getParent()->getParent();
4661   const MachineRegisterInfo &MRI = MF.getRegInfo();
4662   const MCInstrDesc &InstDesc = MI.getDesc();
4663   const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx];
4664   const TargetRegisterClass *DefinedRC =
4665       OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
4666   if (!MO)
4667     MO = &MI.getOperand(OpIdx);
4668 
4669   int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
4670   int VOP3LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
4671   if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) {
4672     if (isVOP3(MI) && isLiteralConstantLike(*MO, OpInfo) && !VOP3LiteralLimit--)
4673       return false;
4674 
4675     SmallDenseSet<RegSubRegPair> SGPRsUsed;
4676     if (MO->isReg())
4677       SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));
4678 
4679     for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
4680       if (i == OpIdx)
4681         continue;
4682       const MachineOperand &Op = MI.getOperand(i);
4683       if (Op.isReg()) {
4684         RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
4685         if (!SGPRsUsed.count(SGPR) &&
4686             usesConstantBus(MRI, Op, InstDesc.OpInfo[i])) {
4687           if (--ConstantBusLimit <= 0)
4688             return false;
4689           SGPRsUsed.insert(SGPR);
4690         }
4691       } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) {
4692         if (--ConstantBusLimit <= 0)
4693           return false;
4694       } else if (isVOP3(MI) && AMDGPU::isSISrcOperand(InstDesc, i) &&
4695                  isLiteralConstantLike(Op, InstDesc.OpInfo[i])) {
4696         if (!VOP3LiteralLimit--)
4697           return false;
4698         if (--ConstantBusLimit <= 0)
4699           return false;
4700       }
4701     }
4702   }
4703 
4704   if (MO->isReg()) {
4705     assert(DefinedRC);
4706     if (!isLegalRegOperand(MRI, OpInfo, *MO))
4707       return false;
4708     bool IsAGPR = RI.isAGPR(MRI, MO->getReg());
4709     if (IsAGPR && !ST.hasMAIInsts())
4710       return false;
4711     unsigned Opc = MI.getOpcode();
4712     if (IsAGPR &&
4713         (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
4714         (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
4715       return false;
4716     // Atomics should have both vdst and vdata either VGPR or AGPR.
4717     const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
4718     const int DataIdx = AMDGPU::getNamedOperandIdx(Opc,
4719         isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
4720     if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
4721         MI.getOperand(DataIdx).isReg() &&
4722         RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)
4723       return false;
4724     if ((int)OpIdx == DataIdx) {
4725       if (VDstIdx != -1 &&
4726           RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)
4727         return false;
4728       // DS instructions with 2 data operands must also have matching classes.
4729       const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc,
4730                                                       AMDGPU::OpName::data1);
4731       if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
4732           RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
4733         return false;
4734     }
4735     if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
4736         (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
4737         RI.isSGPRReg(MRI, MO->getReg()))
4738       return false;
4739     return true;
4740   }
4741 
4742   // Handle non-register types that are treated like immediates.
4743   assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
4744 
4745   if (!DefinedRC) {
4746     // This operand expects an immediate.
4747     return true;
4748   }
4749 
4750   return isImmOperandLegal(MI, OpIdx, *MO);
4751 }
4752 
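// Legalize the src0/src1 operands of a VOP2 or VOPC instruction by inserting
// moves or V_READFIRSTLANE copies, or by commuting the instruction, so that
// the constant bus and register class restrictions are satisfied.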
4753 void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
4754                                        MachineInstr &MI) const {
4755   unsigned Opc = MI.getOpcode();
4756   const MCInstrDesc &InstrDesc = get(Opc);
4757 
4758   int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
4759   MachineOperand &Src0 = MI.getOperand(Src0Idx);
4760 
4761   int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
4762   MachineOperand &Src1 = MI.getOperand(Src1Idx);
4763 
4764   // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
4765   // we need to only have one constant bus use before GFX10.
4766   bool HasImplicitSGPR = findImplicitSGPRRead(MI) != AMDGPU::NoRegister;
4767   if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 &&
4768       Src0.isReg() && (RI.isSGPRReg(MRI, Src0.getReg()) ||
4769        isLiteralConstantLike(Src0, InstrDesc.OpInfo[Src0Idx])))
4770     legalizeOpWithMove(MI, Src0Idx);
4771 
4772   // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
4773   // both the value to write (src0) and lane select (src1).  Fix up non-SGPR
4774   // src0/src1 with V_READFIRSTLANE.
4775   if (Opc == AMDGPU::V_WRITELANE_B32) {
4776     const DebugLoc &DL = MI.getDebugLoc();
4777     if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
4778       Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4779       BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
4780           .add(Src0);
4781       Src0.ChangeToRegister(Reg, false);
4782     }
4783     if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
4784       Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4785       const DebugLoc &DL = MI.getDebugLoc();
4786       BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
4787           .add(Src1);
4788       Src1.ChangeToRegister(Reg, false);
4789     }
4790     return;
4791   }
4792 
4793   // No VOP2 instructions support AGPRs.
4794   if (Src0.isReg() && RI.isAGPR(MRI, Src0.getReg()))
4795     legalizeOpWithMove(MI, Src0Idx);
4796 
4797   if (Src1.isReg() && RI.isAGPR(MRI, Src1.getReg()))
4798     legalizeOpWithMove(MI, Src1Idx);
4799 
4800   // VOP2 instructions accept all operand types for src0, so we don't need to
4801   // check its legality. If src1 is already legal, we don't need to do anything.
4802   if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1))
4803     return;
4804 
4805   // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
4806   // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
4807   // select is uniform.
4808   if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
4809       RI.isVGPR(MRI, Src1.getReg())) {
4810     Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4811     const DebugLoc &DL = MI.getDebugLoc();
4812     BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
4813         .add(Src1);
4814     Src1.ChangeToRegister(Reg, false);
4815     return;
4816   }
4817 
4818   // We do not use commuteInstruction here because it is too aggressive and will
4819   // commute if it is possible. We only want to commute here if it improves
4820   // legality. This can be called a fairly large number of times so don't waste
4821   // compile time pointlessly swapping and checking legality again.
4822   if (HasImplicitSGPR || !MI.isCommutable()) {
4823     legalizeOpWithMove(MI, Src1Idx);
4824     return;
4825   }
4826 
4827   // If src0 can be used as src1, commuting will make the operands legal.
4828   // Otherwise we have to give up and insert a move.
4829   //
4830   // TODO: Other immediate-like operand kinds could be commuted if there was a
4831   // MachineOperand::ChangeTo* for them.
4832   if ((!Src1.isImm() && !Src1.isReg()) ||
4833       !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) {
4834     legalizeOpWithMove(MI, Src1Idx);
4835     return;
4836   }
4837 
4838   int CommutedOpc = commuteOpcode(MI);
4839   if (CommutedOpc == -1) {
4840     legalizeOpWithMove(MI, Src1Idx);
4841     return;
4842   }
4843 
4844   MI.setDesc(get(CommutedOpc));
4845 
4846   Register Src0Reg = Src0.getReg();
4847   unsigned Src0SubReg = Src0.getSubReg();
4848   bool Src0Kill = Src0.isKill();
4849 
4850   if (Src1.isImm())
4851     Src0.ChangeToImmediate(Src1.getImm());
4852   else if (Src1.isReg()) {
4853     Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
4854     Src0.setSubReg(Src1.getSubReg());
4855   } else
4856     llvm_unreachable("Should only have register or immediate operands");
4857 
4858   Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
4859   Src1.setSubReg(Src0SubReg);
4860   fixImplicitOperands(MI);
4861 }
4862 
4863 // Legalize VOP3 operands. All operand types are supported for any operand,
4864 // but only one literal constant may be used, and only starting from GFX10.
4865 void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
4866                                        MachineInstr &MI) const {
4867   unsigned Opc = MI.getOpcode();
4868 
4869   int VOP3Idx[3] = {
4870     AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
4871     AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
4872     AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
4873   };
4874 
4875   if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||
4876       Opc == AMDGPU::V_PERMLANEX16_B32_e64) {
4877     // src1 and src2 must be scalar
4878     MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
4879     MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
4880     const DebugLoc &DL = MI.getDebugLoc();
4881     if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
4882       Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4883       BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
4884         .add(Src1);
4885       Src1.ChangeToRegister(Reg, false);
4886     }
4887     if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
4888       Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4889       BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
4890         .add(Src2);
4891       Src2.ChangeToRegister(Reg, false);
4892     }
4893   }
4894 
4895   // Find the one SGPR operand we are allowed to use.
4896   int ConstantBusLimit = ST.getConstantBusLimit(Opc);
4897   int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
4898   SmallDenseSet<unsigned> SGPRsUsed;
4899   Register SGPRReg = findUsedSGPR(MI, VOP3Idx);
4900   if (SGPRReg != AMDGPU::NoRegister) {
4901     SGPRsUsed.insert(SGPRReg);
4902     --ConstantBusLimit;
4903   }
4904 
4905   for (unsigned i = 0; i < 3; ++i) {
4906     int Idx = VOP3Idx[i];
4907     if (Idx == -1)
4908       break;
4909     MachineOperand &MO = MI.getOperand(Idx);
4910 
4911     if (!MO.isReg()) {
4912       if (!isLiteralConstantLike(MO, get(Opc).OpInfo[Idx]))
4913         continue;
4914 
4915       if (LiteralLimit > 0 && ConstantBusLimit > 0) {
4916         --LiteralLimit;
4917         --ConstantBusLimit;
4918         continue;
4919       }
4920 
4921       --LiteralLimit;
4922       --ConstantBusLimit;
4923       legalizeOpWithMove(MI, Idx);
4924       continue;
4925     }
4926 
4927     if (RI.hasAGPRs(MRI.getRegClass(MO.getReg())) &&
4928         !isOperandLegal(MI, Idx, &MO)) {
4929       legalizeOpWithMove(MI, Idx);
4930       continue;
4931     }
4932 
4933     if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
4934       continue; // VGPRs are legal
4935 
4936     // We can use one SGPR in each VOP3 instruction prior to GFX10
4937     // and two starting from GFX10.
4938     if (SGPRsUsed.count(MO.getReg()))
4939       continue;
4940     if (ConstantBusLimit > 0) {
4941       SGPRsUsed.insert(MO.getReg());
4942       --ConstantBusLimit;
4943       continue;
4944     }
4945 
4946     // If we make it this far, then the operand is not legal and we must
4947     // legalize it.
4948     legalizeOpWithMove(MI, Idx);
4949   }
4950 }
4951 
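// Copy the value of the vector register \p SrcReg into a new SGPR of the same
// size by emitting V_READFIRSTLANE_B32 for each 32-bit piece before \p UseMI.
// Returns the new SGPR register.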
4952 Register SIInstrInfo::readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI,
4953                                          MachineRegisterInfo &MRI) const {
4954   const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
4955   const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
4956   Register DstReg = MRI.createVirtualRegister(SRC);
4957   unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
4958 
4959   if (RI.hasAGPRs(VRC)) {
4960     VRC = RI.getEquivalentVGPRClass(VRC);
4961     Register NewSrcReg = MRI.createVirtualRegister(VRC);
4962     BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
4963             get(TargetOpcode::COPY), NewSrcReg)
4964         .addReg(SrcReg);
4965     SrcReg = NewSrcReg;
4966   }
4967 
4968   if (SubRegs == 1) {
4969     BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
4970             get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
4971         .addReg(SrcReg);
4972     return DstReg;
4973   }
4974 
4975   SmallVector<unsigned, 8> SRegs;
4976   for (unsigned i = 0; i < SubRegs; ++i) {
4977     Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4978     BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
4979             get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
4980         .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
4981     SRegs.push_back(SGPR);
4982   }
4983 
4984   MachineInstrBuilder MIB =
4985       BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
4986               get(AMDGPU::REG_SEQUENCE), DstReg);
4987   for (unsigned i = 0; i < SubRegs; ++i) {
4988     MIB.addReg(SRegs[i]);
4989     MIB.addImm(RI.getSubRegFromChannel(i));
4990   }
4991   return DstReg;
4992 }
4993 
4994 void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
4995                                        MachineInstr &MI) const {
4996 
4997   // If the pointer is stored in VGPRs, then we need to move it to
4998   // SGPRs using v_readfirstlane.  This is safe because we only select
4999   // loads with uniform pointers to SMRD instructions, so we know the
5000   // pointer value is uniform.
5001   MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
5002   if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
5003     Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
5004     SBase->setReg(SGPR);
5005   }
5006   MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soff);
5007   if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) {
5008     Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
5009     SOff->setReg(SGPR);
5010   }
5011 }
5012 
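// If a segment-specific FLAT instruction holds a VGPR in its saddr operand,
// try to rewrite it to the corresponding vaddr form (global VADDR or flat
// scratch SV variant). Returns true if the instruction was rewritten.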
5013 bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const {
5014   unsigned Opc = Inst.getOpcode();
5015   int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
5016   if (OldSAddrIdx < 0)
5017     return false;
5018 
5019   assert(isSegmentSpecificFLAT(Inst));
5020 
5021   int NewOpc = AMDGPU::getGlobalVaddrOp(Opc);
5022   if (NewOpc < 0)
5023     NewOpc = AMDGPU::getFlatScratchInstSVfromSS(Opc);
5024   if (NewOpc < 0)
5025     return false;
5026 
5027   MachineRegisterInfo &MRI = Inst.getMF()->getRegInfo();
5028   MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx);
5029   if (RI.isSGPRReg(MRI, SAddr.getReg()))
5030     return false;
5031 
5032   int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr);
5033   if (NewVAddrIdx < 0)
5034     return false;
5035 
5036   int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
5037 
5038   // Check vaddr; it must be zero or absent.
5039   MachineInstr *VAddrDef = nullptr;
5040   if (OldVAddrIdx >= 0) {
5041     MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx);
5042     VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg());
5043     if (!VAddrDef || VAddrDef->getOpcode() != AMDGPU::V_MOV_B32_e32 ||
5044         !VAddrDef->getOperand(1).isImm() ||
5045         VAddrDef->getOperand(1).getImm() != 0)
5046       return false;
5047   }
5048 
5049   const MCInstrDesc &NewDesc = get(NewOpc);
5050   Inst.setDesc(NewDesc);
5051 
5052   // Callers expect the iterator to be valid after this call, so modify the
5053   // instruction in place.
5054   if (OldVAddrIdx == NewVAddrIdx) {
5055     MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx);
5056     // Clear use list from the old vaddr holding a zero register.
5057     MRI.removeRegOperandFromUseList(&NewVAddr);
5058     MRI.moveOperands(&NewVAddr, &SAddr, 1);
5059     Inst.RemoveOperand(OldSAddrIdx);
5060     // Update the use list with the pointer we have just moved from vaddr to
5061     // the saddr position; otherwise the new vaddr is missing from the use list.
5062     MRI.removeRegOperandFromUseList(&NewVAddr);
5063     MRI.addRegOperandToUseList(&NewVAddr);
5064   } else {
5065     assert(OldSAddrIdx == NewVAddrIdx);
5066 
5067     if (OldVAddrIdx >= 0) {
5068       int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc,
5069                                                  AMDGPU::OpName::vdst_in);
5070 
5071       // RemoveOperand doesn't try to fix up tied operand indexes as it goes, so
5072       // it asserts. Untie the operands for now and retie them afterwards.
5073       if (NewVDstIn != -1) {
5074         int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
5075         Inst.untieRegOperand(OldVDstIn);
5076       }
5077 
5078       Inst.RemoveOperand(OldVAddrIdx);
5079 
5080       if (NewVDstIn != -1) {
5081         int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
5082         Inst.tieOperands(NewVDst, NewVDstIn);
5083       }
5084     }
5085   }
5086 
5087   if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg()))
5088     VAddrDef->eraseFromParent();
5089 
5090   return true;
5091 }
5092 
5093 // FIXME: Remove this when SelectionDAG is obsoleted.
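// Legalize the saddr operand of a segment-specific FLAT instruction, either
// by switching to a vaddr form or by reading the address into SGPRs.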
5094 void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI,
5095                                        MachineInstr &MI) const {
5096   if (!isSegmentSpecificFLAT(MI))
5097     return;
5098 
5099   // Fix up SGPR operands in VGPRs. We only select these when the DAG divergence
5100   // thinks they are uniform, so a readfirstlane should be valid.
5101   MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr);
5102   if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg())))
5103     return;
5104 
5105   if (moveFlatAddrToVGPR(MI))
5106     return;
5107 
5108   Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI);
5109   SAddr->setReg(ToSGPR);
5110 }
5111 
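// Copy \p Op into a new virtual register of class \p DstRC and rewrite \p Op
// to use it, folding immediate definitions into the copy where possible.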
5112 void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
5113                                          MachineBasicBlock::iterator I,
5114                                          const TargetRegisterClass *DstRC,
5115                                          MachineOperand &Op,
5116                                          MachineRegisterInfo &MRI,
5117                                          const DebugLoc &DL) const {
5118   Register OpReg = Op.getReg();
5119   unsigned OpSubReg = Op.getSubReg();
5120 
5121   const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
5122       RI.getRegClassForReg(MRI, OpReg), OpSubReg);
5123 
5124   // Check if operand is already the correct register class.
5125   if (DstRC == OpRC)
5126     return;
5127 
5128   Register DstReg = MRI.createVirtualRegister(DstRC);
5129   MachineInstr *Copy =
5130       BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op);
5131 
5132   Op.setReg(DstReg);
5133   Op.setSubReg(0);
5134 
5135   MachineInstr *Def = MRI.getVRegDef(OpReg);
5136   if (!Def)
5137     return;
5138 
5139   // Try to eliminate the copy if it is copying an immediate value.
5140   if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
5141     FoldImmediate(*Copy, *Def, OpReg, &MRI);
5142 
5143   bool ImpDef = Def->isImplicitDef();
5144   while (!ImpDef && Def && Def->isCopy()) {
5145     if (Def->getOperand(1).getReg().isPhysical())
5146       break;
5147     Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
5148     ImpDef = Def && Def->isImplicitDef();
5149   }
5150   if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) &&
5151       !ImpDef)
5152     Copy->addOperand(MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
5153 }
5154 
5155 // Emit the actual waterfall loop, executing the wrapped instruction for each
5156 // unique value of \p Rsrc across all lanes. In the best case we execute 1
5157 // iteration, in the worst case we execute once per lane (64, or 32 on wave32).
5158 static void
5159 emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI,
5160                           MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
5161                           const DebugLoc &DL, MachineOperand &Rsrc) {
5162   MachineFunction &MF = *OrigBB.getParent();
5163   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
5164   const SIRegisterInfo *TRI = ST.getRegisterInfo();
5165   unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5166   unsigned SaveExecOpc =
5167       ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
5168   unsigned XorTermOpc =
5169       ST.isWave32() ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
5170   unsigned AndOpc =
5171       ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
5172   const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
5173 
5174   MachineBasicBlock::iterator I = LoopBB.begin();
5175 
5176   SmallVector<Register, 8> ReadlanePieces;
5177   Register CondReg = AMDGPU::NoRegister;
5178 
5179   Register VRsrc = Rsrc.getReg();
5180   unsigned VRsrcUndef = getUndefRegState(Rsrc.isUndef());
5181 
5182   unsigned RegSize = TRI->getRegSizeInBits(Rsrc.getReg(), MRI);
5183   unsigned NumSubRegs =  RegSize / 32;
5184   assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 && "Unhandled register size");
5185 
5186   for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
5187 
5188     Register CurRegLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
5189     Register CurRegHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
5190 
5191     // Read the next variant <- also loop target.
5192     BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
5193             .addReg(VRsrc, VRsrcUndef, TRI->getSubRegFromChannel(Idx));
5194 
5195     // Read the next variant <- also loop target.
5196     BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi)
5197             .addReg(VRsrc, VRsrcUndef, TRI->getSubRegFromChannel(Idx + 1));
5198 
5199     ReadlanePieces.push_back(CurRegLo);
5200     ReadlanePieces.push_back(CurRegHi);
5201 
5202     // Comparison is to be done as 64-bit.
5203     Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
5204     BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg)
5205             .addReg(CurRegLo)
5206             .addImm(AMDGPU::sub0)
5207             .addReg(CurRegHi)
5208             .addImm(AMDGPU::sub1);
5209 
5210     Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
5211     auto Cmp =
5212         BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), NewCondReg)
5213             .addReg(CurReg);
5214     if (NumSubRegs <= 2)
5215       Cmp.addReg(VRsrc);
5216     else
5217       Cmp.addReg(VRsrc, VRsrcUndef, TRI->getSubRegFromChannel(Idx, 2));
5218 
5219     // Combine the comparison results with AND.
5220     if (CondReg == AMDGPU::NoRegister) // First.
5221       CondReg = NewCondReg;
5222     else { // If not the first, we create an AND.
5223       Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
5224       BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg)
5225               .addReg(CondReg)
5226               .addReg(NewCondReg);
5227       CondReg = AndReg;
5228     }
5229   } // End for loop.
5230 
5231   auto SRsrcRC = TRI->getEquivalentSGPRClass(MRI.getRegClass(VRsrc));
5232   Register SRsrc = MRI.createVirtualRegister(SRsrcRC);
5233 
5234   // Build scalar Rsrc.
5235   auto Merge = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SRsrc);
5236   unsigned Channel = 0;
5237   for (Register Piece : ReadlanePieces) {
5238     Merge.addReg(Piece)
5239          .addImm(TRI->getSubRegFromChannel(Channel++));
5240   }
5241 
5242   // Update Rsrc operand to use the SGPR Rsrc.
5243   Rsrc.setReg(SRsrc);
5244   Rsrc.setIsKill(true);
5245 
5246   Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
5247   MRI.setSimpleHint(SaveExec, CondReg);
5248 
5249   // Update EXEC to matching lanes, saving original to SaveExec.
5250   BuildMI(LoopBB, I, DL, TII.get(SaveExecOpc), SaveExec)
5251       .addReg(CondReg, RegState::Kill);
5252 
5253   // The original instruction is here; we insert the terminators after it.
5254   I = LoopBB.end();
5255 
5256   // Update EXEC, switch all done bits to 0 and all todo bits to 1.
5257   BuildMI(LoopBB, I, DL, TII.get(XorTermOpc), Exec)
5258       .addReg(Exec)
5259       .addReg(SaveExec);
5260 
5261   BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(&LoopBB);
5262 }
5263 
5264 // Build a waterfall loop around \p MI, replacing the VGPR \p Rsrc register
5265 // with SGPRs by iterating over all unique values across all lanes.
5266 // Returns the loop basic block that now contains \p MI.
5267 static MachineBasicBlock *
5268 loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
5269                   MachineOperand &Rsrc, MachineDominatorTree *MDT,
5270                   MachineBasicBlock::iterator Begin = nullptr,
5271                   MachineBasicBlock::iterator End = nullptr) {
5272   MachineBasicBlock &MBB = *MI.getParent();
5273   MachineFunction &MF = *MBB.getParent();
5274   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
5275   const SIRegisterInfo *TRI = ST.getRegisterInfo();
5276   MachineRegisterInfo &MRI = MF.getRegInfo();
5277   if (!Begin.isValid())
5278     Begin = &MI;
5279   if (!End.isValid()) {
5280     End = &MI;
5281     ++End;
5282   }
5283   const DebugLoc &DL = MI.getDebugLoc();
5284   unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5285   unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5286   const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
5287 
5288   Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
5289 
5290   // Save the EXEC mask
5291   BuildMI(MBB, Begin, DL, TII.get(MovExecOpc), SaveExec).addReg(Exec);
5292 
5293   // Killed uses in the instruction we are waterfalling around will be
5294   // incorrect due to the added control-flow.
5295   MachineBasicBlock::iterator AfterMI = MI;
5296   ++AfterMI;
5297   for (auto I = Begin; I != AfterMI; I++) {
5298     for (auto &MO : I->uses()) {
5299       if (MO.isReg() && MO.isUse()) {
5300         MRI.clearKillFlags(MO.getReg());
5301       }
5302     }
5303   }
5304 
5305   // To insert the loop we need to split the block. Move everything after this
5306   // point to a new block, and insert a new empty block between the two.
5307   MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
5308   MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
5309   MachineFunction::iterator MBBI(MBB);
5310   ++MBBI;
5311 
5312   MF.insert(MBBI, LoopBB);
5313   MF.insert(MBBI, RemainderBB);
5314 
5315   LoopBB->addSuccessor(LoopBB);
5316   LoopBB->addSuccessor(RemainderBB);
5317 
5318   // Move the instructions in the range [Begin, End) into LoopBB, and the
5319   // remainder of the block into RemainderBB.
5320   RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
5321   RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end());
5322   LoopBB->splice(LoopBB->begin(), &MBB, Begin, MBB.end());
5323 
5324   MBB.addSuccessor(LoopBB);
5325 
5326   // Update dominators. We know that MBB immediately dominates LoopBB, that
5327   // LoopBB immediately dominates RemainderBB, and that RemainderBB immediately
5328   // dominates all of the successors transferred to it from MBB that MBB used
5329   // to properly dominate.
5330   if (MDT) {
5331     MDT->addNewBlock(LoopBB, &MBB);
5332     MDT->addNewBlock(RemainderBB, LoopBB);
5333     for (auto &Succ : RemainderBB->successors()) {
5334       if (MDT->properlyDominates(&MBB, Succ)) {
5335         MDT->changeImmediateDominator(Succ, RemainderBB);
5336       }
5337     }
5338   }
5339 
5340   emitLoadSRsrcFromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, Rsrc);
5341 
5342   // Restore the EXEC mask
5343   MachineBasicBlock::iterator First = RemainderBB->begin();
5344   BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec);
5345   return LoopBB;
5346 }
5347 
5348 // Extract pointer from Rsrc and return a zero-value Rsrc replacement.
5349 static std::tuple<unsigned, unsigned>
5350 extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) {
5351   MachineBasicBlock &MBB = *MI.getParent();
5352   MachineFunction &MF = *MBB.getParent();
5353   MachineRegisterInfo &MRI = MF.getRegInfo();
5354 
5355   // Extract the ptr from the resource descriptor.
5356   unsigned RsrcPtr =
5357       TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
5358                              AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
5359 
5360   // Create an empty resource descriptor
5361   Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5362   Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
5363   Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
5364   Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
5365   uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
5366 
5367   // Zero64 = 0
5368   BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
5369       .addImm(0);
5370 
5371   // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
5372   BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
5373       .addImm(RsrcDataFormat & 0xFFFFFFFF);
5374 
5375   // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
5376   BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
5377       .addImm(RsrcDataFormat >> 32);
5378 
5379   // NewSRsrc = {Zero64, SRsrcFormat}
5380   BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
5381       .addReg(Zero64)
5382       .addImm(AMDGPU::sub0_sub1)
5383       .addReg(SRsrcFormatLo)
5384       .addImm(AMDGPU::sub2)
5385       .addReg(SRsrcFormatHi)
5386       .addImm(AMDGPU::sub3);
5387 
5388   return std::make_tuple(RsrcPtr, NewSRsrc);
5389 }
5390 
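// Legalize all operands of \p MI, inserting copies, V_READFIRSTLANE copies or
// a waterfall loop as needed so that each operand satisfies its register class
// and constant bus constraints. Returns the waterfall loop block containing
// \p MI if one was created, otherwise nullptr.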
5391 MachineBasicBlock *
5392 SIInstrInfo::legalizeOperands(MachineInstr &MI,
5393                               MachineDominatorTree *MDT) const {
5394   MachineFunction &MF = *MI.getParent()->getParent();
5395   MachineRegisterInfo &MRI = MF.getRegInfo();
5396   MachineBasicBlock *CreatedBB = nullptr;
5397 
5398   // Legalize VOP2
5399   if (isVOP2(MI) || isVOPC(MI)) {
5400     legalizeOperandsVOP2(MRI, MI);
5401     return CreatedBB;
5402   }
5403 
5404   // Legalize VOP3
5405   if (isVOP3(MI)) {
5406     legalizeOperandsVOP3(MRI, MI);
5407     return CreatedBB;
5408   }
5409 
5410   // Legalize SMRD
5411   if (isSMRD(MI)) {
5412     legalizeOperandsSMRD(MRI, MI);
5413     return CreatedBB;
5414   }
5415 
5416   // Legalize FLAT
5417   if (isFLAT(MI)) {
5418     legalizeOperandsFLAT(MRI, MI);
5419     return CreatedBB;
5420   }
5421 
5422   // Legalize REG_SEQUENCE and PHI
5423   // The register class of the operands must be the same type as the register
5424   // class of the output.
5425   if (MI.getOpcode() == AMDGPU::PHI) {
5426     const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
5427     for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
5428       if (!MI.getOperand(i).isReg() || !MI.getOperand(i).getReg().isVirtual())
5429         continue;
5430       const TargetRegisterClass *OpRC =
5431           MRI.getRegClass(MI.getOperand(i).getReg());
5432       if (RI.hasVectorRegisters(OpRC)) {
5433         VRC = OpRC;
5434       } else {
5435         SRC = OpRC;
5436       }
5437     }
5438 
5439     // If any of the operands are VGPR registers, then they all must be
5440     // VGPRs, otherwise we will create illegal VGPR->SGPR copies when
5441     // legalizing them.
5442     if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
5443       if (!VRC) {
5444         assert(SRC);
5445         if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) {
5446           VRC = &AMDGPU::VReg_1RegClass;
5447         } else
5448           VRC = RI.hasAGPRs(getOpRegClass(MI, 0))
5449                     ? RI.getEquivalentAGPRClass(SRC)
5450                     : RI.getEquivalentVGPRClass(SRC);
5451       } else {
5452           VRC = RI.hasAGPRs(getOpRegClass(MI, 0))
5453                     ? RI.getEquivalentAGPRClass(VRC)
5454                     : RI.getEquivalentVGPRClass(VRC);
5455       }
5456       RC = VRC;
5457     } else {
5458       RC = SRC;
5459     }
5460 
5461     // Update all the operands so they have the same type.
5462     for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
5463       MachineOperand &Op = MI.getOperand(I);
5464       if (!Op.isReg() || !Op.getReg().isVirtual())
5465         continue;
5466 
5467       // MI is a PHI instruction.
5468       MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
5469       MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
5470 
5471       // Avoid creating no-op copies with the same src and dst reg class.  These
5472       // confuse some of the machine passes.
5473       legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
5474     }
5475   }
5476 
5477   // REG_SEQUENCE doesn't really require operand legalization, but if one has a
5478   // VGPR dest type and SGPR sources, insert copies so all operands are
5479   // VGPRs. This seems to help operand folding / the register coalescer.
5480   if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
5481     MachineBasicBlock *MBB = MI.getParent();
5482     const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
5483     if (RI.hasVGPRs(DstRC)) {
5484       // Update all the operands so they are VGPR register classes. These may
5485       // not be the same register class because REG_SEQUENCE supports mixing
5486       // subregister index types e.g. sub0_sub1 + sub2 + sub3
5487       for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
5488         MachineOperand &Op = MI.getOperand(I);
5489         if (!Op.isReg() || !Op.getReg().isVirtual())
5490           continue;
5491 
5492         const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
5493         const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
5494         if (VRC == OpRC)
5495           continue;
5496 
5497         legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
5498         Op.setIsKill();
5499       }
5500     }
5501 
5502     return CreatedBB;
5503   }
5504 
5505   // Legalize INSERT_SUBREG
5506   // src0 must have the same register class as dst
5507   if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
5508     Register Dst = MI.getOperand(0).getReg();
5509     Register Src0 = MI.getOperand(1).getReg();
5510     const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
5511     const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
5512     if (DstRC != Src0RC) {
5513       MachineBasicBlock *MBB = MI.getParent();
5514       MachineOperand &Op = MI.getOperand(1);
5515       legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
5516     }
5517     return CreatedBB;
5518   }
5519 
5520   // Legalize SI_INIT_M0
5521   if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
5522     MachineOperand &Src = MI.getOperand(0);
5523     if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
5524       Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
5525     return CreatedBB;
5526   }
5527 
5528   // Legalize MIMG and MUBUF/MTBUF for shaders.
5529   //
5530   // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
5531   // scratch memory access. In both cases, the legalization never involves
5532   // conversion to the addr64 form.
5533   if (isMIMG(MI) || (AMDGPU::isGraphics(MF.getFunction().getCallingConv()) &&
5534                      (isMUBUF(MI) || isMTBUF(MI)))) {
5535     MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc);
5536     if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
5537       CreatedBB = loadSRsrcFromVGPR(*this, MI, *SRsrc, MDT);
5538 
5539     MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp);
5540     if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
5541       CreatedBB = loadSRsrcFromVGPR(*this, MI, *SSamp, MDT);
5542 
5543     return CreatedBB;
5544   }
5545 
5546   // Legalize SI_CALL
5547   if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
5548     MachineOperand *Dest = &MI.getOperand(0);
5549     if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) {
5550       // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN and the
5551       // following copies into the loop block; this includes copies from and
5552       // to physical registers.
5553       unsigned FrameSetupOpcode = getCallFrameSetupOpcode();
5554       unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode();
5555 
5556       // Also move the copies to physical registers into the loop block
5557       MachineBasicBlock &MBB = *MI.getParent();
5558       MachineBasicBlock::iterator Start(&MI);
5559       while (Start->getOpcode() != FrameSetupOpcode)
5560         --Start;
5561       MachineBasicBlock::iterator End(&MI);
5562       while (End->getOpcode() != FrameDestroyOpcode)
5563         ++End;
5564       // Also include following copies of the return value
5565       ++End;
5566       while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() &&
5567              MI.definesRegister(End->getOperand(1).getReg()))
5568         ++End;
5569       CreatedBB = loadSRsrcFromVGPR(*this, MI, *Dest, MDT, Start, End);
5570     }
5571   }
5572 
5573   // Legalize MUBUF* instructions.
5574   int RsrcIdx =
5575       AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
5576   if (RsrcIdx != -1) {
5577     // We have an MUBUF instruction
5578     MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
5579     unsigned RsrcRC = get(MI.getOpcode()).OpInfo[RsrcIdx].RegClass;
5580     if (RI.getCommonSubClass(MRI.getRegClass(Rsrc->getReg()),
5581                              RI.getRegClass(RsrcRC))) {
5582       // The operands are legal.
5583       // FIXME: We may need to legalize operands besides srsrc.
5584       return CreatedBB;
5585     }
5586 
5587     // Legalize a VGPR Rsrc.
5588     //
5589     // If the instruction is _ADDR64, we can avoid a waterfall by extracting
5590     // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
5591     // a zero-value SRsrc.
5592     //
5593     // If the instruction is _OFFSET (both idxen and offen disabled), and we
5594     // support ADDR64 instructions, we can convert to ADDR64 and do the same as
5595     // above.
5596     //
5597     // Otherwise we are on non-ADDR64 hardware, and/or we have
5598     // idxen/offen/bothen and we fall back to a waterfall loop.
5599 
5600     MachineBasicBlock &MBB = *MI.getParent();
5601 
5602     MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
5603     if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
5604       // This is already an ADDR64 instruction so we need to add the pointer
5605       // extracted from the resource descriptor to the current value of VAddr.
5606       Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5607       Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5608       Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
5609 
5610       const auto *BoolXExecRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
5611       Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
5612       Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
5613 
5614       unsigned RsrcPtr, NewSRsrc;
5615       std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
5616 
5617       // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
5618       const DebugLoc &DL = MI.getDebugLoc();
5619       BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo)
5620         .addDef(CondReg0)
5621         .addReg(RsrcPtr, 0, AMDGPU::sub0)
5622         .addReg(VAddr->getReg(), 0, AMDGPU::sub0)
5623         .addImm(0);
5624 
5625       // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
5626       BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi)
5627         .addDef(CondReg1, RegState::Dead)
5628         .addReg(RsrcPtr, 0, AMDGPU::sub1)
5629         .addReg(VAddr->getReg(), 0, AMDGPU::sub1)
5630         .addReg(CondReg0, RegState::Kill)
5631         .addImm(0);
5632 
5633       // NewVaddr = {NewVaddrHi, NewVaddrLo}
5634       BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
5635           .addReg(NewVAddrLo)
5636           .addImm(AMDGPU::sub0)
5637           .addReg(NewVAddrHi)
5638           .addImm(AMDGPU::sub1);
5639 
5640       VAddr->setReg(NewVAddr);
5641       Rsrc->setReg(NewSRsrc);
5642     } else if (!VAddr && ST.hasAddr64()) {
      // This instruction is the _OFFSET variant, so we need to convert it to
      // ADDR64.
5645       assert(ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
5646              "FIXME: Need to emit flat atomics here");
5647 
5648       unsigned RsrcPtr, NewSRsrc;
5649       std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
5650 
5651       Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
5652       MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
5653       MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
5654       MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
5655       unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
5656 
      // Atomics with return have an additional tied operand and are missing
      // some of the special bits.
5659       MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
5660       MachineInstr *Addr64;
5661 
5662       if (!VDataIn) {
5663         // Regular buffer load / store.
5664         MachineInstrBuilder MIB =
5665             BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
5666                 .add(*VData)
5667                 .addReg(NewVAddr)
5668                 .addReg(NewSRsrc)
5669                 .add(*SOffset)
5670                 .add(*Offset);
5671 
5672         if (const MachineOperand *CPol =
5673                 getNamedOperand(MI, AMDGPU::OpName::cpol)) {
5674           MIB.addImm(CPol->getImm());
5675         }
5676 
5677         if (const MachineOperand *TFE =
5678                 getNamedOperand(MI, AMDGPU::OpName::tfe)) {
5679           MIB.addImm(TFE->getImm());
5680         }
5681 
5682         MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz));
5683 
5684         MIB.cloneMemRefs(MI);
5685         Addr64 = MIB;
5686       } else {
5687         // Atomics with return.
5688         Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
5689                      .add(*VData)
5690                      .add(*VDataIn)
5691                      .addReg(NewVAddr)
5692                      .addReg(NewSRsrc)
5693                      .add(*SOffset)
5694                      .add(*Offset)
5695                      .addImm(getNamedImmOperand(MI, AMDGPU::OpName::cpol))
5696                      .cloneMemRefs(MI);
5697       }
5698 
5699       MI.removeFromParent();
5700 
      // NewVAddr = {RsrcPtr:sub1, RsrcPtr:sub0}, i.e. the base pointer
      // extracted from the resource descriptor.
5702       BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
5703               NewVAddr)
5704           .addReg(RsrcPtr, 0, AMDGPU::sub0)
5705           .addImm(AMDGPU::sub0)
5706           .addReg(RsrcPtr, 0, AMDGPU::sub1)
5707           .addImm(AMDGPU::sub1);
5708     } else {
      // This is another variant; legalize Rsrc with a waterfall loop, copying
      // it from VGPRs to SGPRs.
5711       CreatedBB = loadSRsrcFromVGPR(*this, MI, *Rsrc, MDT);
5712       return CreatedBB;
5713     }
5714   }
5715   return CreatedBB;
5716 }
5717 
5718 MachineBasicBlock *SIInstrInfo::moveToVALU(MachineInstr &TopInst,
5719                                            MachineDominatorTree *MDT) const {
5720   SetVectorType Worklist;
5721   Worklist.insert(&TopInst);
5722   MachineBasicBlock *CreatedBB = nullptr;
5723   MachineBasicBlock *CreatedBBTmp = nullptr;
5724 
5725   while (!Worklist.empty()) {
5726     MachineInstr &Inst = *Worklist.pop_back_val();
5727     MachineBasicBlock *MBB = Inst.getParent();
5728     MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
5729 
5730     unsigned Opcode = Inst.getOpcode();
5731     unsigned NewOpcode = getVALUOp(Inst);
5732 
5733     // Handle some special cases
5734     switch (Opcode) {
5735     default:
5736       break;
5737     case AMDGPU::S_ADD_U64_PSEUDO:
5738     case AMDGPU::S_SUB_U64_PSEUDO:
5739       splitScalar64BitAddSub(Worklist, Inst, MDT);
5740       Inst.eraseFromParent();
5741       continue;
5742     case AMDGPU::S_ADD_I32:
5743     case AMDGPU::S_SUB_I32: {
5744       // FIXME: The u32 versions currently selected use the carry.
5745       bool Changed;
5746       std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
5747       if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp)
5748         CreatedBB = CreatedBBTmp;
5749       if (Changed)
5750         continue;
5751 
5752       // Default handling
5753       break;
5754     }
5755     case AMDGPU::S_AND_B64:
5756       splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
5757       Inst.eraseFromParent();
5758       continue;
5759 
5760     case AMDGPU::S_OR_B64:
5761       splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
5762       Inst.eraseFromParent();
5763       continue;
5764 
5765     case AMDGPU::S_XOR_B64:
5766       splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
5767       Inst.eraseFromParent();
5768       continue;
5769 
5770     case AMDGPU::S_NAND_B64:
5771       splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
5772       Inst.eraseFromParent();
5773       continue;
5774 
5775     case AMDGPU::S_NOR_B64:
5776       splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
5777       Inst.eraseFromParent();
5778       continue;
5779 
5780     case AMDGPU::S_XNOR_B64:
5781       if (ST.hasDLInsts())
5782         splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
5783       else
5784         splitScalar64BitXnor(Worklist, Inst, MDT);
5785       Inst.eraseFromParent();
5786       continue;
5787 
5788     case AMDGPU::S_ANDN2_B64:
5789       splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
5790       Inst.eraseFromParent();
5791       continue;
5792 
5793     case AMDGPU::S_ORN2_B64:
5794       splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
5795       Inst.eraseFromParent();
5796       continue;
5797 
5798     case AMDGPU::S_BREV_B64:
5799       splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
5800       Inst.eraseFromParent();
5801       continue;
5802 
5803     case AMDGPU::S_NOT_B64:
5804       splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
5805       Inst.eraseFromParent();
5806       continue;
5807 
5808     case AMDGPU::S_BCNT1_I32_B64:
5809       splitScalar64BitBCNT(Worklist, Inst);
5810       Inst.eraseFromParent();
5811       continue;
5812 
5813     case AMDGPU::S_BFE_I64:
5814       splitScalar64BitBFE(Worklist, Inst);
5815       Inst.eraseFromParent();
5816       continue;
5817 
5818     case AMDGPU::S_LSHL_B32:
5819       if (ST.hasOnlyRevVALUShifts()) {
5820         NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
5821         swapOperands(Inst);
5822       }
5823       break;
5824     case AMDGPU::S_ASHR_I32:
5825       if (ST.hasOnlyRevVALUShifts()) {
5826         NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
5827         swapOperands(Inst);
5828       }
5829       break;
5830     case AMDGPU::S_LSHR_B32:
5831       if (ST.hasOnlyRevVALUShifts()) {
5832         NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
5833         swapOperands(Inst);
5834       }
5835       break;
5836     case AMDGPU::S_LSHL_B64:
5837       if (ST.hasOnlyRevVALUShifts()) {
5838         NewOpcode = AMDGPU::V_LSHLREV_B64_e64;
5839         swapOperands(Inst);
5840       }
5841       break;
5842     case AMDGPU::S_ASHR_I64:
5843       if (ST.hasOnlyRevVALUShifts()) {
5844         NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
5845         swapOperands(Inst);
5846       }
5847       break;
5848     case AMDGPU::S_LSHR_B64:
5849       if (ST.hasOnlyRevVALUShifts()) {
5850         NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
5851         swapOperands(Inst);
5852       }
5853       break;
5854 
5855     case AMDGPU::S_ABS_I32:
5856       lowerScalarAbs(Worklist, Inst);
5857       Inst.eraseFromParent();
5858       continue;
5859 
5860     case AMDGPU::S_CBRANCH_SCC0:
5861     case AMDGPU::S_CBRANCH_SCC1:
5862       // Clear unused bits of vcc
5863       if (ST.isWave32())
5864         BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B32),
5865                 AMDGPU::VCC_LO)
5866             .addReg(AMDGPU::EXEC_LO)
5867             .addReg(AMDGPU::VCC_LO);
5868       else
5869         BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64),
5870                 AMDGPU::VCC)
5871             .addReg(AMDGPU::EXEC)
5872             .addReg(AMDGPU::VCC);
5873       break;
5874 
5875     case AMDGPU::S_BFE_U64:
5876     case AMDGPU::S_BFM_B64:
5877       llvm_unreachable("Moving this op to VALU not implemented");
5878 
5879     case AMDGPU::S_PACK_LL_B32_B16:
5880     case AMDGPU::S_PACK_LH_B32_B16:
5881     case AMDGPU::S_PACK_HH_B32_B16:
5882       movePackToVALU(Worklist, MRI, Inst);
5883       Inst.eraseFromParent();
5884       continue;
5885 
5886     case AMDGPU::S_XNOR_B32:
5887       lowerScalarXnor(Worklist, Inst);
5888       Inst.eraseFromParent();
5889       continue;
5890 
5891     case AMDGPU::S_NAND_B32:
5892       splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
5893       Inst.eraseFromParent();
5894       continue;
5895 
5896     case AMDGPU::S_NOR_B32:
5897       splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
5898       Inst.eraseFromParent();
5899       continue;
5900 
5901     case AMDGPU::S_ANDN2_B32:
5902       splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
5903       Inst.eraseFromParent();
5904       continue;
5905 
5906     case AMDGPU::S_ORN2_B32:
5907       splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
5908       Inst.eraseFromParent();
5909       continue;
5910 
5911     // TODO: remove as soon as everything is ready
5912     // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
5913     // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
5914     // can only be selected from the uniform SDNode.
5915     case AMDGPU::S_ADD_CO_PSEUDO:
5916     case AMDGPU::S_SUB_CO_PSEUDO: {
5917       unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
5918                          ? AMDGPU::V_ADDC_U32_e64
5919                          : AMDGPU::V_SUBB_U32_e64;
5920       const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
5921 
5922       Register CarryInReg = Inst.getOperand(4).getReg();
5923       if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
5924         Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
5925         BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
5926             .addReg(CarryInReg);
5927       }
5928 
5929       Register CarryOutReg = Inst.getOperand(1).getReg();
5930 
5931       Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
5932           MRI.getRegClass(Inst.getOperand(0).getReg())));
5933       MachineInstr *CarryOp =
5934           BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
5935               .addReg(CarryOutReg, RegState::Define)
5936               .add(Inst.getOperand(2))
5937               .add(Inst.getOperand(3))
5938               .addReg(CarryInReg)
5939               .addImm(0);
5940       CreatedBBTmp = legalizeOperands(*CarryOp);
5941       if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp)
5942         CreatedBB = CreatedBBTmp;
5943       MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
5944       addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
5945       Inst.eraseFromParent();
5946     }
5947       continue;
5948     case AMDGPU::S_UADDO_PSEUDO:
5949     case AMDGPU::S_USUBO_PSEUDO: {
5950       const DebugLoc &DL = Inst.getDebugLoc();
5951       MachineOperand &Dest0 = Inst.getOperand(0);
5952       MachineOperand &Dest1 = Inst.getOperand(1);
5953       MachineOperand &Src0 = Inst.getOperand(2);
5954       MachineOperand &Src1 = Inst.getOperand(3);
5955 
5956       unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
5957                          ? AMDGPU::V_ADD_CO_U32_e64
5958                          : AMDGPU::V_SUB_CO_U32_e64;
5959       const TargetRegisterClass *NewRC =
5960           RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
5961       Register DestReg = MRI.createVirtualRegister(NewRC);
5962       MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
5963                                    .addReg(Dest1.getReg(), RegState::Define)
5964                                    .add(Src0)
5965                                    .add(Src1)
5966                                    .addImm(0); // clamp bit
5967 
5968       CreatedBBTmp = legalizeOperands(*NewInstr, MDT);
5969       if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp)
5970         CreatedBB = CreatedBBTmp;
5971 
5972       MRI.replaceRegWith(Dest0.getReg(), DestReg);
5973       addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI,
5974                                    Worklist);
5975       Inst.eraseFromParent();
5976     }
5977       continue;
5978 
5979     case AMDGPU::S_CSELECT_B32:
5980     case AMDGPU::S_CSELECT_B64:
5981       lowerSelect(Worklist, Inst, MDT);
5982       Inst.eraseFromParent();
5983       continue;
5984     }
5985 
5986     if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
5987       // We cannot move this instruction to the VALU, so we should try to
5988       // legalize its operands instead.
5989       CreatedBBTmp = legalizeOperands(Inst, MDT);
5990       if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp)
5991         CreatedBB = CreatedBBTmp;
5992       continue;
5993     }
5994 
5995     // Use the new VALU Opcode.
5996     const MCInstrDesc &NewDesc = get(NewOpcode);
5997     Inst.setDesc(NewDesc);
5998 
    // Remove any references to SCC. Vector instructions can't read from it,
    // and we're just about to add the implicit use / defs of VCC; we don't
    // want both.
6002     for (unsigned i = Inst.getNumOperands() - 1; i > 0; --i) {
6003       MachineOperand &Op = Inst.getOperand(i);
6004       if (Op.isReg() && Op.getReg() == AMDGPU::SCC) {
6005         // Only propagate through live-def of SCC.
6006         if (Op.isDef() && !Op.isDead())
6007           addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
6008         if (Op.isUse())
6009           addSCCDefsToVALUWorklist(Op, Worklist);
6010         Inst.RemoveOperand(i);
6011       }
6012     }
6013 
6014     if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
6015       // We are converting these to a BFE, so we need to add the missing
6016       // operands for the size and offset.
6017       unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
6018       Inst.addOperand(MachineOperand::CreateImm(0));
6019       Inst.addOperand(MachineOperand::CreateImm(Size));
6020 
6021     } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
6022       // The VALU version adds the second operand to the result, so insert an
6023       // extra 0 operand.
6024       Inst.addOperand(MachineOperand::CreateImm(0));
6025     }
6026 
6027     Inst.addImplicitDefUseOperands(*Inst.getParent()->getParent());
6028     fixImplicitOperands(Inst);
6029 
6030     if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
6031       const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
6032       // If we need to move this to VGPRs, we need to unpack the second operand
6033       // back into the 2 separate ones for bit offset and width.
6034       assert(OffsetWidthOp.isImm() &&
6035              "Scalar BFE is only implemented for constant width and offset");
6036       uint32_t Imm = OffsetWidthOp.getImm();
6037 
6038       uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
6039       uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
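      // e.g. a packed operand of 0x100010 unpacks to Offset = 16 and BitWidth
      // = 16: (0x100010 & 0x3f) == 16 and ((0x100010 & 0x7f0000) >> 16) == 16.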
6040       Inst.RemoveOperand(2);                     // Remove old immediate.
6041       Inst.addOperand(MachineOperand::CreateImm(Offset));
6042       Inst.addOperand(MachineOperand::CreateImm(BitWidth));
6043     }
6044 
6045     bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef();
6046     unsigned NewDstReg = AMDGPU::NoRegister;
6047     if (HasDst) {
6048       Register DstReg = Inst.getOperand(0).getReg();
6049       if (DstReg.isPhysical())
6050         continue;
6051 
6052       // Update the destination register class.
6053       const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
6054       if (!NewDstRC)
6055         continue;
6056 
6057       if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
6058           NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
6059         // Instead of creating a copy where src and dst are the same register
6060         // class, we just replace all uses of dst with src.  These kinds of
6061         // copies interfere with the heuristics MachineSink uses to decide
        // whether or not to split a critical edge, since the pass assumes
        // that copies will end up as machine instructions and not be
        // eliminated.
6065         addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
6066         MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
6067         MRI.clearKillFlags(Inst.getOperand(1).getReg());
6068         Inst.getOperand(0).setReg(DstReg);
6069 
6070         // Make sure we don't leave around a dead VGPR->SGPR copy. Normally
6071         // these are deleted later, but at -O0 it would leave a suspicious
6072         // looking illegal copy of an undef register.
6073         for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I)
6074           Inst.RemoveOperand(I);
6075         Inst.setDesc(get(AMDGPU::IMPLICIT_DEF));
6076         continue;
6077       }
6078 
6079       NewDstReg = MRI.createVirtualRegister(NewDstRC);
6080       MRI.replaceRegWith(DstReg, NewDstReg);
6081     }
6082 
6083     // Legalize the operands
6084     CreatedBBTmp = legalizeOperands(Inst, MDT);
6085     if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp)
6086       CreatedBB = CreatedBBTmp;
6087 
6088     if (HasDst)
6089      addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
6090   }
6091   return CreatedBB;
6092 }
6093 
6094 // Add/sub require special handling to deal with carry outs.
6095 std::pair<bool, MachineBasicBlock *>
6096 SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
6097                               MachineDominatorTree *MDT) const {
6098   if (ST.hasAddNoCarry()) {
6099     // Assume there is no user of scc since we don't select this in that case.
6100     // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
6101     // is used.
6102 
6103     MachineBasicBlock &MBB = *Inst.getParent();
6104     MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
6105 
6106     Register OldDstReg = Inst.getOperand(0).getReg();
6107     Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6108 
6109     unsigned Opc = Inst.getOpcode();
6110     assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
6111 
6112     unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
6113       AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
6114 
6115     assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
6116     Inst.RemoveOperand(3);
6117 
6118     Inst.setDesc(get(NewOpc));
6119     Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
6120     Inst.addImplicitDefUseOperands(*MBB.getParent());
6121     MRI.replaceRegWith(OldDstReg, ResultReg);
6122     MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT);
6123 
6124     addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
6125     return std::make_pair(true, NewBB);
6126   }
6127 
6128   return std::make_pair(false, nullptr);
6129 }
6130 
6131 void SIInstrInfo::lowerSelect(SetVectorType &Worklist, MachineInstr &Inst,
6132                               MachineDominatorTree *MDT) const {
6133 
6134   MachineBasicBlock &MBB = *Inst.getParent();
6135   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
6136   MachineBasicBlock::iterator MII = Inst;
6137   DebugLoc DL = Inst.getDebugLoc();
6138 
6139   MachineOperand &Dest = Inst.getOperand(0);
6140   MachineOperand &Src0 = Inst.getOperand(1);
6141   MachineOperand &Src1 = Inst.getOperand(2);
6142   MachineOperand &Cond = Inst.getOperand(3);
6143 
6144   Register SCCSource = Cond.getReg();
6145   // Find SCC def, and if that is a copy (SCC = COPY reg) then use reg instead.
6146   if (!Cond.isUndef()) {
6147     for (MachineInstr &CandI :
6148          make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)),
6149                     Inst.getParent()->rend())) {
6150       if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) !=
6151           -1) {
6152         if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
6153           SCCSource = CandI.getOperand(1).getReg();
6154         }
6155         break;
6156       }
6157     }
6158   }
6159 
  // If this is a trivial select where the condition is effectively not SCC
  // (SCCSource is the source of a copy to SCC), then the select is semantically
  // equivalent to copying SCCSource. Hence, there is no need to create a
  // V_CNDMASK; we can just use SCCSource and bail out.
6164   if ((SCCSource != AMDGPU::SCC) && Src0.isImm() && (Src0.getImm() == -1) &&
6165       Src1.isImm() && (Src1.getImm() == 0)) {
6166     MRI.replaceRegWith(Dest.getReg(), SCCSource);
6167     return;
6168   }
6169 
6170   const TargetRegisterClass *TC = ST.getWavefrontSize() == 64
6171                                       ? &AMDGPU::SReg_64_XEXECRegClass
6172                                       : &AMDGPU::SReg_32_XM0_XEXECRegClass;
6173   Register CopySCC = MRI.createVirtualRegister(TC);
6174 
6175   if (SCCSource == AMDGPU::SCC) {
6176     // Insert a trivial select instead of creating a copy, because a copy from
6177     // SCC would semantically mean just copying a single bit, but we may need
6178     // the result to be a vector condition mask that needs preserving.
6179     unsigned Opcode = (ST.getWavefrontSize() == 64) ? AMDGPU::S_CSELECT_B64
6180                                                     : AMDGPU::S_CSELECT_B32;
6181     auto NewSelect =
6182         BuildMI(MBB, MII, DL, get(Opcode), CopySCC).addImm(-1).addImm(0);
6183     NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
6184   } else {
6185     BuildMI(MBB, MII, DL, get(AMDGPU::COPY), CopySCC).addReg(SCCSource);
6186   }
6187 
6188   Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6189 
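  // V_CNDMASK_B32 selects src1 where the corresponding condition mask bit is
  // set and src0 where it is clear, so Src0 below supplies the "true" value
  // and Src1 the "false" value.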
6190   auto UpdatedInst =
6191       BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), ResultReg)
6192           .addImm(0)
6193           .add(Src1) // False
6194           .addImm(0)
6195           .add(Src0) // True
6196           .addReg(CopySCC);
6197 
6198   MRI.replaceRegWith(Dest.getReg(), ResultReg);
6199   legalizeOperands(*UpdatedInst, MDT);
6200   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
6201 }
6202 
6203 void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist,
6204                                  MachineInstr &Inst) const {
6205   MachineBasicBlock &MBB = *Inst.getParent();
6206   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
6207   MachineBasicBlock::iterator MII = Inst;
6208   DebugLoc DL = Inst.getDebugLoc();
6209 
6210   MachineOperand &Dest = Inst.getOperand(0);
6211   MachineOperand &Src = Inst.getOperand(1);
6212   Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6213   Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6214 
6215   unsigned SubOp = ST.hasAddNoCarry() ?
6216     AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32;
6217 
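  // abs(x) is computed as max(x, 0 - x): negate the source into TmpReg with a
  // vector subtract from zero, then take the signed max against the original
  // value.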
6218   BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
6219     .addImm(0)
6220     .addReg(Src.getReg());
6221 
6222   BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
6223     .addReg(Src.getReg())
6224     .addReg(TmpReg);
6225 
6226   MRI.replaceRegWith(Dest.getReg(), ResultReg);
6227   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
6228 }
6229 
6230 void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist,
6231                                   MachineInstr &Inst) const {
6232   MachineBasicBlock &MBB = *Inst.getParent();
6233   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
6234   MachineBasicBlock::iterator MII = Inst;
6235   const DebugLoc &DL = Inst.getDebugLoc();
6236 
6237   MachineOperand &Dest = Inst.getOperand(0);
6238   MachineOperand &Src0 = Inst.getOperand(1);
6239   MachineOperand &Src1 = Inst.getOperand(2);
6240 
6241   if (ST.hasDLInsts()) {
6242     Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6243     legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
6244     legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
6245 
6246     BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
6247       .add(Src0)
6248       .add(Src1);
6249 
6250     MRI.replaceRegWith(Dest.getReg(), NewDest);
6251     addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
6252   } else {
6253     // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
6254     // invert either source and then perform the XOR. If either source is a
6255     // scalar register, then we can leave the inversion on the scalar unit to
    // achieve a better distribution of scalar and vector instructions.
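    // e.g. for 4-bit values x = 0b1100 and y = 0b1010:
    //   ~(x ^ y) = ~0b0110 = 0b1001, and (~x) ^ y = 0b0011 ^ 0b1010 = 0b1001.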
6257     bool Src0IsSGPR = Src0.isReg() &&
6258                       RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
6259     bool Src1IsSGPR = Src1.isReg() &&
6260                       RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
6261     MachineInstr *Xor;
6262     Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6263     Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6264 
6265     // Build a pair of scalar instructions and add them to the work list.
6266     // The next iteration over the work list will lower these to the vector
6267     // unit as necessary.
6268     if (Src0IsSGPR) {
6269       BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0);
6270       Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
6271       .addReg(Temp)
6272       .add(Src1);
6273     } else if (Src1IsSGPR) {
6274       BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1);
6275       Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
6276       .add(Src0)
6277       .addReg(Temp);
6278     } else {
6279       Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
6280         .add(Src0)
6281         .add(Src1);
6282       MachineInstr *Not =
6283           BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp);
6284       Worklist.insert(Not);
6285     }
6286 
6287     MRI.replaceRegWith(Dest.getReg(), NewDest);
6288 
6289     Worklist.insert(Xor);
6290 
6291     addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
6292   }
6293 }
6294 
6295 void SIInstrInfo::splitScalarNotBinop(SetVectorType &Worklist,
6296                                       MachineInstr &Inst,
6297                                       unsigned Opcode) const {
6298   MachineBasicBlock &MBB = *Inst.getParent();
6299   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
6300   MachineBasicBlock::iterator MII = Inst;
6301   const DebugLoc &DL = Inst.getDebugLoc();
6302 
6303   MachineOperand &Dest = Inst.getOperand(0);
6304   MachineOperand &Src0 = Inst.getOperand(1);
6305   MachineOperand &Src1 = Inst.getOperand(2);
6306 
6307   Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6308   Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6309 
6310   MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
6311     .add(Src0)
6312     .add(Src1);
6313 
6314   MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
6315     .addReg(Interm);
6316 
6317   Worklist.insert(&Op);
6318   Worklist.insert(&Not);
6319 
6320   MRI.replaceRegWith(Dest.getReg(), NewDest);
6321   addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
6322 }
6323 
6324 void SIInstrInfo::splitScalarBinOpN2(SetVectorType& Worklist,
6325                                      MachineInstr &Inst,
6326                                      unsigned Opcode) const {
6327   MachineBasicBlock &MBB = *Inst.getParent();
6328   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
6329   MachineBasicBlock::iterator MII = Inst;
6330   const DebugLoc &DL = Inst.getDebugLoc();
6331 
6332   MachineOperand &Dest = Inst.getOperand(0);
6333   MachineOperand &Src0 = Inst.getOperand(1);
6334   MachineOperand &Src1 = Inst.getOperand(2);
6335 
6336   Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6337   Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6338 
6339   MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
6340     .add(Src1);
6341 
6342   MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
6343     .add(Src0)
6344     .addReg(Interm);
6345 
6346   Worklist.insert(&Not);
6347   Worklist.insert(&Op);
6348 
6349   MRI.replaceRegWith(Dest.getReg(), NewDest);
6350   addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
6351 }
6352 
6353 void SIInstrInfo::splitScalar64BitUnaryOp(
6354     SetVectorType &Worklist, MachineInstr &Inst,
6355     unsigned Opcode, bool Swap) const {
6356   MachineBasicBlock &MBB = *Inst.getParent();
6357   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
6358 
6359   MachineOperand &Dest = Inst.getOperand(0);
6360   MachineOperand &Src0 = Inst.getOperand(1);
6361   DebugLoc DL = Inst.getDebugLoc();
6362 
6363   MachineBasicBlock::iterator MII = Inst;
6364 
6365   const MCInstrDesc &InstDesc = get(Opcode);
6366   const TargetRegisterClass *Src0RC = Src0.isReg() ?
6367     MRI.getRegClass(Src0.getReg()) :
6368     &AMDGPU::SGPR_32RegClass;
6369 
6370   const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
6371 
6372   MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
6373                                                        AMDGPU::sub0, Src0SubRC);
6374 
6375   const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
6376   const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
6377   const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
6378 
6379   Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
6380   MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
6381 
6382   MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
6383                                                        AMDGPU::sub1, Src0SubRC);
6384 
6385   Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
6386   MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
6387 
6388   if (Swap)
6389     std::swap(DestSub0, DestSub1);
6390 
6391   Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
6392   BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
6393     .addReg(DestSub0)
6394     .addImm(AMDGPU::sub0)
6395     .addReg(DestSub1)
6396     .addImm(AMDGPU::sub1);
6397 
6398   MRI.replaceRegWith(Dest.getReg(), FullDestReg);
6399 
6400   Worklist.insert(&LoHalf);
6401   Worklist.insert(&HiHalf);
6402 
6403   // We don't need to legalizeOperands here because for a single operand, src0
6404   // will support any kind of input.
6405 
6406   // Move all users of this moved value.
6407   addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
6408 }
6409 
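// Lower a 64-bit scalar add/sub to the VALU as two 32-bit halves: the low
// halves use V_ADD_CO/V_SUB_CO to produce a carry, the high halves use
// V_ADDC/V_SUBB to consume it, and the pieces are recombined with a
// REG_SEQUENCE.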
6410 void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist,
6411                                          MachineInstr &Inst,
6412                                          MachineDominatorTree *MDT) const {
6413   bool IsAdd = (Inst.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
6414 
6415   MachineBasicBlock &MBB = *Inst.getParent();
6416   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
6417   const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
6418 
6419   Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
6420   Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6421   Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6422 
6423   Register CarryReg = MRI.createVirtualRegister(CarryRC);
6424   Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
6425 
6426   MachineOperand &Dest = Inst.getOperand(0);
6427   MachineOperand &Src0 = Inst.getOperand(1);
6428   MachineOperand &Src1 = Inst.getOperand(2);
6429   const DebugLoc &DL = Inst.getDebugLoc();
6430   MachineBasicBlock::iterator MII = Inst;
6431 
6432   const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
6433   const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
6434   const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
6435   const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);
6436 
6437   MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
6438                                                        AMDGPU::sub0, Src0SubRC);
6439   MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
6440                                                        AMDGPU::sub0, Src1SubRC);
6441 
6442 
6443   MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
6444                                                        AMDGPU::sub1, Src0SubRC);
6445   MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
6446                                                        AMDGPU::sub1, Src1SubRC);
6447 
6448   unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
6449   MachineInstr *LoHalf =
6450     BuildMI(MBB, MII, DL, get(LoOpc), DestSub0)
6451     .addReg(CarryReg, RegState::Define)
6452     .add(SrcReg0Sub0)
6453     .add(SrcReg1Sub0)
6454     .addImm(0); // clamp bit
6455 
6456   unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
6457   MachineInstr *HiHalf =
6458     BuildMI(MBB, MII, DL, get(HiOpc), DestSub1)
6459     .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
6460     .add(SrcReg0Sub1)
6461     .add(SrcReg1Sub1)
6462     .addReg(CarryReg, RegState::Kill)
6463     .addImm(0); // clamp bit
6464 
6465   BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
6466     .addReg(DestSub0)
6467     .addImm(AMDGPU::sub0)
6468     .addReg(DestSub1)
6469     .addImm(AMDGPU::sub1);
6470 
6471   MRI.replaceRegWith(Dest.getReg(), FullDestReg);
6472 
6473   // Try to legalize the operands in case we need to swap the order to keep it
6474   // valid.
6475   legalizeOperands(*LoHalf, MDT);
6476   legalizeOperands(*HiHalf, MDT);
6477 
  // Move all users of this moved value.
6479   addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
6480 }
6481 
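// Split a 64-bit scalar bitwise operation into two independent 32-bit
// operations on the sub0 and sub1 halves; the results are recombined with a
// REG_SEQUENCE and both halves are queued for further VALU lowering.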
6482 void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist,
6483                                            MachineInstr &Inst, unsigned Opcode,
6484                                            MachineDominatorTree *MDT) const {
6485   MachineBasicBlock &MBB = *Inst.getParent();
6486   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
6487 
6488   MachineOperand &Dest = Inst.getOperand(0);
6489   MachineOperand &Src0 = Inst.getOperand(1);
6490   MachineOperand &Src1 = Inst.getOperand(2);
6491   DebugLoc DL = Inst.getDebugLoc();
6492 
6493   MachineBasicBlock::iterator MII = Inst;
6494 
6495   const MCInstrDesc &InstDesc = get(Opcode);
6496   const TargetRegisterClass *Src0RC = Src0.isReg() ?
6497     MRI.getRegClass(Src0.getReg()) :
6498     &AMDGPU::SGPR_32RegClass;
6499 
6500   const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
6501   const TargetRegisterClass *Src1RC = Src1.isReg() ?
6502     MRI.getRegClass(Src1.getReg()) :
6503     &AMDGPU::SGPR_32RegClass;
6504 
6505   const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);
6506 
6507   MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
6508                                                        AMDGPU::sub0, Src0SubRC);
6509   MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
6510                                                        AMDGPU::sub0, Src1SubRC);
6511   MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
6512                                                        AMDGPU::sub1, Src0SubRC);
6513   MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
6514                                                        AMDGPU::sub1, Src1SubRC);
6515 
6516   const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
6517   const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
6518   const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
6519 
6520   Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
6521   MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
6522                               .add(SrcReg0Sub0)
6523                               .add(SrcReg1Sub0);
6524 
6525   Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
6526   MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
6527                               .add(SrcReg0Sub1)
6528                               .add(SrcReg1Sub1);
6529 
6530   Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
6531   BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
6532     .addReg(DestSub0)
6533     .addImm(AMDGPU::sub0)
6534     .addReg(DestSub1)
6535     .addImm(AMDGPU::sub1);
6536 
6537   MRI.replaceRegWith(Dest.getReg(), FullDestReg);
6538 
6539   Worklist.insert(&LoHalf);
6540   Worklist.insert(&HiHalf);
6541 
  // Move all users of this moved value.
6543   addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
6544 }
6545 
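// Lower a 64-bit xnor as xor(not(a), b), preferring to negate whichever
// source is already an SGPR so the S_NOT can stay on the scalar unit; only
// the XOR is queued for further VALU lowering.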
6546 void SIInstrInfo::splitScalar64BitXnor(SetVectorType &Worklist,
6547                                        MachineInstr &Inst,
6548                                        MachineDominatorTree *MDT) const {
6549   MachineBasicBlock &MBB = *Inst.getParent();
6550   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
6551 
6552   MachineOperand &Dest = Inst.getOperand(0);
6553   MachineOperand &Src0 = Inst.getOperand(1);
6554   MachineOperand &Src1 = Inst.getOperand(2);
6555   const DebugLoc &DL = Inst.getDebugLoc();
6556 
6557   MachineBasicBlock::iterator MII = Inst;
6558 
6559   const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
6560 
6561   Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6562 
6563   MachineOperand* Op0;
6564   MachineOperand* Op1;
6565 
6566   if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
6567     Op0 = &Src0;
6568     Op1 = &Src1;
6569   } else {
6570     Op0 = &Src1;
6571     Op1 = &Src0;
6572   }
6573 
6574   BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
6575     .add(*Op0);
6576 
6577   Register NewDest = MRI.createVirtualRegister(DestRC);
6578 
6579   MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
6580     .addReg(Interm)
6581     .add(*Op1);
6582 
6583   MRI.replaceRegWith(Dest.getReg(), NewDest);
6584 
6585   Worklist.insert(&Xor);
6586 }
6587 
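// Lower a 64-bit scalar popcount using V_BCNT_U32_B32, which computes
// popcount(src0) + src1: count the low half with an accumulator of 0, then
// count the high half accumulating into the first result.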
6588 void SIInstrInfo::splitScalar64BitBCNT(
6589     SetVectorType &Worklist, MachineInstr &Inst) const {
6590   MachineBasicBlock &MBB = *Inst.getParent();
6591   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
6592 
6593   MachineBasicBlock::iterator MII = Inst;
6594   const DebugLoc &DL = Inst.getDebugLoc();
6595 
6596   MachineOperand &Dest = Inst.getOperand(0);
6597   MachineOperand &Src = Inst.getOperand(1);
6598 
6599   const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
6600   const TargetRegisterClass *SrcRC = Src.isReg() ?
6601     MRI.getRegClass(Src.getReg()) :
6602     &AMDGPU::SGPR_32RegClass;
6603 
6604   Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6605   Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6606 
6607   const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0);
6608 
6609   MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
6610                                                       AMDGPU::sub0, SrcSubRC);
6611   MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
6612                                                       AMDGPU::sub1, SrcSubRC);
6613 
6614   BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
6615 
6616   BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
6617 
6618   MRI.replaceRegWith(Dest.getReg(), ResultReg);
6619 
  // We don't need to legalize operands here. src0 for either instruction can be
6621   // an SGPR, and the second input is unused or determined here.
6622   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
6623 }
6624 
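// Lower a 64-bit scalar sign-extend-in-register (S_BFE_I64 with offset 0).
// For widths narrower than 32 the low half is sign-extended with V_BFE_I32
// and the high half is filled with copies of the sign bit via an arithmetic
// shift right by 31; a full 32-bit width only needs the sign-bit fill for the
// sub1 half.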
6625 void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist,
6626                                       MachineInstr &Inst) const {
6627   MachineBasicBlock &MBB = *Inst.getParent();
6628   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
6629   MachineBasicBlock::iterator MII = Inst;
6630   const DebugLoc &DL = Inst.getDebugLoc();
6631 
6632   MachineOperand &Dest = Inst.getOperand(0);
6633   uint32_t Imm = Inst.getOperand(2).getImm();
6634   uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
6635   uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
6636 
6637   (void) Offset;
6638 
6639   // Only sext_inreg cases handled.
6640   assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
6641          Offset == 0 && "Not implemented");
6642 
6643   if (BitWidth < 32) {
6644     Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6645     Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6646     Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
6647 
6648     BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo)
6649         .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
6650         .addImm(0)
6651         .addImm(BitWidth);
6652 
6653     BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
6654       .addImm(31)
6655       .addReg(MidRegLo);
6656 
6657     BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
6658       .addReg(MidRegLo)
6659       .addImm(AMDGPU::sub0)
6660       .addReg(MidRegHi)
6661       .addImm(AMDGPU::sub1);
6662 
6663     MRI.replaceRegWith(Dest.getReg(), ResultReg);
6664     addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
6665     return;
6666   }
6667 
6668   MachineOperand &Src = Inst.getOperand(1);
6669   Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6670   Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
6671 
6672   BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
6673     .addImm(31)
6674     .addReg(Src.getReg(), 0, AMDGPU::sub0);
6675 
6676   BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
6677     .addReg(Src.getReg(), 0, AMDGPU::sub0)
6678     .addImm(AMDGPU::sub0)
6679     .addReg(TmpReg)
6680     .addImm(AMDGPU::sub1);
6681 
6682   MRI.replaceRegWith(Dest.getReg(), ResultReg);
6683   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
6684 }
6685 
6686 void SIInstrInfo::addUsersToMoveToVALUWorklist(
6687   Register DstReg,
6688   MachineRegisterInfo &MRI,
6689   SetVectorType &Worklist) const {
6690   for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
6691          E = MRI.use_end(); I != E;) {
6692     MachineInstr &UseMI = *I->getParent();
6693 
6694     unsigned OpNo = 0;
6695 
6696     switch (UseMI.getOpcode()) {
6697     case AMDGPU::COPY:
6698     case AMDGPU::WQM:
6699     case AMDGPU::SOFT_WQM:
6700     case AMDGPU::STRICT_WWM:
6701     case AMDGPU::STRICT_WQM:
6702     case AMDGPU::REG_SEQUENCE:
6703     case AMDGPU::PHI:
6704     case AMDGPU::INSERT_SUBREG:
6705       break;
6706     default:
6707       OpNo = I.getOperandNo();
6708       break;
6709     }
6710 
6711     if (!RI.hasVectorRegisters(getOpRegClass(UseMI, OpNo))) {
6712       Worklist.insert(&UseMI);
6713 
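      // Advance past any remaining use operands inside this instruction so it
      // is only queued once.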
6714       do {
6715         ++I;
6716       } while (I != E && I->getParent() == &UseMI);
6717     } else {
6718       ++I;
6719     }
6720   }
6721 }
6722 
6723 void SIInstrInfo::movePackToVALU(SetVectorType &Worklist,
6724                                  MachineRegisterInfo &MRI,
6725                                  MachineInstr &Inst) const {
6726   Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6727   MachineBasicBlock *MBB = Inst.getParent();
6728   MachineOperand &Src0 = Inst.getOperand(1);
6729   MachineOperand &Src1 = Inst.getOperand(2);
6730   const DebugLoc &DL = Inst.getDebugLoc();
6731 
6732   switch (Inst.getOpcode()) {
6733   case AMDGPU::S_PACK_LL_B32_B16: {
6734     Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6735     Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6736 
6737     // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
6738     // 0.
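    // The result is (Src1 << 16) | (Src0 & 0xffff); e.g. Src0 = 0x5555ABCD
    // and Src1 = 0x33331234 pack to 0x1234ABCD.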
6739     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
6740       .addImm(0xffff);
6741 
6742     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
6743       .addReg(ImmReg, RegState::Kill)
6744       .add(Src0);
6745 
6746     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
6747       .add(Src1)
6748       .addImm(16)
6749       .addReg(TmpReg, RegState::Kill);
6750     break;
6751   }
6752   case AMDGPU::S_PACK_LH_B32_B16: {
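    // V_BFI (bitfield insert) with a 0xffff mask takes the low 16 bits from
    // Src0 and the high 16 bits from Src1.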
6753     Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6754     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
6755       .addImm(0xffff);
6756     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32_e64), ResultReg)
6757       .addReg(ImmReg, RegState::Kill)
6758       .add(Src0)
6759       .add(Src1);
6760     break;
6761   }
6762   case AMDGPU::S_PACK_HH_B32_B16: {
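    // The result is (Src1 & 0xffff0000) | (Src0 >> 16), i.e. the high halves
    // of both sources packed into one register.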
6763     Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6764     Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6765     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
6766       .addImm(16)
6767       .add(Src0);
6768     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
6769       .addImm(0xffff0000);
6770     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32_e64), ResultReg)
6771       .add(Src1)
6772       .addReg(ImmReg, RegState::Kill)
6773       .addReg(TmpReg, RegState::Kill);
6774     break;
6775   }
6776   default:
6777     llvm_unreachable("unhandled s_pack_* instruction");
6778   }
6779 
6780   MachineOperand &Dest = Inst.getOperand(0);
6781   MRI.replaceRegWith(Dest.getReg(), ResultReg);
6782   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
6783 }
6784 
6785 void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
6786                                                MachineInstr &SCCDefInst,
6787                                                SetVectorType &Worklist) const {
6788   bool SCCUsedImplicitly = false;
6789 
6790   // Ensure that def inst defines SCC, which is still live.
6791   assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
6792          !Op.isDead() && Op.getParent() == &SCCDefInst);
6793   SmallVector<MachineInstr *, 4> CopyToDelete;
6794   // This assumes that all the users of SCC are in the same block
6795   // as the SCC def.
6796   for (MachineInstr &MI : // Skip the def inst itself.
6797        make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
6798                   SCCDefInst.getParent()->end())) {
6799     // Check if SCC is used first.
6800     if (MI.findRegisterUseOperandIdx(AMDGPU::SCC, false, &RI) != -1) {
6801       if (MI.isCopy()) {
6802         MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
6803         Register DestReg = MI.getOperand(0).getReg();
6804 
6805         for (auto &User : MRI.use_nodbg_instructions(DestReg)) {
6806           if ((User.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO) ||
6807               (User.getOpcode() == AMDGPU::S_SUB_CO_PSEUDO)) {
6808             User.getOperand(4).setReg(RI.getVCC());
6809             Worklist.insert(&User);
6810           } else if (User.getOpcode() == AMDGPU::V_CNDMASK_B32_e64) {
6811             User.getOperand(5).setReg(RI.getVCC());
6812             // No need to add to Worklist.
6813           }
6814         }
6815         CopyToDelete.push_back(&MI);
6816       } else {
6817         if (MI.getOpcode() == AMDGPU::S_CSELECT_B32 ||
6818             MI.getOpcode() == AMDGPU::S_CSELECT_B64) {
          // S_CSELECT uses SCC implicitly, and its users expect that use to be
          // handled. We cannot preserve the edge to the user, so add an
          // explicit copy: SCC = COPY VCC. The copy will be cleaned up when
          // the user is processed in lowerSelect.
6825           SCCUsedImplicitly = true;
6826         }
6827 
6828         Worklist.insert(&MI);
6829       }
6830     }
6831     // Exit if we find another SCC def.
6832     if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) != -1)
6833       break;
6834   }
6835   for (auto &Copy : CopyToDelete)
6836     Copy->eraseFromParent();
6837 
6838   if (SCCUsedImplicitly) {
6839     BuildMI(*SCCDefInst.getParent(), std::next(SCCDefInst.getIterator()),
6840             SCCDefInst.getDebugLoc(), get(AMDGPU::COPY), AMDGPU::SCC)
6841         .addReg(RI.getVCC());
6842   }
6843 }
6844 
6845 // Instructions that use SCC may be converted to VALU instructions. When that
6846 // happens, the SCC register is changed to VCC_LO. The instruction that defines
6847 // SCC must be changed to an instruction that defines VCC. This function makes
6848 // sure that the instruction that defines SCC is added to the moveToVALU
6849 // worklist.
6850 void SIInstrInfo::addSCCDefsToVALUWorklist(MachineOperand &Op,
6851                                            SetVectorType &Worklist) const {
6852   assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isUse());
6853 
6854   MachineInstr *SCCUseInst = Op.getParent();
  // Look for a preceding instruction that either defines VCC or SCC. If VCC
6856   // then there is nothing to do because the defining instruction has been
6857   // converted to a VALU already. If SCC then that instruction needs to be
6858   // converted to a VALU.
6859   for (MachineInstr &MI :
6860        make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst)),
6861                   SCCUseInst->getParent()->rend())) {
6862     if (MI.modifiesRegister(AMDGPU::VCC, &RI))
6863       break;
6864     if (MI.definesRegister(AMDGPU::SCC, &RI)) {
6865       Worklist.insert(&MI);
6866       break;
6867     }
6868   }
6869 }
6870 
6871 const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
6872   const MachineInstr &Inst) const {
6873   const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
6874 
6875   switch (Inst.getOpcode()) {
6876   // For target instructions, getOpRegClass just returns the virtual register
6877   // class associated with the operand, so we need to find an equivalent VGPR
6878   // register class in order to move the instruction to the VALU.
6879   case AMDGPU::COPY:
6880   case AMDGPU::PHI:
6881   case AMDGPU::REG_SEQUENCE:
6882   case AMDGPU::INSERT_SUBREG:
6883   case AMDGPU::WQM:
6884   case AMDGPU::SOFT_WQM:
6885   case AMDGPU::STRICT_WWM:
6886   case AMDGPU::STRICT_WQM: {
6887     const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
6888     if (RI.hasAGPRs(SrcRC)) {
6889       if (RI.hasAGPRs(NewDstRC))
6890         return nullptr;
6891 
6892       switch (Inst.getOpcode()) {
6893       case AMDGPU::PHI:
6894       case AMDGPU::REG_SEQUENCE:
6895       case AMDGPU::INSERT_SUBREG:
6896         NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);
6897         break;
6898       default:
6899         NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
6900       }
6901 
6902       if (!NewDstRC)
6903         return nullptr;
6904     } else {
6905       if (RI.hasVGPRs(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
6906         return nullptr;
6907 
6908       NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
6909       if (!NewDstRC)
6910         return nullptr;
6911     }
6912 
6913     return NewDstRC;
6914   }
6915   default:
6916     return NewDstRC;
6917   }
6918 }
6919 
6920 // Find the one SGPR operand we are allowed to use.
6921 Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
6922                                    int OpIndices[3]) const {
6923   const MCInstrDesc &Desc = MI.getDesc();
6924 
6925   // Find the one SGPR operand we are allowed to use.
6926   //
6927   // First we need to consider the instruction's operand requirements before
6928   // legalizing. Some operands are required to be SGPRs, such as implicit uses
6929   // of VCC, but we are still bound by the constant bus requirement to only use
6930   // one.
6931   //
6932   // If the operand's class is an SGPR, we can never move it.
6933 
6934   Register SGPRReg = findImplicitSGPRRead(MI);
6935   if (SGPRReg != AMDGPU::NoRegister)
6936     return SGPRReg;
6937 
6938   Register UsedSGPRs[3] = { AMDGPU::NoRegister };
6939   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
6940 
6941   for (unsigned i = 0; i < 3; ++i) {
6942     int Idx = OpIndices[i];
6943     if (Idx == -1)
6944       break;
6945 
6946     const MachineOperand &MO = MI.getOperand(Idx);
6947     if (!MO.isReg())
6948       continue;
6949 
6950     // Is this operand statically required to be an SGPR based on the operand
6951     // constraints?
6952     const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass);
6953     bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
6954     if (IsRequiredSGPR)
6955       return MO.getReg();
6956 
    // If this could be a VGPR or an SGPR, check the dynamic register class.
6958     Register Reg = MO.getReg();
6959     const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
6960     if (RI.isSGPRClass(RegRC))
6961       UsedSGPRs[i] = Reg;
6962   }
6963 
6964   // We don't have a required SGPR operand, so we have a bit more freedom in
6965   // selecting operands to move.
6966 
6967   // Try to select the most used SGPR. If an SGPR is equal to one of the
6968   // others, we choose that.
6969   //
6970   // e.g.
6971   // V_FMA_F32 v0, s0, s0, s0 -> No moves
6972   // V_FMA_F32 v0, s0, s1, s0 -> Move s1
6973 
6974   // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
6975   // prefer those.
6976 
6977   if (UsedSGPRs[0] != AMDGPU::NoRegister) {
6978     if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
6979       SGPRReg = UsedSGPRs[0];
6980   }
6981 
6982   if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) {
6983     if (UsedSGPRs[1] == UsedSGPRs[2])
6984       SGPRReg = UsedSGPRs[1];
6985   }
6986 
6987   return SGPRReg;
6988 }
6989 
6990 MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
6991                                              unsigned OperandName) const {
6992   int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
6993   if (Idx == -1)
6994     return nullptr;
6995 
6996   return &MI.getOperand(Idx);
6997 }
6998 
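// Default flags for the upper half (words 2-3) of a buffer resource
// descriptor. getScratchRsrcWords23 below ORs the remaining scratch-rsrc
// fields on top of this value.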
6999 uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
7000   if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
    return ((uint64_t)AMDGPU::MTBUFFormat::UFMT_32_FLOAT << 44) |
7002            (1ULL << 56) | // RESOURCE_LEVEL = 1
7003            (3ULL << 60); // OOB_SELECT = 3
7004   }
7005 
7006   uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
7007   if (ST.isAmdHsaOS()) {
7008     // Set ATC = 1. GFX9 doesn't have this bit.
7009     if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS)
7010       RsrcDataFormat |= (1ULL << 56);
7011 
7012     // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
    // Note that this also disables TC L2 and therefore decreases performance.
7014     if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS)
7015       RsrcDataFormat |= (2ULL << 59);
7016   }
7017 
7018   return RsrcDataFormat;
7019 }
7020 
7021 uint64_t SIInstrInfo::getScratchRsrcWords23() const {
7022   uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
7023                     AMDGPU::RSRC_TID_ENABLE |
                    0xffffffff; // Size
7025 
7026   // GFX9 doesn't have ELEMENT_SIZE.
7027   if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
7028     uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1;
7029     Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
7030   }
7031 
  // IndexStride: 3 encodes a stride of 64, 2 a stride of 32, matching the
  // wavefront size.
7033   uint64_t IndexStride = ST.getWavefrontSize() == 64 ? 3 : 2;
7034   Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
7035 
7036   // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
7037   // Clear them unless we want a huge stride.
7038   if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
7039       ST.getGeneration() <= AMDGPUSubtarget::GFX9)
7040     Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
7041 
7042   return Rsrc23;
7043 }
7044 
7045 bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
7046   unsigned Opc = MI.getOpcode();
7047 
7048   return isSMRD(Opc);
7049 }
7050 
7051 bool SIInstrInfo::isHighLatencyDef(int Opc) const {
7052   return get(Opc).mayLoad() &&
7053          (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc));
7054 }
7055 
7056 unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI,
7057                                     int &FrameIndex) const {
7058   const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
7059   if (!Addr || !Addr->isFI())
7060     return AMDGPU::NoRegister;
7061 
7062   assert(!MI.memoperands_empty() &&
7063          (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
7064 
7065   FrameIndex = Addr->getIndex();
7066   return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
7067 }
7068 
7069 unsigned SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
7070                                         int &FrameIndex) const {
7071   const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
7072   assert(Addr && Addr->isFI());
7073   FrameIndex = Addr->getIndex();
7074   return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
7075 }
7076 
7077 unsigned SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
7078                                           int &FrameIndex) const {
7079   if (!MI.mayLoad())
7080     return AMDGPU::NoRegister;
7081 
7082   if (isMUBUF(MI) || isVGPRSpill(MI))
7083     return isStackAccess(MI, FrameIndex);
7084 
7085   if (isSGPRSpill(MI))
7086     return isSGPRStackAccess(MI, FrameIndex);
7087 
7088   return AMDGPU::NoRegister;
7089 }
7090 
7091 unsigned SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
7092                                          int &FrameIndex) const {
7093   if (!MI.mayStore())
7094     return AMDGPU::NoRegister;
7095 
7096   if (isMUBUF(MI) || isVGPRSpill(MI))
7097     return isStackAccess(MI, FrameIndex);
7098 
7099   if (isSGPRSpill(MI))
7100     return isSGPRStackAccess(MI, FrameIndex);
7101 
7102   return AMDGPU::NoRegister;
7103 }
7104 
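// Sum the encoded sizes of all instructions bundled with MI.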
7105 unsigned SIInstrInfo::getInstBundleSize(const MachineInstr &MI) const {
7106   unsigned Size = 0;
7107   MachineBasicBlock::const_instr_iterator I = MI.getIterator();
7108   MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
7109   while (++I != E && I->isInsideBundle()) {
7110     assert(!I->isBundle() && "No nested bundle!");
7111     Size += getInstSizeInBytes(*I);
7112   }
7113 
7114   return Size;
7115 }
7116 
7117 unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
7118   unsigned Opc = MI.getOpcode();
7119   const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);
7120   unsigned DescSize = Desc.getSize();
7121 
7122   // If we have a definitive size, we can use it. Otherwise we need to inspect
7123   // the operands to know the size.
7124   if (isFixedSize(MI)) {
7125     unsigned Size = DescSize;
7126 
    // If we hit the buggy offset, an extra NOP will be inserted in MC, so
    // estimate the worst case.
7129     if (MI.isBranch() && ST.hasOffset3fBug())
7130       Size += 4;
7131 
7132     return Size;
7133   }
7134 
  // 4-byte instructions may have a 32-bit literal encoded after them. Check
  // operands that could ever be literals.
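  // For example, a VOP2 add that needs a 32-bit literal occupies the 4-byte
  // base encoding plus 4 bytes for the literal (8 total); the VOP3 form below
  // is reported as 8 + 4 = 12 bytes.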
7137   if (isVALU(MI) || isSALU(MI)) {
7138     int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
7139     if (Src0Idx == -1)
7140       return DescSize; // No operands.
7141 
7142     if (isLiteralConstantLike(MI.getOperand(Src0Idx), Desc.OpInfo[Src0Idx]))
7143       return isVOP3(MI) ? 12 : (DescSize + 4);
7144 
7145     int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
7146     if (Src1Idx == -1)
7147       return DescSize;
7148 
7149     if (isLiteralConstantLike(MI.getOperand(Src1Idx), Desc.OpInfo[Src1Idx]))
7150       return isVOP3(MI) ? 12 : (DescSize + 4);
7151 
7152     int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
7153     if (Src2Idx == -1)
7154       return DescSize;
7155 
7156     if (isLiteralConstantLike(MI.getOperand(Src2Idx), Desc.OpInfo[Src2Idx]))
7157       return isVOP3(MI) ? 12 : (DescSize + 4);
7158 
7159     return DescSize;
7160   }
7161 
7162   // Check whether we have extra NSA words.
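  // With the NSA (non-sequential address) encoding, the first address
  // register is part of the 8-byte base encoding and each further group of up
  // to four address registers adds one 4-byte NSA dword, which is what the
  // expression below computes.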
7163   if (isMIMG(MI)) {
7164     int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
7165     if (VAddr0Idx < 0)
7166       return 8;
7167 
7168     int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
7169     return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
7170   }
7171 
7172   switch (Opc) {
7173   case TargetOpcode::IMPLICIT_DEF:
7174   case TargetOpcode::KILL:
7175   case TargetOpcode::DBG_VALUE:
7176   case TargetOpcode::EH_LABEL:
7177     return 0;
7178   case TargetOpcode::BUNDLE:
7179     return getInstBundleSize(MI);
7180   case TargetOpcode::INLINEASM:
7181   case TargetOpcode::INLINEASM_BR: {
7182     const MachineFunction *MF = MI.getParent()->getParent();
7183     const char *AsmStr = MI.getOperand(0).getSymbolName();
7184     return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST);
7185   }
7186   default:
7187     return DescSize;
7188   }
7189 }
7190 
7191 bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {
7192   if (!isFLAT(MI))
7193     return false;
7194 
7195   if (MI.memoperands_empty())
7196     return true;
7197 
7198   for (const MachineMemOperand *MMO : MI.memoperands()) {
7199     if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
7200       return true;
7201   }
7202   return false;
7203 }
7204 
7205 bool SIInstrInfo::isNonUniformBranchInstr(MachineInstr &Branch) const {
7206   return Branch.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO;
7207 }
7208 
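// Rewrite a non-uniform if-region: replace the SI_NON_UNIFORM_BRCOND_PSEUDO
// terminating the entry block with SI_IF and insert SI_END_CF at the start of
// the merge block to restore the exec mask.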
7209 void SIInstrInfo::convertNonUniformIfRegion(MachineBasicBlock *IfEntry,
7210                                             MachineBasicBlock *IfEnd) const {
7211   MachineBasicBlock::iterator TI = IfEntry->getFirstTerminator();
7212   assert(TI != IfEntry->end());
7213 
7214   MachineInstr *Branch = &(*TI);
7215   MachineFunction *MF = IfEntry->getParent();
7216   MachineRegisterInfo &MRI = IfEntry->getParent()->getRegInfo();
7217 
7218   if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
7219     Register DstReg = MRI.createVirtualRegister(RI.getBoolRC());
7220     MachineInstr *SIIF =
7221         BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg)
7222             .add(Branch->getOperand(0))
7223             .add(Branch->getOperand(1));
7224     MachineInstr *SIEND =
7225         BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_END_CF))
7226             .addReg(DstReg);
7227 
7228     IfEntry->erase(TI);
7229     IfEntry->insert(IfEntry->end(), SIIF);
7230     IfEnd->insert(IfEnd->getFirstNonPHI(), SIEND);
7231   }
7232 }
7233 
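// Rewrite a non-uniform loop: replace the backedge branch with an
// SI_IF_BREAK / SI_LOOP pair and add a header PHI that carries the
// accumulated break mask from one iteration to the next.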
7234 void SIInstrInfo::convertNonUniformLoopRegion(
7235     MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const {
7236   MachineBasicBlock::iterator TI = LoopEnd->getFirstTerminator();
7237   // We expect 2 terminators, one conditional and one unconditional.
7238   assert(TI != LoopEnd->end());
7239 
7240   MachineInstr *Branch = &(*TI);
7241   MachineFunction *MF = LoopEnd->getParent();
7242   MachineRegisterInfo &MRI = LoopEnd->getParent()->getRegInfo();
7243 
7244   if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
7245 
7246     Register DstReg = MRI.createVirtualRegister(RI.getBoolRC());
7247     Register BackEdgeReg = MRI.createVirtualRegister(RI.getBoolRC());
7248     MachineInstrBuilder HeaderPHIBuilder =
7249         BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg);
7250     for (MachineBasicBlock::pred_iterator PI = LoopEntry->pred_begin(),
7251                                           E = LoopEntry->pred_end();
7252          PI != E; ++PI) {
7253       if (*PI == LoopEnd) {
7254         HeaderPHIBuilder.addReg(BackEdgeReg);
7255       } else {
7256         MachineBasicBlock *PMBB = *PI;
7257         Register ZeroReg = MRI.createVirtualRegister(RI.getBoolRC());
7258         materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(),
7259                              ZeroReg, 0);
7260         HeaderPHIBuilder.addReg(ZeroReg);
7261       }
7262       HeaderPHIBuilder.addMBB(*PI);
7263     }
7264     MachineInstr *HeaderPhi = HeaderPHIBuilder;
7265     MachineInstr *SIIFBREAK = BuildMI(*(MF), Branch->getDebugLoc(),
7266                                       get(AMDGPU::SI_IF_BREAK), BackEdgeReg)
7267                                   .addReg(DstReg)
7268                                   .add(Branch->getOperand(0));
7269     MachineInstr *SILOOP =
7270         BuildMI(*(MF), Branch->getDebugLoc(), get(AMDGPU::SI_LOOP))
7271             .addReg(BackEdgeReg)
7272             .addMBB(LoopEntry);
7273 
7274     LoopEntry->insert(LoopEntry->begin(), HeaderPhi);
7275     LoopEnd->erase(TI);
7276     LoopEnd->insert(LoopEnd->end(), SIIFBREAK);
7277     LoopEnd->insert(LoopEnd->end(), SILOOP);
7278   }
7279 }
7280 
7281 ArrayRef<std::pair<int, const char *>>
7282 SIInstrInfo::getSerializableTargetIndices() const {
7283   static const std::pair<int, const char *> TargetIndices[] = {
7284       {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
7285       {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
7286       {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
7287       {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
7288       {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
7289   return makeArrayRef(TargetIndices);
7290 }
7291 
/// This is used by the post-RA scheduler (PostRASchedulerList.cpp). The
/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
7294 ScheduleHazardRecognizer *
7295 SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
7296                                             const ScheduleDAG *DAG) const {
7297   return new GCNHazardRecognizer(DAG->MF);
7298 }
7299 
7300 /// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
7301 /// pass.
7302 ScheduleHazardRecognizer *
7303 SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const {
7304   return new GCNHazardRecognizer(MF);
7305 }
7306 
7307 std::pair<unsigned, unsigned>
7308 SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
7309   return std::make_pair(TF & MO_MASK, TF & ~MO_MASK);
7310 }
7311 
7312 ArrayRef<std::pair<unsigned, const char *>>
7313 SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
7314   static const std::pair<unsigned, const char *> TargetFlags[] = {
7315     { MO_GOTPCREL, "amdgpu-gotprel" },
7316     { MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" },
7317     { MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" },
7318     { MO_REL32_LO, "amdgpu-rel32-lo" },
7319     { MO_REL32_HI, "amdgpu-rel32-hi" },
7320     { MO_ABS32_LO, "amdgpu-abs32-lo" },
7321     { MO_ABS32_HI, "amdgpu-abs32-hi" },
7322   };
7323 
7324   return makeArrayRef(TargetFlags);
7325 }
7326 
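// Non-terminator, non-copy instructions that write the exec mask at the top
// of a block are considered part of the block prologue, so that later
// insertion points land after them.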
7327 bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const {
7328   return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
7329          MI.modifiesRegister(AMDGPU::EXEC, &RI);
7330 }
7331 
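// Start building a 32-bit VALU add that does not need a live carry-out: use
// the carry-less V_ADD_U32 where available, otherwise V_ADD_CO_U32 with a
// scratch register receiving the dead carry.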
7332 MachineInstrBuilder
7333 SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
7334                            MachineBasicBlock::iterator I,
7335                            const DebugLoc &DL,
7336                            Register DestReg) const {
7337   if (ST.hasAddNoCarry())
7338     return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
7339 
7340   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7341   Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
7342   MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC());
7343 
7344   return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
7345            .addReg(UnusedCarry, RegState::Define | RegState::Dead);
7346 }
7347 
7348 MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
7349                                                MachineBasicBlock::iterator I,
7350                                                const DebugLoc &DL,
7351                                                Register DestReg,
7352                                                RegScavenger &RS) const {
7353   if (ST.hasAddNoCarry())
7354     return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg);
7355 
7356   // If available, prefer to use vcc.
7357   Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC)
7358                              ? Register(RI.getVCC())
7359                              : RS.scavengeRegister(RI.getBoolRC(), I, 0, false);
7360 
7361   // TODO: Users need to deal with this.
7362   if (!UnusedCarry.isValid())
7363     return MachineInstrBuilder();
7364 
7365   return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
7366            .addReg(UnusedCarry, RegState::Define | RegState::Dead);
7367 }
7368 
7369 bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
7370   switch (Opcode) {
7371   case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
7372   case AMDGPU::SI_KILL_I1_TERMINATOR:
7373     return true;
7374   default:
7375     return false;
7376   }
7377 }
7378 
7379 const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) const {
7380   switch (Opcode) {
7381   case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
7382     return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
7383   case AMDGPU::SI_KILL_I1_PSEUDO:
7384     return get(AMDGPU::SI_KILL_I1_TERMINATOR);
7385   default:
7386     llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
7387   }
7388 }
7389 
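// On wave32 subtargets the implicit VCC operands attached to pseudo
// instructions still refer to the 64-bit VCC register; rewrite them to
// VCC_LO.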
7390 void SIInstrInfo::fixImplicitOperands(MachineInstr &MI) const {
7391   if (!ST.isWave32())
7392     return;
7393 
7394   for (auto &Op : MI.implicit_operands()) {
7395     if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
7396       Op.setReg(AMDGPU::VCC_LO);
7397   }
7398 }
7399 
7400 bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
7401   if (!isSMRD(MI))
7402     return false;
7403 
7404   // Check that it is using a buffer resource.
7405   int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
7406   if (Idx == -1) // e.g. s_memtime
7407     return false;
7408 
7409   const auto RCID = MI.getDesc().OpInfo[Idx].RegClass;
7410   return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
7411 }
7412 
7413 // Depending on the used address space and instructions, some immediate offsets
7414 // are allowed and some are not.
7415 // In general, flat instruction offsets can only be non-negative, global and
7416 // scratch instruction offsets can also be negative.
7417 //
7418 // There are several bugs related to these offsets:
7419 // On gfx10.1, flat instructions that go into the global address space cannot
7420 // use an offset.
7421 //
7422 // For scratch instructions, the address can be either an SGPR or a VGPR.
7423 // The following offsets can be used, depending on the architecture (x means
7424 // cannot be used):
7425 // +----------------------------+------+------+
7426 // | Address-Mode               | SGPR | VGPR |
7427 // +----------------------------+------+------+
7428 // | gfx9                       |      |      |
7429 // | negative, 4-aligned offset | x    | ok   |
7430 // | negative, unaligned offset | x    | ok   |
7431 // +----------------------------+------+------+
7432 // | gfx10                      |      |      |
7433 // | negative, 4-aligned offset | ok   | ok   |
7434 // | negative, unaligned offset | ok   | x    |
7435 // +----------------------------+------+------+
7436 // | gfx10.3                    |      |      |
7437 // | negative, 4-aligned offset | ok   | ok   |
7438 // | negative, unaligned offset | ok   | ok   |
7439 // +----------------------------+------+------+
7440 //
7441 // This function ignores the addressing mode, so if an offset cannot be used in
7442 // one addressing mode, it is considered illegal.
7443 bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
7444                                     uint64_t FlatVariant) const {
7445   // TODO: Should 0 be special cased?
7446   if (!ST.hasFlatInstOffsets())
7447     return false;
7448 
7449   if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT &&
7450       (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
7451        AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
7452     return false;
7453 
7454   bool Signed = FlatVariant != SIInstrFlags::FLAT;
7455   if (ST.hasNegativeScratchOffsetBug() &&
7456       FlatVariant == SIInstrFlags::FlatScratch)
7457     Signed = false;
7458   if (ST.hasNegativeUnalignedScratchOffsetBug() &&
7459       FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
7460       (Offset % 4) != 0) {
7461     return false;
7462   }
7463 
7464   unsigned N = AMDGPU::getNumFlatOffsetBits(ST, Signed);
7465   return Signed ? isIntN(N, Offset) : isUIntN(N, Offset);
7466 }
7467 
7468 // See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what not.
7469 std::pair<int64_t, int64_t>
7470 SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
7471                              uint64_t FlatVariant) const {
7472   int64_t RemainderOffset = COffsetVal;
7473   int64_t ImmField = 0;
7474   bool Signed = FlatVariant != SIInstrFlags::FLAT;
7475   if (ST.hasNegativeScratchOffsetBug() &&
7476       FlatVariant == SIInstrFlags::FlatScratch)
7477     Signed = false;
7478 
7479   const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST, Signed);
7480   if (Signed) {
7481     // Use signed division by a power of two to truncate towards 0.
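    // For example, with a hypothetical 13-bit signed field an offset of
    // 0x3fff splits into RemainderOffset = 0x3000 and ImmField = 0xfff.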
7482     int64_t D = 1LL << (NumBits - 1);
7483     RemainderOffset = (COffsetVal / D) * D;
7484     ImmField = COffsetVal - RemainderOffset;
7485 
7486     if (ST.hasNegativeUnalignedScratchOffsetBug() &&
7487         FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 &&
7488         (ImmField % 4) != 0) {
7489       // Make ImmField a multiple of 4
7490       RemainderOffset += ImmField % 4;
7491       ImmField -= ImmField % 4;
7492     }
7493   } else if (COffsetVal >= 0) {
7494     ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits);
7495     RemainderOffset = COffsetVal - ImmField;
7496   }
7497 
7498   assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant));
7499   assert(RemainderOffset + ImmField == COffsetVal);
7500   return {ImmField, RemainderOffset};
7501 }
7502 
7503 // This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td
7504 enum SIEncodingFamily {
7505   SI = 0,
7506   VI = 1,
7507   SDWA = 2,
7508   SDWA9 = 3,
7509   GFX80 = 4,
7510   GFX9 = 5,
7511   GFX10 = 6,
7512   SDWA10 = 7,
7513   GFX90A = 8
7514 };
7515 
7516 static SIEncodingFamily subtargetEncodingFamily(const GCNSubtarget &ST) {
7517   switch (ST.getGeneration()) {
7518   default:
7519     break;
7520   case AMDGPUSubtarget::SOUTHERN_ISLANDS:
7521   case AMDGPUSubtarget::SEA_ISLANDS:
7522     return SIEncodingFamily::SI;
7523   case AMDGPUSubtarget::VOLCANIC_ISLANDS:
7524   case AMDGPUSubtarget::GFX9:
7525     return SIEncodingFamily::VI;
7526   case AMDGPUSubtarget::GFX10:
7527     return SIEncodingFamily::GFX10;
7528   }
7529   llvm_unreachable("Unknown subtarget generation!");
7530 }
7531 
7532 bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
7533   switch(MCOp) {
  // These opcodes use indirect register addressing, so they need special
  // handling by codegen (currently missing). Therefore it is too risky to
  // allow these opcodes to be selected by the DPP combiner or the SDWA
  // peephole pass.
7538   case AMDGPU::V_MOVRELS_B32_dpp_gfx10:
7539   case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:
7540   case AMDGPU::V_MOVRELD_B32_dpp_gfx10:
7541   case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:
7542   case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:
7543   case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:
7544   case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:
7545   case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:
7546     return true;
7547   default:
7548     return false;
7549   }
7550 }
7551 
7552 int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
7553   SIEncodingFamily Gen = subtargetEncodingFamily(ST);
7554 
  if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 &&
      ST.getGeneration() == AMDGPUSubtarget::GFX9)
7557     Gen = SIEncodingFamily::GFX9;
7558 
7559   // Adjust the encoding family to GFX80 for D16 buffer instructions when the
7560   // subtarget has UnpackedD16VMem feature.
7561   // TODO: remove this when we discard GFX80 encoding.
7562   if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
7563     Gen = SIEncodingFamily::GFX80;
7564 
7565   if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {
7566     switch (ST.getGeneration()) {
7567     default:
7568       Gen = SIEncodingFamily::SDWA;
7569       break;
7570     case AMDGPUSubtarget::GFX9:
7571       Gen = SIEncodingFamily::SDWA9;
7572       break;
7573     case AMDGPUSubtarget::GFX10:
7574       Gen = SIEncodingFamily::SDWA10;
7575       break;
7576     }
7577   }
7578 
7579   int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
7580 
7581   // -1 means that Opcode is already a native instruction.
7582   if (MCOp == -1)
7583     return Opcode;
7584 
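  // On GFX90A, prefer the GFX90A-specific encoding when one exists and
  // otherwise fall back to the plain GFX9 encoding.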
7585   if (ST.hasGFX90AInsts()) {
    uint16_t NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX90A);
7588     if (NMCOp == (uint16_t)-1)
7589       NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX9);
7590     if (NMCOp != (uint16_t)-1)
7591       MCOp = NMCOp;
7592   }
7593 
7594   // (uint16_t)-1 means that Opcode is a pseudo instruction that has
7595   // no encoding in the given subtarget generation.
7596   if (MCOp == (uint16_t)-1)
7597     return -1;
7598 
7599   if (isAsmOnlyOpcode(MCOp))
7600     return -1;
7601 
7602   return MCOp;
7603 }
7604 
7605 static
7606 TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd) {
7607   assert(RegOpnd.isReg());
7608   return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
7609                              getRegSubRegPair(RegOpnd);
7610 }
7611 
7612 TargetInstrInfo::RegSubRegPair
7613 llvm::getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg) {
7614   assert(MI.isRegSequence());
7615   for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
7616     if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
7617       auto &RegOp = MI.getOperand(1 + 2 * I);
7618       return getRegOrUndef(RegOp);
7619     }
7620   return TargetInstrInfo::RegSubRegPair();
7621 }
7622 
// Try to find the definition of reg:subreg in subreg-manipulation pseudos.
// Following a subreg of reg:subreg isn't supported.
7625 static bool followSubRegDef(MachineInstr &MI,
7626                             TargetInstrInfo::RegSubRegPair &RSR) {
7627   if (!RSR.SubReg)
7628     return false;
7629   switch (MI.getOpcode()) {
7630   default: break;
7631   case AMDGPU::REG_SEQUENCE:
7632     RSR = getRegSequenceSubReg(MI, RSR.SubReg);
7633     return true;
  // EXTRACT_SUBREG isn't supported as this would follow a subreg of a subreg.
7635   case AMDGPU::INSERT_SUBREG:
7636     if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
7637       // inserted the subreg we're looking for
7638       RSR = getRegOrUndef(MI.getOperand(2));
7639     else { // the subreg in the rest of the reg
7640       auto R1 = getRegOrUndef(MI.getOperand(1));
7641       if (R1.SubReg) // subreg of subreg isn't supported
7642         return false;
7643       RSR.Reg = R1.Reg;
7644     }
7645     return true;
7646   }
7647   return false;
7648 }
7649 
7650 MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
7651                                      MachineRegisterInfo &MRI) {
7652   assert(MRI.isSSA());
7653   if (!P.Reg.isVirtual())
7654     return nullptr;
7655 
7656   auto RSR = P;
7657   auto *DefInst = MRI.getVRegDef(RSR.Reg);
7658   while (auto *MI = DefInst) {
7659     DefInst = nullptr;
7660     switch (MI->getOpcode()) {
7661     case AMDGPU::COPY:
7662     case AMDGPU::V_MOV_B32_e32: {
7663       auto &Op1 = MI->getOperand(1);
7664       if (Op1.isReg() && Op1.getReg().isVirtual()) {
7665         if (Op1.isUndef())
7666           return nullptr;
7667         RSR = getRegSubRegPair(Op1);
7668         DefInst = MRI.getVRegDef(RSR.Reg);
7669       }
7670       break;
7671     }
7672     default:
7673       if (followSubRegDef(*MI, RSR)) {
7674         if (!RSR.Reg)
7675           return nullptr;
7676         DefInst = MRI.getVRegDef(RSR.Reg);
7677       }
7678     }
7679     if (!DefInst)
7680       return MI;
7681   }
7682   return nullptr;
7683 }
7684 
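// Conservatively assume EXEC may be modified unless DefMI and UseMI are in
// the same block and no instruction among the (at most 20) non-debug
// instructions between them writes EXEC.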
7685 bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,
7686                                       Register VReg,
7687                                       const MachineInstr &DefMI,
7688                                       const MachineInstr &UseMI) {
7689   assert(MRI.isSSA() && "Must be run on SSA");
7690 
7691   auto *TRI = MRI.getTargetRegisterInfo();
7692   auto *DefBB = DefMI.getParent();
7693 
7694   // Don't bother searching between blocks, although it is possible this block
7695   // doesn't modify exec.
7696   if (UseMI.getParent() != DefBB)
7697     return true;
7698 
7699   const int MaxInstScan = 20;
7700   int NumInst = 0;
7701 
7702   // Stop scan at the use.
7703   auto E = UseMI.getIterator();
7704   for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
7705     if (I->isDebugInstr())
7706       continue;
7707 
7708     if (++NumInst > MaxInstScan)
7709       return true;
7710 
7711     if (I->modifiesRegister(AMDGPU::EXEC, TRI))
7712       return true;
7713   }
7714 
7715   return false;
7716 }
7717 
7718 bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
7719                                          Register VReg,
7720                                          const MachineInstr &DefMI) {
7721   assert(MRI.isSSA() && "Must be run on SSA");
7722 
7723   auto *TRI = MRI.getTargetRegisterInfo();
7724   auto *DefBB = DefMI.getParent();
7725 
7726   const int MaxUseScan = 10;
7727   int NumUse = 0;
7728 
7729   for (auto &Use : MRI.use_nodbg_operands(VReg)) {
7730     auto &UseInst = *Use.getParent();
7731     // Don't bother searching between blocks, although it is possible this block
7732     // doesn't modify exec.
7733     if (UseInst.getParent() != DefBB)
7734       return true;
7735 
7736     if (++NumUse > MaxUseScan)
7737       return true;
7738   }
7739 
7740   if (NumUse == 0)
7741     return false;
7742 
7743   const int MaxInstScan = 20;
7744   int NumInst = 0;
7745 
7746   // Stop scan when we have seen all the uses.
7747   for (auto I = std::next(DefMI.getIterator()); ; ++I) {
7748     assert(I != DefBB->end());
7749 
7750     if (I->isDebugInstr())
7751       continue;
7752 
7753     if (++NumInst > MaxInstScan)
7754       return true;
7755 
7756     for (const MachineOperand &Op : I->operands()) {
7757       // We don't check reg masks here as they're used only on calls:
7758       // 1. EXEC is only considered const within one BB
7759       // 2. Call should be a terminator instruction if present in a BB
7760 
7761       if (!Op.isReg())
7762         continue;
7763 
7764       Register Reg = Op.getReg();
7765       if (Op.isUse()) {
7766         if (Reg == VReg && --NumUse == 0)
7767           return false;
7768       } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC))
7769         return true;
7770     }
7771   }
7772 }
7773 
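// If some non-PHI instruction before LastPHIIt already reads Dst, place the
// lowered PHI copy right before that reader; otherwise fall back to the
// generic placement.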
7774 MachineInstr *SIInstrInfo::createPHIDestinationCopy(
7775     MachineBasicBlock &MBB, MachineBasicBlock::iterator LastPHIIt,
7776     const DebugLoc &DL, Register Src, Register Dst) const {
7777   auto Cur = MBB.begin();
7778   if (Cur != MBB.end())
7779     do {
7780       if (!Cur->isPHI() && Cur->readsRegister(Dst))
7781         return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src);
7782       ++Cur;
7783     } while (Cur != MBB.end() && Cur != LastPHIIt);
7784 
7785   return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src,
7786                                                    Dst);
7787 }
7788 
7789 MachineInstr *SIInstrInfo::createPHISourceCopy(
7790     MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt,
7791     const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {
7792   if (InsPt != MBB.end() &&
7793       (InsPt->getOpcode() == AMDGPU::SI_IF ||
7794        InsPt->getOpcode() == AMDGPU::SI_ELSE ||
7795        InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
7796       InsPt->definesRegister(Src)) {
7797     InsPt++;
7798     return BuildMI(MBB, InsPt, DL,
7799                    get(ST.isWave32() ? AMDGPU::S_MOV_B32_term
7800                                      : AMDGPU::S_MOV_B64_term),
7801                    Dst)
7802         .addReg(Src, 0, SrcSubReg)
7803         .addReg(AMDGPU::EXEC, RegState::Implicit);
7804   }
7805   return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
7806                                               Dst);
7807 }
7808 
7809 bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
7810 
7811 MachineInstr *SIInstrInfo::foldMemoryOperandImpl(
7812     MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
7813     MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
7814     VirtRegMap *VRM) const {
7815   // This is a bit of a hack (copied from AArch64). Consider this instruction:
7816   //
7817   //   %0:sreg_32 = COPY $m0
7818   //
7819   // We explicitly chose SReg_32 for the virtual register so such a copy might
7820   // be eliminated by RegisterCoalescer. However, that may not be possible, and
7821   // %0 may even spill. We can't spill $m0 normally (it would require copying to
7822   // a numbered SGPR anyway), and since it is in the SReg_32 register class,
7823   // TargetInstrInfo::foldMemoryOperand() is going to try.
7824   // A similar issue also exists with spilling and reloading $exec registers.
7825   //
7826   // To prevent that, constrain the %0 register class here.
7827   if (MI.isFullCopy()) {
7828     Register DstReg = MI.getOperand(0).getReg();
7829     Register SrcReg = MI.getOperand(1).getReg();
7830     if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
7831         (DstReg.isVirtual() != SrcReg.isVirtual())) {
7832       MachineRegisterInfo &MRI = MF.getRegInfo();
7833       Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
7834       const TargetRegisterClass *RC = MRI.getRegClass(VirtReg);
7835       if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) {
7836         MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
7837         return nullptr;
7838       } else if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) {
7839         MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass);
7840         return nullptr;
7841       }
7842     }
7843   }
7844 
7845   return nullptr;
7846 }
7847 
7848 unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
7849                                       const MachineInstr &MI,
7850                                       unsigned *PredCost) const {
7851   if (MI.isBundle()) {
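    // Approximate the bundle's latency as the longest latency of its members
    // plus one extra cycle for each additional instruction issued.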
7852     MachineBasicBlock::const_instr_iterator I(MI.getIterator());
7853     MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
7854     unsigned Lat = 0, Count = 0;
7855     for (++I; I != E && I->isBundledWithPred(); ++I) {
7856       ++Count;
7857       Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I));
7858     }
7859     return Lat + Count - 1;
7860   }
7861 
7862   return SchedModel.computeInstrLatency(&MI);
7863 }
7864 
7865 unsigned SIInstrInfo::getDSShaderTypeValue(const MachineFunction &MF) {
7866   switch (MF.getFunction().getCallingConv()) {
7867   case CallingConv::AMDGPU_PS:
7868     return 1;
7869   case CallingConv::AMDGPU_VS:
7870     return 2;
7871   case CallingConv::AMDGPU_GS:
7872     return 3;
7873   case CallingConv::AMDGPU_HS:
7874   case CallingConv::AMDGPU_LS:
7875   case CallingConv::AMDGPU_ES:
7876     report_fatal_error("ds_ordered_count unsupported for this calling conv");
7877   case CallingConv::AMDGPU_CS:
7878   case CallingConv::AMDGPU_KERNEL:
7879   case CallingConv::C:
7880   case CallingConv::Fast:
7881   default:
7882     // Assume other calling conventions are various compute callable functions
7883     return 0;
7884   }
7885 }
7886