1 //===- SIInstrInfo.cpp - SI Instruction Information  ----------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// SI Implementation of TargetInstrInfo.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "SIInstrInfo.h"
15 #include "AMDGPU.h"
16 #include "AMDGPUInstrInfo.h"
17 #include "GCNHazardRecognizer.h"
18 #include "GCNSubtarget.h"
19 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "llvm/Analysis/ValueTracking.h"
22 #include "llvm/CodeGen/LiveVariables.h"
23 #include "llvm/CodeGen/MachineDominators.h"
24 #include "llvm/CodeGen/RegisterScavenging.h"
25 #include "llvm/CodeGen/ScheduleDAG.h"
26 #include "llvm/IR/DiagnosticInfo.h"
27 #include "llvm/IR/IntrinsicsAMDGPU.h"
28 #include "llvm/Support/CommandLine.h"
29 #include "llvm/Target/TargetMachine.h"
30 
31 using namespace llvm;
32 
33 #define DEBUG_TYPE "si-instr-info"
34 
35 #define GET_INSTRINFO_CTOR_DTOR
36 #include "AMDGPUGenInstrInfo.inc"
37 
38 namespace llvm {
39 
40 class AAResults;
41 
42 namespace AMDGPU {
43 #define GET_D16ImageDimIntrinsics_IMPL
44 #define GET_ImageDimIntrinsicTable_IMPL
45 #define GET_RsrcIntrinsics_IMPL
46 #include "AMDGPUGenSearchableTables.inc"
47 }
48 }
49 
50 
// Must be at least 4 to be able to branch over minimum unconditional branch
// code. This is only for making it possible to write reasonably small tests for
// long branches.
static cl::opt<unsigned>
BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
                 cl::desc("Restrict range of branch instructions (DEBUG)"));

// Workaround for mixed 16-bit/32-bit physreg copies: widen the 16-bit side to
// its 32-bit super-register (see the FIXME in copyPhysReg) until all 16-bit
// copy patterns are handled natively.
static cl::opt<bool> Fix16BitCopies(
  "amdgpu-fix-16-bit-physreg-copies",
  cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
  cl::init(true),
  cl::ReallyHidden);
63 
SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
  : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
    RI(ST), ST(ST) {
  // Initialize the scheduling model from the subtarget so latency/itinerary
  // queries are available to this TargetInstrInfo.
  SchedModel.init(&ST);
}
69 
70 //===----------------------------------------------------------------------===//
71 // TargetInstrInfo callbacks
72 //===----------------------------------------------------------------------===//
73 
74 static unsigned getNumOperandsNoGlue(SDNode *Node) {
75   unsigned N = Node->getNumOperands();
76   while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
77     --N;
78   return N;
79 }
80 
81 /// Returns true if both nodes have the same value for the given
82 ///        operand \p Op, or if both nodes do not have this operand.
83 static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
84   unsigned Opc0 = N0->getMachineOpcode();
85   unsigned Opc1 = N1->getMachineOpcode();
86 
87   int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
88   int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
89 
90   if (Op0Idx == -1 && Op1Idx == -1)
91     return true;
92 
93 
94   if ((Op0Idx == -1 && Op1Idx != -1) ||
95       (Op1Idx == -1 && Op0Idx != -1))
96     return false;
97 
98   // getNamedOperandIdx returns the index for the MachineInstr's operands,
99   // which includes the result as the first operand. We are indexing into the
100   // MachineSDNode's operands, so we need to skip the result operand to get
101   // the real index.
102   --Op0Idx;
103   --Op1Idx;
104 
105   return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
106 }
107 
/// A VOP1/VOP3/SDWA instruction is treated as trivially rematerializable
/// provided it has no implicit defs and its implicit operands are exactly the
/// implicit uses declared in its MCInstrDesc (e.g. the exec read common to
/// every VALU instruction).
bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
                                                    AAResults *AA) const {
  if (isVOP1(MI) || isVOP3(MI) || isSDWA(MI)) {
    // Normally VALU use of exec would block the rematerialization, but that
    // is OK in this case to have an implicit exec read as all VALU do.
    // We really want all of the generic logic for this except for this.

    // Another potential implicit use is mode register. The core logic of
    // the RA will not attempt rematerialization if mode is set anywhere
    // in the function, otherwise it is safe since mode is not changed.

    // No extra implicit operands beyond the declared implicit uses, and no
    // implicit defs at all.
    return !MI.hasImplicitDef() &&
           MI.getNumImplicitOperands() == MI.getDesc().getNumImplicitUses();
  }

  return false;
}
124 
125 bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
126   // Any implicit use of exec by VALU is not a real register read.
127   return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
128          isVALU(*MO.getParent());
129 }
130 
/// Determine whether two machine-opcode load nodes read relative to the same
/// base pointer, and if so report each load's constant offset operand value
/// in \p Offset0 / \p Offset1. Only DS, SMRD, and MUBUF/MTBUF encodings are
/// handled; everything else conservatively returns false.
bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
                                          int64_t &Offset0,
                                          int64_t &Offset1) const {
  if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
    return false;

  unsigned Opc0 = Load0->getMachineOpcode();
  unsigned Opc1 = Load1->getMachineOpcode();

  // Make sure both are actually loads.
  if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
    return false;

  if (isDS(Opc0) && isDS(Opc1)) {

    // FIXME: Handle this case:
    if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
      return false;

    // Check base reg.
    if (Load0->getOperand(0) != Load1->getOperand(0))
      return false;

    // Skip read2 / write2 variants for simplicity.
    // TODO: We should report true if the used offsets are adjacent (excluded
    // st64 versions).
    int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
    int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
    if (Offset0Idx == -1 || Offset1Idx == -1)
      return false;

    // XXX - be careful of dataless loads
    // getNamedOperandIdx returns the index for MachineInstrs.  Since they
    // include the output in the operand list, but SDNodes don't, we need to
    // subtract the index by one.
    Offset0Idx -= get(Opc0).NumDefs;
    Offset1Idx -= get(Opc1).NumDefs;
    Offset0 = cast<ConstantSDNode>(Load0->getOperand(Offset0Idx))->getZExtValue();
    Offset1 = cast<ConstantSDNode>(Load1->getOperand(Offset1Idx))->getZExtValue();
    return true;
  }

  if (isSMRD(Opc0) && isSMRD(Opc1)) {
    // Skip time and cache invalidation instructions.
    if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::sbase) == -1 ||
        AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::sbase) == -1)
      return false;

    assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1));

    // Check base reg.
    if (Load0->getOperand(0) != Load1->getOperand(0))
      return false;

    // The offsets must both be constants to be comparable here; dyn_cast
    // filters out register or frame-index offsets.
    const ConstantSDNode *Load0Offset =
        dyn_cast<ConstantSDNode>(Load0->getOperand(1));
    const ConstantSDNode *Load1Offset =
        dyn_cast<ConstantSDNode>(Load1->getOperand(1));

    if (!Load0Offset || !Load1Offset)
      return false;

    Offset0 = Load0Offset->getZExtValue();
    Offset1 = Load1Offset->getZExtValue();
    return true;
  }

  // MUBUF and MTBUF can access the same addresses.
  if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {

    // MUBUF and MTBUF have vaddr at different indices.
    if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
        !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
      return false;

    int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
    int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);

    if (OffIdx0 == -1 || OffIdx1 == -1)
      return false;

    // getNamedOperandIdx returns the index for MachineInstrs.  Since they
    // include the output in the operand list, but SDNodes don't, we need to
    // subtract the index by one.
    OffIdx0 -= get(Opc0).NumDefs;
    OffIdx1 -= get(Opc1).NumDefs;

    SDValue Off0 = Load0->getOperand(OffIdx0);
    SDValue Off1 = Load1->getOperand(OffIdx1);

    // The offset might be a FrameIndexSDNode.
    if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
      return false;

    Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue();
    Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue();
    return true;
  }

  return false;
}
233 
234 static bool isStride64(unsigned Opc) {
235   switch (Opc) {
236   case AMDGPU::DS_READ2ST64_B32:
237   case AMDGPU::DS_READ2ST64_B64:
238   case AMDGPU::DS_WRITE2ST64_B32:
239   case AMDGPU::DS_WRITE2ST64_B64:
240     return true;
241   default:
242     return false;
243   }
244 }
245 
/// Decompose the memory access \p LdSt into the operands that form its base
/// address (\p BaseOps), its immediate offset (\p Offset), and its access
/// width (\p Width, derived from the size of the data operand's register
/// class). Handles DS, MUBUF/MTBUF, MIMG, SMRD, and FLAT instructions;
/// \returns false for anything else or when no base operand can be found.
bool SIInstrInfo::getMemOperandsWithOffsetWidth(
    const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
    int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
    const TargetRegisterInfo *TRI) const {
  if (!LdSt.mayLoadOrStore())
    return false;

  unsigned Opc = LdSt.getOpcode();
  // No AMDGPU addressing mode scales the offset by a runtime value.
  OffsetIsScalable = false;
  const MachineOperand *BaseOp, *OffsetOp;
  int DataOpIdx;

  if (isDS(LdSt)) {
    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
    OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
    if (OffsetOp) {
      // Normal, single offset LDS instruction.
      if (!BaseOp) {
        // DS_CONSUME/DS_APPEND use M0 for the base address.
        // TODO: find the implicit use operand for M0 and use that as BaseOp?
        return false;
      }
      BaseOps.push_back(BaseOp);
      Offset = OffsetOp->getImm();
      // Get appropriate operand, and compute width accordingly.
      // Loads define vdst; stores carry their value in data0.
      DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
      if (DataOpIdx == -1)
        DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
      Width = getOpSize(LdSt, DataOpIdx);
    } else {
      // The 2 offset instructions use offset0 and offset1 instead. We can treat
      // these as a load with a single offset if the 2 offsets are consecutive.
      // We will use this for some partially aligned loads.
      // NOTE(review): offset0/offset1 are dereferenced without null checks --
      // assumes every dual-offset DS instruction carries both operands.
      const MachineOperand *Offset0Op =
          getNamedOperand(LdSt, AMDGPU::OpName::offset0);
      const MachineOperand *Offset1Op =
          getNamedOperand(LdSt, AMDGPU::OpName::offset1);

      unsigned Offset0 = Offset0Op->getImm();
      unsigned Offset1 = Offset1Op->getImm();
      if (Offset0 + 1 != Offset1)
        return false;

      // Each of these offsets is in element sized units, so we need to convert
      // to bytes of the individual reads.

      unsigned EltSize;
      if (LdSt.mayLoad())
        EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
      else {
        assert(LdSt.mayStore());
        int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
        EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
      }

      // st64 variants scale the element offsets by a 64-element stride.
      if (isStride64(Opc))
        EltSize *= 64;

      BaseOps.push_back(BaseOp);
      Offset = EltSize * Offset0;
      // Get appropriate operand(s), and compute width accordingly.
      // For read2/write2 stores the total width spans both data operands.
      DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
      if (DataOpIdx == -1) {
        DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
        Width = getOpSize(LdSt, DataOpIdx);
        DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
        Width += getOpSize(LdSt, DataOpIdx);
      } else {
        Width = getOpSize(LdSt, DataOpIdx);
      }
    }
    return true;
  }

  if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
    const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
    if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
      return false;
    BaseOps.push_back(RSrc);
    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
    // A frame-index vaddr is not usable as a base operand here.
    if (BaseOp && !BaseOp->isFI())
      BaseOps.push_back(BaseOp);
    const MachineOperand *OffsetImm =
        getNamedOperand(LdSt, AMDGPU::OpName::offset);
    Offset = OffsetImm->getImm();
    const MachineOperand *SOffset =
        getNamedOperand(LdSt, AMDGPU::OpName::soffset);
    if (SOffset) {
      // A register soffset is part of the base; an immediate one folds into
      // the offset.
      if (SOffset->isReg())
        BaseOps.push_back(SOffset);
      else
        Offset += SOffset->getImm();
    }
    // Get appropriate operand, and compute width accordingly.
    DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
    if (DataOpIdx == -1)
      DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
    Width = getOpSize(LdSt, DataOpIdx);
    return true;
  }

  if (isMIMG(LdSt)) {
    int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
    BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
    int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
    if (VAddr0Idx >= 0) {
      // GFX10 possible NSA encoding.
      // All operands from vaddr0 up to (not including) srsrc are addresses.
      for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
        BaseOps.push_back(&LdSt.getOperand(I));
    } else {
      BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
    }
    // MIMG instructions have no immediate offset field.
    Offset = 0;
    // Get appropriate operand, and compute width accordingly.
    DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
    Width = getOpSize(LdSt, DataOpIdx);
    return true;
  }

  if (isSMRD(LdSt)) {
    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
    if (!BaseOp) // e.g. S_MEMTIME
      return false;
    BaseOps.push_back(BaseOp);
    OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
    Offset = OffsetOp ? OffsetOp->getImm() : 0;
    // Get appropriate operand, and compute width accordingly.
    DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
    Width = getOpSize(LdSt, DataOpIdx);
    return true;
  }

  if (isFLAT(LdSt)) {
    // Instructions have either vaddr or saddr or both or none.
    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
    if (BaseOp)
      BaseOps.push_back(BaseOp);
    BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
    if (BaseOp)
      BaseOps.push_back(BaseOp);
    Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
    // Get appropriate operand, and compute width accordingly.
    DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
    if (DataOpIdx == -1)
      DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
    Width = getOpSize(LdSt, DataOpIdx);
    return true;
  }

  return false;
}
397 
398 static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
399                                   ArrayRef<const MachineOperand *> BaseOps1,
400                                   const MachineInstr &MI2,
401                                   ArrayRef<const MachineOperand *> BaseOps2) {
402   // Only examine the first "base" operand of each instruction, on the
403   // assumption that it represents the real base address of the memory access.
404   // Other operands are typically offsets or indices from this base address.
405   if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
406     return true;
407 
408   if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
409     return false;
410 
411   auto MO1 = *MI1.memoperands_begin();
412   auto MO2 = *MI2.memoperands_begin();
413   if (MO1->getAddrSpace() != MO2->getAddrSpace())
414     return false;
415 
416   auto Base1 = MO1->getValue();
417   auto Base2 = MO2->getValue();
418   if (!Base1 || !Base2)
419     return false;
420   Base1 = getUnderlyingObject(Base1);
421   Base2 = getUnderlyingObject(Base2);
422 
423   if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
424     return false;
425 
426   return Base1 == Base2;
427 }
428 
/// Scheduler callback: decide whether \p NumLoads memory operations totalling
/// \p NumBytes should be clustered together. Requires matching base pointers
/// and caps the total clustered width at 8 DWORDs.
bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
                                      ArrayRef<const MachineOperand *> BaseOps2,
                                      unsigned NumLoads,
                                      unsigned NumBytes) const {
  // If the mem ops (to be clustered) do not have the same base ptr, then they
  // should not be clustered
  if (!BaseOps1.empty() && !BaseOps2.empty()) {
    const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
    const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
    if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
      return false;
  } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
    // If only one base op is empty, they do not have the same base ptr
    return false;
  }

  // In order to avoid register pressure, on average, the number of DWORDS
  // loaded together by all clustered mem ops should not exceed 8. This is an
  // empirical value based on certain observations and performance related
  // experiments.
  // The good thing about this heuristic is - it avoids clustering of too many
  // sub-word loads, and also avoids clustering of wide loads. Below is the
  // brief summary of how the heuristic behaves for various `LoadSize`.
  // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
  // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
  // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
  // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
  // (5) LoadSize >= 17: do not cluster
  const unsigned LoadSize = NumBytes / NumLoads;
  const unsigned NumDWORDs = ((LoadSize + 3) / 4) * NumLoads;
  return NumDWORDs <= 8;
}
461 
462 // FIXME: This behaves strangely. If, for example, you have 32 load + stores,
463 // the first 16 loads will be interleaved with the stores, and the next 16 will
464 // be clustered as expected. It should really split into 2 16 store batches.
465 //
466 // Loads are clustered until this returns false, rather than trying to schedule
467 // groups of stores. This also means we have to deal with saying different
468 // address space loads should be clustered, and ones which might cause bank
469 // conflicts.
470 //
471 // This might be deprecated so it might not be worth that much effort to fix.
472 bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
473                                           int64_t Offset0, int64_t Offset1,
474                                           unsigned NumLoads) const {
475   assert(Offset1 > Offset0 &&
476          "Second offset should be larger than first offset!");
477   // If we have less than 16 loads in a row, and the offsets are within 64
478   // bytes, then schedule together.
479 
480   // A cacheline is 64 bytes (for global memory).
481   return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
482 }
483 
/// Report an unrepresentable register copy (by default an SGPR-to-VGPR copy)
/// as a frontend-visible error through the LLVMContext diagnostic handler,
/// and emit a SI_ILLEGAL_COPY placeholder so codegen can keep going after
/// the diagnostic.
static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator MI,
                              const DebugLoc &DL, MCRegister DestReg,
                              MCRegister SrcReg, bool KillSrc,
                              const char *Msg = "illegal SGPR to VGPR copy") {
  MachineFunction *MF = MBB.getParent();
  DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(), Msg, DL, DS_Error);
  LLVMContext &C = MF->getFunction().getContext();
  C.diagnose(IllegalCopy);

  // Placeholder instruction that stands in for the impossible copy.
  BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
    .addReg(SrcReg, getKillRegState(KillSrc));
}
497 
/// Handle copying from SGPR to AGPR, or from AGPR to AGPR. It is not possible
/// to directly copy, so an intermediate VGPR needs to be used.
///
/// First tries to fold the source's defining V_ACCVGPR_WRITE operand directly
/// into a new V_ACCVGPR_WRITE; otherwise scavenges a temporary VGPR and emits
/// a two-instruction copy through it. \p ImpDefSuperReg / \p ImpUseSuperReg
/// carry implicit super-register operands when this copy is one piece of a
/// wider multi-register copy.
static void indirectCopyToAGPR(const SIInstrInfo &TII,
                               MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MI,
                               const DebugLoc &DL, MCRegister DestReg,
                               MCRegister SrcReg, bool KillSrc,
                               RegScavenger &RS,
                               Register ImpDefSuperReg = Register(),
                               Register ImpUseSuperReg = Register()) {
  const SIRegisterInfo &RI = TII.getRegisterInfo();

  assert(AMDGPU::SReg_32RegClass.contains(SrcReg) ||
         AMDGPU::AGPR_32RegClass.contains(SrcReg));

  // First try to find defining accvgpr_write to avoid temporary registers.
  // Walk backwards from MI to the start of the block.
  for (auto Def = MI, E = MBB.begin(); Def != E; ) {
    --Def;
    if (!Def->definesRegister(SrcReg, &RI))
      continue;
    if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
      break;

    MachineOperand &DefOp = Def->getOperand(1);
    assert(DefOp.isReg() || DefOp.isImm());

    if (DefOp.isReg()) {
      // Check that register source operand if not clobbered before MI.
      // Immediate operands are always safe to propagate.
      bool SafeToPropagate = true;
      for (auto I = Def; I != MI && SafeToPropagate; ++I)
        if (I->modifiesRegister(DefOp.getReg(), &RI))
          SafeToPropagate = false;

      if (!SafeToPropagate)
        break;

      // The def's operand now has a second user; it is no longer killed there.
      DefOp.setIsKill(false);
    }

    MachineInstrBuilder Builder =
      BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
      .add(DefOp);
    if (ImpDefSuperReg)
      Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);

    if (ImpUseSuperReg) {
      Builder.addReg(ImpUseSuperReg,
                     getKillRegState(KillSrc) | RegState::Implicit);
    }

    return;
  }

  // Position the scavenger at MI so its liveness tracking is accurate.
  RS.enterBasicBlock(MBB);
  RS.forward(MI);

  // Ideally we want to have three registers for a long reg_sequence copy
  // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
  unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
                                             *MBB.getParent());

  // Registers in the sequence are allocated contiguously so we can just
  // use register number to pick one of three round-robin temps.
  unsigned RegNo = DestReg % 3;
  Register Tmp = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0);
  if (!Tmp)
    report_fatal_error("Cannot scavenge VGPR to copy to AGPR");
  RS.setRegUsed(Tmp);

  if (!TII.getSubtarget().hasGFX90AInsts()) {
    // Only loop through if there are any free registers left, otherwise
    // scavenger may report a fatal error without emergency spill slot
    // or spill with the slot.
    while (RegNo-- && RS.FindUnusedReg(&AMDGPU::VGPR_32RegClass)) {
      Register Tmp2 = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0);
      if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
        break;
      Tmp = Tmp2;
      RS.setRegUsed(Tmp);
    }
  }

  // Insert copy to temporary VGPR.
  // Reading out of an AGPR requires V_ACCVGPR_READ; an SGPR source can use a
  // plain V_MOV.
  unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
  if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {
    TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
  } else {
    assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
  }

  MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
    .addReg(SrcReg, getKillRegState(KillSrc));
  if (ImpUseSuperReg) {
    UseBuilder.addReg(ImpUseSuperReg,
                      getKillRegState(KillSrc) | RegState::Implicit);
  }

  // Write the temporary into the destination AGPR.
  MachineInstrBuilder DefBuilder
    = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
    .addReg(Tmp, RegState::Kill);

  if (ImpDefSuperReg)
    DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
}
603 
/// Lower a wide SGPR-to-SGPR copy into a sequence of S_MOV_B32 moves over the
/// 32-bit subregisters, combining even-aligned adjacent pairs into S_MOV_B64.
/// \p Forward selects the emission direction (low-to-high vs high-to-low);
/// presumably chosen by the caller based on source/dest overlap -- the caller
/// is outside this view, so confirm there. The first emitted instruction gets
/// an implicit def of the full \p DestReg and, if \p KillSrc, the last one
/// kills \p SrcReg.
static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator MI, const DebugLoc &DL,
                           MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
                           const TargetRegisterClass *RC, bool Forward) {
  const SIRegisterInfo &RI = TII.getRegisterInfo();
  // 32-bit subregister indices covering RC.
  ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
  MachineBasicBlock::iterator I = MI;
  MachineInstr *FirstMI = nullptr, *LastMI = nullptr;

  for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
    int16_t SubIdx = BaseIndices[Idx];
    Register Reg = RI.getSubReg(DestReg, SubIdx);
    unsigned Opcode = AMDGPU::S_MOV_B32;

    // Is SGPR aligned? If so try to combine with next.
    Register Src = RI.getSubReg(SrcReg, SubIdx);
    bool AlignedDest = ((Reg - AMDGPU::SGPR0) % 2) == 0;
    bool AlignedSrc = ((Src - AMDGPU::SGPR0) % 2) == 0;
    if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
      // Can use SGPR64 copy
      unsigned Channel = RI.getChannelFromSubReg(SubIdx);
      SubIdx = RI.getSubRegFromChannel(Channel, 2);
      Opcode = AMDGPU::S_MOV_B64;
      // Skip the next 32-bit piece; it is covered by this 64-bit move.
      Idx++;
    }

    LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), RI.getSubReg(DestReg, SubIdx))
                 .addReg(RI.getSubReg(SrcReg, SubIdx))
                 .addReg(SrcReg, RegState::Implicit);

    if (!FirstMI)
      FirstMI = LastMI;

    // When emitting backwards, step the insertion point onto the move just
    // created so the next move is inserted before it.
    if (!Forward)
      I--;
  }

  assert(FirstMI && LastMI);
  // In reverse emission the first instruction in program order is the one
  // built last.
  if (!Forward)
    std::swap(FirstMI, LastMI);

  // Implicitly define the full destination on the first move so the
  // super-register is considered live from that point.
  FirstMI->addOperand(
      MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));

  if (KillSrc)
    LastMI->addRegisterKilled(SrcReg, &RI);
}
651 
652 void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
653                               MachineBasicBlock::iterator MI,
654                               const DebugLoc &DL, MCRegister DestReg,
655                               MCRegister SrcReg, bool KillSrc) const {
656   const TargetRegisterClass *RC = RI.getPhysRegClass(DestReg);
657 
658   // FIXME: This is hack to resolve copies between 16 bit and 32 bit
659   // registers until all patterns are fixed.
660   if (Fix16BitCopies &&
661       ((RI.getRegSizeInBits(*RC) == 16) ^
662        (RI.getRegSizeInBits(*RI.getPhysRegClass(SrcReg)) == 16))) {
663     MCRegister &RegToFix = (RI.getRegSizeInBits(*RC) == 16) ? DestReg : SrcReg;
664     MCRegister Super = RI.get32BitRegister(RegToFix);
665     assert(RI.getSubReg(Super, AMDGPU::lo16) == RegToFix);
666     RegToFix = Super;
667 
668     if (DestReg == SrcReg) {
669       // Insert empty bundle since ExpandPostRA expects an instruction here.
670       BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
671       return;
672     }
673 
674     RC = RI.getPhysRegClass(DestReg);
675   }
676 
677   if (RC == &AMDGPU::VGPR_32RegClass) {
678     assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
679            AMDGPU::SReg_32RegClass.contains(SrcReg) ||
680            AMDGPU::AGPR_32RegClass.contains(SrcReg));
681     unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
682                      AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
683     BuildMI(MBB, MI, DL, get(Opc), DestReg)
684       .addReg(SrcReg, getKillRegState(KillSrc));
685     return;
686   }
687 
688   if (RC == &AMDGPU::SReg_32_XM0RegClass ||
689       RC == &AMDGPU::SReg_32RegClass) {
690     if (SrcReg == AMDGPU::SCC) {
691       BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
692           .addImm(1)
693           .addImm(0);
694       return;
695     }
696 
697     if (DestReg == AMDGPU::VCC_LO) {
698       if (AMDGPU::SReg_32RegClass.contains(SrcReg)) {
699         BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::VCC_LO)
700           .addReg(SrcReg, getKillRegState(KillSrc));
701       } else {
702         // FIXME: Hack until VReg_1 removed.
703         assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
704         BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
705           .addImm(0)
706           .addReg(SrcReg, getKillRegState(KillSrc));
707       }
708 
709       return;
710     }
711 
712     if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
713       reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
714       return;
715     }
716 
717     BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
718             .addReg(SrcReg, getKillRegState(KillSrc));
719     return;
720   }
721 
722   if (RC == &AMDGPU::SReg_64RegClass) {
723     if (SrcReg == AMDGPU::SCC) {
724       BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
725           .addImm(1)
726           .addImm(0);
727       return;
728     }
729 
730     if (DestReg == AMDGPU::VCC) {
731       if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
732         BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
733           .addReg(SrcReg, getKillRegState(KillSrc));
734       } else {
735         // FIXME: Hack until VReg_1 removed.
736         assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
737         BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
738           .addImm(0)
739           .addReg(SrcReg, getKillRegState(KillSrc));
740       }
741 
742       return;
743     }
744 
745     if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
746       reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
747       return;
748     }
749 
750     BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
751             .addReg(SrcReg, getKillRegState(KillSrc));
752     return;
753   }
754 
755   if (DestReg == AMDGPU::SCC) {
756     // Copying 64-bit or 32-bit sources to SCC barely makes sense,
757     // but SelectionDAG emits such copies for i1 sources.
758     if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
759       // This copy can only be produced by patterns
760       // with explicit SCC, which are known to be enabled
761       // only for subtargets with S_CMP_LG_U64 present.
762       assert(ST.hasScalarCompareEq64());
763       BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))
764           .addReg(SrcReg, getKillRegState(KillSrc))
765           .addImm(0);
766     } else {
767       assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
768       BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
769           .addReg(SrcReg, getKillRegState(KillSrc))
770           .addImm(0);
771     }
772 
773     return;
774   }
775 
776   if (RC == &AMDGPU::AGPR_32RegClass) {
777     if (AMDGPU::VGPR_32RegClass.contains(SrcReg)) {
778       BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
779         .addReg(SrcReg, getKillRegState(KillSrc));
780       return;
781     }
782 
783     if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
784       BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
785         .addReg(SrcReg, getKillRegState(KillSrc));
786       return;
787     }
788 
789     // FIXME: Pass should maintain scavenger to avoid scan through the block on
790     // every AGPR spill.
791     RegScavenger RS;
792     indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS);
793     return;
794   }
795 
796   const unsigned Size = RI.getRegSizeInBits(*RC);
797   if (Size == 16) {
798     assert(AMDGPU::VGPR_LO16RegClass.contains(SrcReg) ||
799            AMDGPU::VGPR_HI16RegClass.contains(SrcReg) ||
800            AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
801            AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
802 
803     bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
804     bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
805     bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
806     bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
807     bool DstLow = AMDGPU::VGPR_LO16RegClass.contains(DestReg) ||
808                   AMDGPU::SReg_LO16RegClass.contains(DestReg) ||
809                   AMDGPU::AGPR_LO16RegClass.contains(DestReg);
810     bool SrcLow = AMDGPU::VGPR_LO16RegClass.contains(SrcReg) ||
811                   AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
812                   AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
813     MCRegister NewDestReg = RI.get32BitRegister(DestReg);
814     MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);
815 
816     if (IsSGPRDst) {
817       if (!IsSGPRSrc) {
818         reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
819         return;
820       }
821 
822       BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
823         .addReg(NewSrcReg, getKillRegState(KillSrc));
824       return;
825     }
826 
827     if (IsAGPRDst || IsAGPRSrc) {
828       if (!DstLow || !SrcLow) {
829         reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
830                           "Cannot use hi16 subreg with an AGPR!");
831       }
832 
833       copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
834       return;
835     }
836 
837     if (IsSGPRSrc && !ST.hasSDWAScalar()) {
838       if (!DstLow || !SrcLow) {
839         reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
840                           "Cannot use hi16 subreg on VI!");
841       }
842 
843       BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
844         .addReg(NewSrcReg, getKillRegState(KillSrc));
845       return;
846     }
847 
848     auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
849       .addImm(0) // src0_modifiers
850       .addReg(NewSrcReg)
851       .addImm(0) // clamp
852       .addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0
853                      : AMDGPU::SDWA::SdwaSel::WORD_1)
854       .addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE)
855       .addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0
856                      : AMDGPU::SDWA::SdwaSel::WORD_1)
857       .addReg(NewDestReg, RegState::Implicit | RegState::Undef);
858     // First implicit operand is $exec.
859     MIB->tieOperands(0, MIB->getNumOperands() - 1);
860     return;
861   }
862 
863   const TargetRegisterClass *SrcRC = RI.getPhysRegClass(SrcReg);
864   if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
865     if (ST.hasPackedFP32Ops()) {
866       BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
867         .addImm(SISrcMods::OP_SEL_1)
868         .addReg(SrcReg)
869         .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
870         .addReg(SrcReg)
871         .addImm(0) // op_sel_lo
872         .addImm(0) // op_sel_hi
873         .addImm(0) // neg_lo
874         .addImm(0) // neg_hi
875         .addImm(0) // clamp
876         .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
877       return;
878     }
879   }
880 
881   const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
882   if (RI.isSGPRClass(RC)) {
883     if (!RI.isSGPRClass(SrcRC)) {
884       reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
885       return;
886     }
887     expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RC, Forward);
888     return;
889   }
890 
891   unsigned EltSize = 4;
892   unsigned Opcode = AMDGPU::V_MOV_B32_e32;
893   if (RI.hasAGPRs(RC)) {
894     Opcode = (RI.hasVGPRs(SrcRC)) ?
895       AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
896   } else if (RI.hasVGPRs(RC) && RI.hasAGPRs(SrcRC)) {
897     Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
898   } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
899              (RI.isProperlyAlignedRC(*RC) &&
900               (SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
901     // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
902     if (ST.hasPackedFP32Ops()) {
903       Opcode = AMDGPU::V_PK_MOV_B32;
904       EltSize = 8;
905     }
906   }
907 
908   // For the cases where we need an intermediate instruction/temporary register
909   // (destination is an AGPR), we need a scavenger.
910   //
911   // FIXME: The pass should maintain this for us so we don't have to re-scan the
912   // whole block for every handled copy.
913   std::unique_ptr<RegScavenger> RS;
914   if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
915     RS.reset(new RegScavenger());
916 
917   ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
918 
919   // If there is an overlap, we can't kill the super-register on the last
920   // instruction, since it will also kill the components made live by this def.
921   const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
922 
923   for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
924     unsigned SubIdx;
925     if (Forward)
926       SubIdx = SubIndices[Idx];
927     else
928       SubIdx = SubIndices[SubIndices.size() - Idx - 1];
929 
930     bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
931 
932     if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
933       Register ImpDefSuper = Idx == 0 ? Register(DestReg) : Register();
934       Register ImpUseSuper = SrcReg;
935       indirectCopyToAGPR(*this, MBB, MI, DL, RI.getSubReg(DestReg, SubIdx),
936                          RI.getSubReg(SrcReg, SubIdx), UseKill, *RS,
937                          ImpDefSuper, ImpUseSuper);
938     } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
939       Register DstSubReg = RI.getSubReg(DestReg, SubIdx);
940       Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
941       MachineInstrBuilder MIB =
942         BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DstSubReg)
943         .addImm(SISrcMods::OP_SEL_1)
944         .addReg(SrcSubReg)
945         .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
946         .addReg(SrcSubReg)
947         .addImm(0) // op_sel_lo
948         .addImm(0) // op_sel_hi
949         .addImm(0) // neg_lo
950         .addImm(0) // neg_hi
951         .addImm(0) // clamp
952         .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
953       if (Idx == 0)
954         MIB.addReg(DestReg, RegState::Define | RegState::Implicit);
955     } else {
956       MachineInstrBuilder Builder =
957         BuildMI(MBB, MI, DL, get(Opcode), RI.getSubReg(DestReg, SubIdx))
958         .addReg(RI.getSubReg(SrcReg, SubIdx));
959       if (Idx == 0)
960         Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
961 
962       Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
963     }
964   }
965 }
966 
967 int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
968   int NewOpc;
969 
970   // Try to map original to commuted opcode
971   NewOpc = AMDGPU::getCommuteRev(Opcode);
972   if (NewOpc != -1)
973     // Check if the commuted (REV) opcode exists on the target.
974     return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
975 
976   // Try to map commuted to original opcode
977   NewOpc = AMDGPU::getCommuteOrig(Opcode);
978   if (NewOpc != -1)
979     // Check if the original (non-REV) opcode exists on the target.
980     return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
981 
982   return Opcode;
983 }
984 
/// Emit instructions that materialize the immediate \p Value into \p DestReg,
/// picking a mov flavor from DestReg's register class.  DestReg is looked up
/// in MRI, so it is expected to be a virtual register.
void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB,
                                       MachineBasicBlock::iterator MI,
                                       const DebugLoc &DL, unsigned DestReg,
                                       int64_t Value) const {
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg);
  // 32-bit scalar classes: one S_MOV_B32.
  if (RegClass == &AMDGPU::SReg_32RegClass ||
      RegClass == &AMDGPU::SGPR_32RegClass ||
      RegClass == &AMDGPU::SReg_32_XM0RegClass ||
      RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) {
    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
      .addImm(Value);
    return;
  }

  // 64-bit scalar classes: one S_MOV_B64.
  if (RegClass == &AMDGPU::SReg_64RegClass ||
      RegClass == &AMDGPU::SGPR_64RegClass ||
      RegClass == &AMDGPU::SReg_64_XEXECRegClass) {
    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
      .addImm(Value);
    return;
  }

  // 32-bit vector register: one V_MOV_B32.
  if (RegClass == &AMDGPU::VGPR_32RegClass) {
    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
      .addImm(Value);
    return;
  }
  // 64-bit vector register: use the pseudo, which is expanded later.
  if (RegClass->hasSuperClassEq(&AMDGPU::VReg_64RegClass)) {
    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg)
      .addImm(Value);
    return;
  }

  // Fallback for wider classes: split into sub-registers and write each part.
  // Default to 32-bit VALU moves; wide SGPR classes use 64-bit scalar moves.
  unsigned EltSize = 4;
  unsigned Opcode = AMDGPU::V_MOV_B32_e32;
  if (RI.isSGPRClass(RegClass)) {
    if (RI.getRegSizeInBits(*RegClass) > 32) {
      Opcode =  AMDGPU::S_MOV_B64;
      EltSize = 8;
    } else {
      Opcode = AMDGPU::S_MOV_B32;
      EltSize = 4;
    }
  }

  // Only the first piece receives Value; the remaining pieces are zeroed.
  // NOTE(review): this writes Value into the lowest split part and 0
  // elsewhere, which only reproduces Value for immediates that fit in the
  // first EltSize bytes — confirm callers rely on that.
  ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize);
  for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
    int64_t IdxValue = Idx == 0 ? Value : 0;

    MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
      get(Opcode), RI.getSubReg(DestReg, SubIndices[Idx]));
    Builder.addImm(IdxValue);
  }
}
1040 
// Selects are lowered with VALU instructions, so always prefer a 32-bit VGPR
// class for the result; the requested Size is ignored.
const TargetRegisterClass *
SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
  return &AMDGPU::VGPR_32RegClass;
}
1045 
/// Emit a vector select of \p TrueReg / \p FalseReg into the VGPR_32 register
/// \p DstReg, controlled by \p Cond.  \p Cond is either a single lane-mask
/// operand, or a (branch-predicate immediate, register) pair; each predicate
/// is lowered to a V_CNDMASK_B32_e64 driven by a freshly created lane mask.
void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator I,
                                     const DebugLoc &DL, Register DstReg,
                                     ArrayRef<MachineOperand> Cond,
                                     Register TrueReg,
                                     Register FalseReg) const {
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  // Wave-size dependent boolean class excluding EXEC, used for the CNDMASK
  // control operand.
  const TargetRegisterClass *BoolXExecRC =
    RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
  assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
         "Not a VGPR32 reg");

  if (Cond.size() == 1) {
    // Cond is already a lane mask; copy it into the right class and select.
    Register SReg = MRI.createVirtualRegister(BoolXExecRC);
    BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
      .add(Cond[0]);
    BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
      .addImm(0)
      .addReg(FalseReg)
      .addImm(0)
      .addReg(TrueReg)
      .addReg(SReg);
  } else if (Cond.size() == 2) {
    assert(Cond[0].isImm() && "Cond[0] is not an immediate");
    switch (Cond[0].getImm()) {
    case SIInstrInfo::SCC_TRUE: {
      // Materialize SCC into a lane mask (-1 if SCC set, else 0) and select.
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
                                            : AMDGPU::S_CSELECT_B64), SReg)
        .addImm(1)
        .addImm(0);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
        .addImm(0)
        .addReg(FalseReg)
        .addImm(0)
        .addReg(TrueReg)
        .addReg(SReg);
      break;
    }
    case SIInstrInfo::SCC_FALSE: {
      // Same as SCC_TRUE but with the cselect operands inverted.
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
                                            : AMDGPU::S_CSELECT_B64), SReg)
        .addImm(0)
        .addImm(1);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
        .addImm(0)
        .addReg(FalseReg)
        .addImm(0)
        .addReg(TrueReg)
        .addReg(SReg);
      break;
    }
    case SIInstrInfo::VCCNZ: {
      // The condition register (Cond[1]) is the mask; clear the implicit flag
      // so it can be used as an explicit COPY source.
      MachineOperand RegOp = Cond[1];
      RegOp.setImplicit(false);
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
        .add(RegOp);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
          .addImm(0)
          .addReg(FalseReg)
          .addImm(0)
          .addReg(TrueReg)
          .addReg(SReg);
      break;
    }
    case SIInstrInfo::VCCZ: {
      // Like VCCNZ, but the "zero" sense is handled by swapping the
      // true/false operands instead of inverting the mask.
      MachineOperand RegOp = Cond[1];
      RegOp.setImplicit(false);
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
        .add(RegOp);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
          .addImm(0)
          .addReg(TrueReg)
          .addImm(0)
          .addReg(FalseReg)
          .addReg(SReg);
      break;
    }
    case SIInstrInfo::EXECNZ: {
      // S_OR_SAVEEXEC with 0 leaves EXEC unchanged (saved copy in SReg2 is
      // unused) — presumably used here for its SCC side effect reflecting
      // whether EXEC is non-zero; then select on SCC as in SCC_TRUE.
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
      BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
                                            : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
        .addImm(0);
      BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
                                            : AMDGPU::S_CSELECT_B64), SReg)
        .addImm(1)
        .addImm(0);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
        .addImm(0)
        .addReg(FalseReg)
        .addImm(0)
        .addReg(TrueReg)
        .addReg(SReg);
      break;
    }
    case SIInstrInfo::EXECZ: {
      // NOTE(review): this case emits a full sequence and then reports
      // llvm_unreachable, so the generated code is never used — EXECZ is
      // apparently not expected to reach here; confirm before relying on it.
      Register SReg = MRI.createVirtualRegister(BoolXExecRC);
      Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
      BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
                                            : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
        .addImm(0);
      BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
                                            : AMDGPU::S_CSELECT_B64), SReg)
        .addImm(0)
        .addImm(1);
      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
        .addImm(0)
        .addReg(FalseReg)
        .addImm(0)
        .addReg(TrueReg)
        .addReg(SReg);
      llvm_unreachable("Unhandled branch predicate EXECZ");
      break;
    }
    default:
      llvm_unreachable("invalid branch predicate");
    }
  } else {
    llvm_unreachable("Can only handle Cond size 1 or 2");
  }
}
1171 
1172 Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
1173                                MachineBasicBlock::iterator I,
1174                                const DebugLoc &DL,
1175                                Register SrcReg, int Value) const {
1176   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1177   Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1178   BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
1179     .addImm(Value)
1180     .addReg(SrcReg);
1181 
1182   return Reg;
1183 }
1184 
1185 Register SIInstrInfo::insertNE(MachineBasicBlock *MBB,
1186                                MachineBasicBlock::iterator I,
1187                                const DebugLoc &DL,
1188                                Register SrcReg, int Value) const {
1189   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1190   Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1191   BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
1192     .addImm(Value)
1193     .addReg(SrcReg);
1194 
1195   return Reg;
1196 }
1197 
1198 unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
1199 
1200   if (RI.hasAGPRs(DstRC))
1201     return AMDGPU::COPY;
1202   if (RI.getRegSizeInBits(*DstRC) == 32) {
1203     return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1204   } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) {
1205     return AMDGPU::S_MOV_B64;
1206   } else if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC)) {
1207     return  AMDGPU::V_MOV_B64_PSEUDO;
1208   }
1209   return AMDGPU::COPY;
1210 }
1211 
1212 const MCInstrDesc &
1213 SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize,
1214                                      bool IsIndirectSrc) const {
1215   if (IsIndirectSrc) {
1216     if (VecSize <= 32) // 4 bytes
1217       return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
1218     if (VecSize <= 64) // 8 bytes
1219       return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
1220     if (VecSize <= 96) // 12 bytes
1221       return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
1222     if (VecSize <= 128) // 16 bytes
1223       return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
1224     if (VecSize <= 160) // 20 bytes
1225       return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
1226     if (VecSize <= 256) // 32 bytes
1227       return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
1228     if (VecSize <= 512) // 64 bytes
1229       return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
1230     if (VecSize <= 1024) // 128 bytes
1231       return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);
1232 
1233     llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
1234   }
1235 
1236   if (VecSize <= 32) // 4 bytes
1237     return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
1238   if (VecSize <= 64) // 8 bytes
1239     return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
1240   if (VecSize <= 96) // 12 bytes
1241     return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
1242   if (VecSize <= 128) // 16 bytes
1243     return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
1244   if (VecSize <= 160) // 20 bytes
1245     return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
1246   if (VecSize <= 256) // 32 bytes
1247     return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
1248   if (VecSize <= 512) // 64 bytes
1249     return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
1250   if (VecSize <= 1024) // 128 bytes
1251     return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);
1252 
1253   llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
1254 }
1255 
1256 static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
1257   if (VecSize <= 32) // 4 bytes
1258     return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1259   if (VecSize <= 64) // 8 bytes
1260     return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1261   if (VecSize <= 96) // 12 bytes
1262     return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1263   if (VecSize <= 128) // 16 bytes
1264     return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1265   if (VecSize <= 160) // 20 bytes
1266     return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1267   if (VecSize <= 256) // 32 bytes
1268     return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1269   if (VecSize <= 512) // 64 bytes
1270     return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1271   if (VecSize <= 1024) // 128 bytes
1272     return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1273 
1274   llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1275 }
1276 
1277 static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
1278   if (VecSize <= 32) // 4 bytes
1279     return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1280   if (VecSize <= 64) // 8 bytes
1281     return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1282   if (VecSize <= 96) // 12 bytes
1283     return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1284   if (VecSize <= 128) // 16 bytes
1285     return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1286   if (VecSize <= 160) // 20 bytes
1287     return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1288   if (VecSize <= 256) // 32 bytes
1289     return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1290   if (VecSize <= 512) // 64 bytes
1291     return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1292   if (VecSize <= 1024) // 128 bytes
1293     return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1294 
1295   llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1296 }
1297 
1298 static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
1299   if (VecSize <= 64) // 8 bytes
1300     return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
1301   if (VecSize <= 128) // 16 bytes
1302     return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
1303   if (VecSize <= 256) // 32 bytes
1304     return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
1305   if (VecSize <= 512) // 64 bytes
1306     return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
1307   if (VecSize <= 1024) // 128 bytes
1308     return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
1309 
1310   llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1311 }
1312 
1313 const MCInstrDesc &
1314 SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
1315                                              bool IsSGPR) const {
1316   if (IsSGPR) {
1317     switch (EltSize) {
1318     case 32:
1319       return get(getIndirectSGPRWriteMovRelPseudo32(VecSize));
1320     case 64:
1321       return get(getIndirectSGPRWriteMovRelPseudo64(VecSize));
1322     default:
1323       llvm_unreachable("invalid reg indexing elt size");
1324     }
1325   }
1326 
1327   assert(EltSize == 32 && "invalid reg indexing elt size");
1328   return get(getIndirectVGPRWriteMovRelPseudoOpc(VecSize));
1329 }
1330 
1331 static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
1332   switch (Size) {
1333   case 4:
1334     return AMDGPU::SI_SPILL_S32_SAVE;
1335   case 8:
1336     return AMDGPU::SI_SPILL_S64_SAVE;
1337   case 12:
1338     return AMDGPU::SI_SPILL_S96_SAVE;
1339   case 16:
1340     return AMDGPU::SI_SPILL_S128_SAVE;
1341   case 20:
1342     return AMDGPU::SI_SPILL_S160_SAVE;
1343   case 24:
1344     return AMDGPU::SI_SPILL_S192_SAVE;
1345   case 28:
1346     return AMDGPU::SI_SPILL_S224_SAVE;
1347   case 32:
1348     return AMDGPU::SI_SPILL_S256_SAVE;
1349   case 64:
1350     return AMDGPU::SI_SPILL_S512_SAVE;
1351   case 128:
1352     return AMDGPU::SI_SPILL_S1024_SAVE;
1353   default:
1354     llvm_unreachable("unknown register size");
1355   }
1356 }
1357 
1358 static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
1359   switch (Size) {
1360   case 4:
1361     return AMDGPU::SI_SPILL_V32_SAVE;
1362   case 8:
1363     return AMDGPU::SI_SPILL_V64_SAVE;
1364   case 12:
1365     return AMDGPU::SI_SPILL_V96_SAVE;
1366   case 16:
1367     return AMDGPU::SI_SPILL_V128_SAVE;
1368   case 20:
1369     return AMDGPU::SI_SPILL_V160_SAVE;
1370   case 24:
1371     return AMDGPU::SI_SPILL_V192_SAVE;
1372   case 28:
1373     return AMDGPU::SI_SPILL_V224_SAVE;
1374   case 32:
1375     return AMDGPU::SI_SPILL_V256_SAVE;
1376   case 64:
1377     return AMDGPU::SI_SPILL_V512_SAVE;
1378   case 128:
1379     return AMDGPU::SI_SPILL_V1024_SAVE;
1380   default:
1381     llvm_unreachable("unknown register size");
1382   }
1383 }
1384 
1385 static unsigned getAGPRSpillSaveOpcode(unsigned Size) {
1386   switch (Size) {
1387   case 4:
1388     return AMDGPU::SI_SPILL_A32_SAVE;
1389   case 8:
1390     return AMDGPU::SI_SPILL_A64_SAVE;
1391   case 12:
1392     return AMDGPU::SI_SPILL_A96_SAVE;
1393   case 16:
1394     return AMDGPU::SI_SPILL_A128_SAVE;
1395   case 20:
1396     return AMDGPU::SI_SPILL_A160_SAVE;
1397   case 24:
1398     return AMDGPU::SI_SPILL_A192_SAVE;
1399   case 28:
1400     return AMDGPU::SI_SPILL_A224_SAVE;
1401   case 32:
1402     return AMDGPU::SI_SPILL_A256_SAVE;
1403   case 64:
1404     return AMDGPU::SI_SPILL_A512_SAVE;
1405   case 128:
1406     return AMDGPU::SI_SPILL_A1024_SAVE;
1407   default:
1408     llvm_unreachable("unknown register size");
1409   }
1410 }
1411 
/// Store \p SrcReg to stack slot \p FrameIndex by emitting the appropriate
/// SI_SPILL_*_SAVE pseudo (expanded later); SGPR and VGPR/AGPR spills take
/// different pseudo forms.
void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator MI,
                                      Register SrcReg, bool isKill,
                                      int FrameIndex,
                                      const TargetRegisterClass *RC,
                                      const TargetRegisterInfo *TRI) const {
  MachineFunction *MF = MBB.getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
  const DebugLoc &DL = MBB.findDebugLoc(MI);

  // Describe the store to the fixed stack object for the scheduler/AA.
  MachinePointerInfo PtrInfo
    = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
  MachineMemOperand *MMO = MF->getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
      FrameInfo.getObjectAlign(FrameIndex));
  unsigned SpillSize = TRI->getSpillSize(*RC);

  if (RI.isSGPRClass(RC)) {
    MFI->setHasSpilledSGPRs();
    assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
    assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
           SrcReg != AMDGPU::EXEC && "exec should not be spilled");

    // We are only allowed to create one new instruction when spilling
    // registers, so we need to use pseudo instruction for spilling SGPRs.
    const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));

    // The SGPR spill/restore instructions only work on number sgprs, so we need
    // to make sure we are using the correct register class.
    if (SrcReg.isVirtual() && SpillSize == 4) {
      MachineRegisterInfo &MRI = MF->getRegInfo();
      MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
    }

    BuildMI(MBB, MI, DL, OpDesc)
      .addReg(SrcReg, getKillRegState(isKill)) // data
      .addFrameIndex(FrameIndex)               // addr
      .addMemOperand(MMO)
      .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);

    // Mark the slot so later passes know this SGPR spill may be lowered to
    // VGPR lanes rather than memory.
    if (RI.spillSGPRToVGPR())
      FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
    return;
  }

  // Vector registers: choose AGPR or VGPR save pseudo by register class.
  unsigned Opcode = RI.hasAGPRs(RC) ? getAGPRSpillSaveOpcode(SpillSize)
                                    : getVGPRSpillSaveOpcode(SpillSize);
  MFI->setHasSpilledVGPRs();

  BuildMI(MBB, MI, DL, get(Opcode))
    .addReg(SrcReg, getKillRegState(isKill)) // data
    .addFrameIndex(FrameIndex)               // addr
    .addReg(MFI->getStackPtrOffsetReg())     // scratch_offset
    .addImm(0)                               // offset
    .addMemOperand(MMO);
}
1469 
1470 static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
1471   switch (Size) {
1472   case 4:
1473     return AMDGPU::SI_SPILL_S32_RESTORE;
1474   case 8:
1475     return AMDGPU::SI_SPILL_S64_RESTORE;
1476   case 12:
1477     return AMDGPU::SI_SPILL_S96_RESTORE;
1478   case 16:
1479     return AMDGPU::SI_SPILL_S128_RESTORE;
1480   case 20:
1481     return AMDGPU::SI_SPILL_S160_RESTORE;
1482   case 24:
1483     return AMDGPU::SI_SPILL_S192_RESTORE;
1484   case 28:
1485     return AMDGPU::SI_SPILL_S224_RESTORE;
1486   case 32:
1487     return AMDGPU::SI_SPILL_S256_RESTORE;
1488   case 64:
1489     return AMDGPU::SI_SPILL_S512_RESTORE;
1490   case 128:
1491     return AMDGPU::SI_SPILL_S1024_RESTORE;
1492   default:
1493     llvm_unreachable("unknown register size");
1494   }
1495 }
1496 
1497 static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
1498   switch (Size) {
1499   case 4:
1500     return AMDGPU::SI_SPILL_V32_RESTORE;
1501   case 8:
1502     return AMDGPU::SI_SPILL_V64_RESTORE;
1503   case 12:
1504     return AMDGPU::SI_SPILL_V96_RESTORE;
1505   case 16:
1506     return AMDGPU::SI_SPILL_V128_RESTORE;
1507   case 20:
1508     return AMDGPU::SI_SPILL_V160_RESTORE;
1509   case 24:
1510     return AMDGPU::SI_SPILL_V192_RESTORE;
1511   case 28:
1512     return AMDGPU::SI_SPILL_V224_RESTORE;
1513   case 32:
1514     return AMDGPU::SI_SPILL_V256_RESTORE;
1515   case 64:
1516     return AMDGPU::SI_SPILL_V512_RESTORE;
1517   case 128:
1518     return AMDGPU::SI_SPILL_V1024_RESTORE;
1519   default:
1520     llvm_unreachable("unknown register size");
1521   }
1522 }
1523 
1524 static unsigned getAGPRSpillRestoreOpcode(unsigned Size) {
1525   switch (Size) {
1526   case 4:
1527     return AMDGPU::SI_SPILL_A32_RESTORE;
1528   case 8:
1529     return AMDGPU::SI_SPILL_A64_RESTORE;
1530   case 12:
1531     return AMDGPU::SI_SPILL_A96_RESTORE;
1532   case 16:
1533     return AMDGPU::SI_SPILL_A128_RESTORE;
1534   case 20:
1535     return AMDGPU::SI_SPILL_A160_RESTORE;
1536   case 24:
1537     return AMDGPU::SI_SPILL_A192_RESTORE;
1538   case 28:
1539     return AMDGPU::SI_SPILL_A224_RESTORE;
1540   case 32:
1541     return AMDGPU::SI_SPILL_A256_RESTORE;
1542   case 64:
1543     return AMDGPU::SI_SPILL_A512_RESTORE;
1544   case 128:
1545     return AMDGPU::SI_SPILL_A1024_RESTORE;
1546   default:
1547     llvm_unreachable("unknown register size");
1548   }
1549 }
1550 
/// Reload \p DestReg from stack slot \p FrameIndex by emitting the
/// appropriate SI_SPILL_*_RESTORE pseudo (expanded later).
void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
                                       MachineBasicBlock::iterator MI,
                                       Register DestReg, int FrameIndex,
                                       const TargetRegisterClass *RC,
                                       const TargetRegisterInfo *TRI) const {
  MachineFunction *MF = MBB.getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
  const DebugLoc &DL = MBB.findDebugLoc(MI);
  unsigned SpillSize = TRI->getSpillSize(*RC);

  // Describe the load from the fixed stack object for the scheduler/AA.
  MachinePointerInfo PtrInfo
    = MachinePointerInfo::getFixedStack(*MF, FrameIndex);

  MachineMemOperand *MMO = MF->getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
      FrameInfo.getObjectAlign(FrameIndex));

  if (RI.isSGPRClass(RC)) {
    MFI->setHasSpilledSGPRs();
    assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
    assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
           DestReg != AMDGPU::EXEC && "exec should not be spilled");

    // FIXME: Maybe this should not include a memoperand because it will be
    // lowered to non-memory instructions.
    const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
    // 32-bit SGPR restores only work on "number" SGPRs; constrain the class
    // to exclude M0/EXEC (mirrors the constraint in storeRegToStackSlot).
    if (DestReg.isVirtual() && SpillSize == 4) {
      MachineRegisterInfo &MRI = MF->getRegInfo();
      MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
    }

    // Mark the slot so later passes know this SGPR spill may be lowered to
    // VGPR lanes rather than memory.
    if (RI.spillSGPRToVGPR())
      FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
    BuildMI(MBB, MI, DL, OpDesc, DestReg)
      .addFrameIndex(FrameIndex) // addr
      .addMemOperand(MMO)
      .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);

    return;
  }

  // Vector registers: choose AGPR or VGPR restore pseudo by register class.
  unsigned Opcode = RI.hasAGPRs(RC) ? getAGPRSpillRestoreOpcode(SpillSize)
                                    : getVGPRSpillRestoreOpcode(SpillSize);
  BuildMI(MBB, MI, DL, get(Opcode), DestReg)
    .addFrameIndex(FrameIndex)        // vaddr
    .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
    .addImm(0)                           // offset
    .addMemOperand(MMO);
}
1601 
// Insert a single wait-state nop before MI. Convenience wrapper around
// insertNoops() with Quantity == 1.
void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator MI) const {
  insertNoops(MBB, MI, 1);
}
1606 
1607 void SIInstrInfo::insertNoops(MachineBasicBlock &MBB,
1608                               MachineBasicBlock::iterator MI,
1609                               unsigned Quantity) const {
1610   DebugLoc DL = MBB.findDebugLoc(MI);
1611   while (Quantity > 0) {
1612     unsigned Arg = std::min(Quantity, 8u);
1613     Quantity -= Arg;
1614     BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
1615   }
1616 }
1617 
1618 void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
1619   auto MF = MBB.getParent();
1620   SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
1621 
1622   assert(Info->isEntryFunction());
1623 
1624   if (MBB.succ_empty()) {
1625     bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
1626     if (HasNoTerminator) {
1627       if (Info->returnsVoid()) {
1628         BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
1629       } else {
1630         BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
1631       }
1632     }
1633   }
1634 }
1635 
1636 unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
1637   switch (MI.getOpcode()) {
1638   default: return 1; // FIXME: Do wait states equal cycles?
1639 
1640   case AMDGPU::S_NOP:
1641     return MI.getOperand(0).getImm() + 1;
1642   }
1643 }
1644 
// Expand target-specific post-RA pseudo instructions into real machine
// instructions in place. Returns true when the pseudo has been expanded;
// unknown opcodes are forwarded to TargetInstrInfo::expandPostRAPseudo.
bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MBB.findDebugLoc(MI);
  switch (MI.getOpcode()) {
  default: return TargetInstrInfo::expandPostRAPseudo(MI);
  case AMDGPU::S_MOV_B64_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_MOV_B64));
    break;

  case AMDGPU::S_MOV_B32_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_MOV_B32));
    break;

  case AMDGPU::S_XOR_B64_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_XOR_B64));
    break;

  case AMDGPU::S_XOR_B32_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_XOR_B32));
    break;
  case AMDGPU::S_OR_B64_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_OR_B64));
    break;
  case AMDGPU::S_OR_B32_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_OR_B32));
    break;

  case AMDGPU::S_ANDN2_B64_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_ANDN2_B64));
    break;

  case AMDGPU::S_ANDN2_B32_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_ANDN2_B32));
    break;

  case AMDGPU::S_AND_B64_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_AND_B64));
    break;

  case AMDGPU::S_AND_B32_term:
    // This is only a terminator to get the correct spill code placement during
    // register allocation.
    MI.setDesc(get(AMDGPU::S_AND_B32));
    break;

  case AMDGPU::V_MOV_B64_PSEUDO: {
    Register Dst = MI.getOperand(0).getReg();
    Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
    Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);

    const MachineOperand &SrcOp = MI.getOperand(1);
    // FIXME: Will this work for 64-bit floating point immediates?
    assert(!SrcOp.isFPImm());
    if (SrcOp.isImm()) {
      // Split the 64-bit immediate into two 32-bit halves.
      APInt Imm(64, SrcOp.getImm());
      APInt Lo(32, Imm.getLoBits(32).getZExtValue());
      APInt Hi(32, Imm.getHiBits(32).getZExtValue());
      // When both halves are equal and inlinable, a single v_pk_mov_b32 can
      // write the full 64 bits at once.
      if (ST.hasPackedFP32Ops() && Lo == Hi && isInlineConstant(Lo)) {
        BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
          .addImm(SISrcMods::OP_SEL_1)
          .addImm(Lo.getSExtValue())
          .addImm(SISrcMods::OP_SEL_1)
          .addImm(Lo.getSExtValue())
          .addImm(0)  // op_sel_lo
          .addImm(0)  // op_sel_hi
          .addImm(0)  // neg_lo
          .addImm(0)  // neg_hi
          .addImm(0); // clamp
      } else {
        // Otherwise move the two halves separately; the implicit def of the
        // full 64-bit register keeps liveness correct across the pair.
        BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
          .addImm(Lo.getSExtValue())
          .addReg(Dst, RegState::Implicit | RegState::Define);
        BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
          .addImm(Hi.getSExtValue())
          .addReg(Dst, RegState::Implicit | RegState::Define);
      }
    } else {
      assert(SrcOp.isReg());
      // NOTE(review): the packed path is skipped for AGPR sources, presumably
      // because v_pk_mov_b32 can't read AGPRs — confirm against the ISA docs.
      if (ST.hasPackedFP32Ops() &&
          !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
        BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
          .addImm(SISrcMods::OP_SEL_1) // src0_mod
          .addReg(SrcOp.getReg())
          .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) // src1_mod
          .addReg(SrcOp.getReg())
          .addImm(0)  // op_sel_lo
          .addImm(0)  // op_sel_hi
          .addImm(0)  // neg_lo
          .addImm(0)  // neg_hi
          .addImm(0); // clamp
      } else {
        BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
          .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
          .addReg(Dst, RegState::Implicit | RegState::Define);
        BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
          .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
          .addReg(Dst, RegState::Implicit | RegState::Define);
      }
    }
    MI.eraseFromParent();
    break;
  }
  case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
    expandMovDPP64(MI);
    break;
  }
  case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
    const MachineOperand &SrcOp = MI.getOperand(1);
    assert(!SrcOp.isFPImm());
    APInt Imm(64, SrcOp.getImm());
    // A 32-bit-representable or inline constant fits a plain s_mov_b64.
    if (Imm.isIntN(32) || isInlineConstant(Imm)) {
      MI.setDesc(get(AMDGPU::S_MOV_B64));
      break;
    }

    Register Dst = MI.getOperand(0).getReg();
    Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
    Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);

    // Otherwise materialize the constant as two 32-bit moves.
    APInt Lo(32, Imm.getLoBits(32).getZExtValue());
    APInt Hi(32, Imm.getHiBits(32).getZExtValue());
    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
      .addImm(Lo.getSExtValue())
      .addReg(Dst, RegState::Implicit | RegState::Define);
    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
      .addImm(Hi.getSExtValue())
      .addReg(Dst, RegState::Implicit | RegState::Define);
    MI.eraseFromParent();
    break;
  }
  case AMDGPU::V_SET_INACTIVE_B32: {
    // Write the source value into the lanes that are inactive in the current
    // exec mask: invert exec, move, then invert exec back.
    unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
    unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
    auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
    FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
      .add(MI.getOperand(2));
    BuildMI(MBB, MI, DL, get(NotOpc), Exec)
      .addReg(Exec);
    MI.eraseFromParent();
    break;
  }
  case AMDGPU::V_SET_INACTIVE_B64: {
    // Same as the B32 variant, but the 64-bit move is itself a pseudo that
    // must be expanded recursively.
    unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
    unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
    auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
    FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
    MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
                                 MI.getOperand(0).getReg())
      .add(MI.getOperand(2));
    expandPostRAPseudo(*Copy);
    BuildMI(MBB, MI, DL, get(NotOpc), Exec)
      .addReg(Exec);
    MI.eraseFromParent();
    break;
  }
  case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
  case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
  case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
  case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
  case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
  case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
  case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
  case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
  case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
    // Indirect write into a register tuple via M0-relative addressing
    // (v_movreld / s_movreld), selected by the element register class.
    const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);

    unsigned Opc;
    if (RI.hasVGPRs(EltRC)) {
      Opc = AMDGPU::V_MOVRELD_B32_e32;
    } else {
      Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
                                              : AMDGPU::S_MOVRELD_B32;
    }

    const MCInstrDesc &OpDesc = get(Opc);
    Register VecReg = MI.getOperand(0).getReg();
    bool IsUndef = MI.getOperand(1).isUndef();
    unsigned SubReg = MI.getOperand(3).getImm();
    assert(VecReg == MI.getOperand(1).getReg());

    MachineInstrBuilder MIB =
      BuildMI(MBB, MI, DL, OpDesc)
        .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
        .add(MI.getOperand(2))
        .addReg(VecReg, RegState::ImplicitDefine)
        .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));

    // Tie the implicit def of the whole vector to its implicit use so the
    // partial write keeps the rest of the tuple live. NOTE(review): this
    // index math assumes the descriptor's implicit operands are laid out
    // with uses counted before the manually added operands — confirm.
    const int ImpDefIdx =
      OpDesc.getNumOperands() + OpDesc.getNumImplicitUses();
    const int ImpUseIdx = ImpDefIdx + 1;
    MIB->tieOperands(ImpDefIdx, ImpUseIdx);
    MI.eraseFromParent();
    break;
  }
  case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
  case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
  case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
  case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
  case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
  case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
  case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
  case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
    // Indirect write using GPR index mode: bracket the move with
    // s_set_gpr_idx_on / s_set_gpr_idx_off and bundle so the scheduler
    // can't separate them.
    assert(ST.useVGPRIndexMode());
    Register VecReg = MI.getOperand(0).getReg();
    bool IsUndef = MI.getOperand(1).isUndef();
    Register Idx = MI.getOperand(3).getReg();
    Register SubReg = MI.getOperand(4).getImm();

    MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
                              .addReg(Idx)
                              .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE);
    SetOn->getOperand(3).setIsUndef();

    const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect);
    MachineInstrBuilder MIB =
        BuildMI(MBB, MI, DL, OpDesc)
            .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
            .add(MI.getOperand(2))
            .addReg(VecReg, RegState::ImplicitDefine)
            .addReg(VecReg,
                    RegState::Implicit | (IsUndef ? RegState::Undef : 0));

    // Tie the implicit vector def to its implicit use (see MOVREL case).
    const int ImpDefIdx = OpDesc.getNumOperands() + OpDesc.getNumImplicitUses();
    const int ImpUseIdx = ImpDefIdx + 1;
    MIB->tieOperands(ImpDefIdx, ImpUseIdx);

    MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));

    finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));

    MI.eraseFromParent();
    break;
  }
  case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
  case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
  case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
  case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
  case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
  case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
  case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
  case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
    // Indirect read using GPR index mode, bundled like the write case but
    // with SRC0 indexing enabled instead of DST.
    assert(ST.useVGPRIndexMode());
    Register Dst = MI.getOperand(0).getReg();
    Register VecReg = MI.getOperand(1).getReg();
    bool IsUndef = MI.getOperand(1).isUndef();
    Register Idx = MI.getOperand(2).getReg();
    Register SubReg = MI.getOperand(3).getImm();

    MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
                              .addReg(Idx)
                              .addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE);
    SetOn->getOperand(3).setIsUndef();

    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32))
        .addDef(Dst)
        .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
        .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0))
        .addReg(AMDGPU::M0, RegState::Implicit);

    MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));

    finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));

    MI.eraseFromParent();
    break;
  }
  case AMDGPU::SI_PC_ADD_REL_OFFSET: {
    // Materialize a PC-relative address: s_getpc_b64 followed by a 64-bit
    // add of the relocation offset, kept together in a bundle.
    MachineFunction &MF = *MBB.getParent();
    Register Reg = MI.getOperand(0).getReg();
    Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
    Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);

    // Create a bundle so these instructions won't be re-ordered by the
    // post-RA scheduler.
    MIBundleBuilder Bundler(MBB, MI);
    Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));

    // Add 32-bit offset from this instruction to the start of the
    // constant data.
    Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo)
                       .addReg(RegLo)
                       .add(MI.getOperand(1)));

    MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
                                  .addReg(RegHi);
    MIB.add(MI.getOperand(2));

    Bundler.append(MIB);
    finalizeBundle(MBB, Bundler.begin());

    MI.eraseFromParent();
    break;
  }
  case AMDGPU::ENTER_STRICT_WWM: {
    // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
    // Whole Wave Mode is entered.
    MI.setDesc(get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
                                 : AMDGPU::S_OR_SAVEEXEC_B64));
    break;
  }
  case AMDGPU::ENTER_STRICT_WQM: {
    // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
    // STRICT_WQM is entered. Save the old exec into the destination, then
    // switch exec to whole-quad mode.
    const unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
    const unsigned WQMOp = ST.isWave32() ? AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64;
    const unsigned MovOp = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    BuildMI(MBB, MI, DL, get(MovOp), MI.getOperand(0).getReg()).addReg(Exec);
    BuildMI(MBB, MI, DL, get(WQMOp), Exec).addReg(Exec);

    MI.eraseFromParent();
    break;
  }
  case AMDGPU::EXIT_STRICT_WWM:
  case AMDGPU::EXIT_STRICT_WQM: {
    // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
    // WWM/STRICT_WQM is exited. It becomes a plain move restoring exec.
    MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
    break;
  }
  }
  return true;
}
1999 
// Expand a V_MOV_B64_DPP_PSEUDO into two 32-bit v_mov_b32_dpp instructions,
// one per half of the 64-bit value. Returns the pair of new instructions
// (sub0 half first, then sub1). For virtual destinations the halves are
// recombined with a REG_SEQUENCE; physical destinations write the subregs
// directly. The original pseudo is erased.
std::pair<MachineInstr*, MachineInstr*>
SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {
  assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);

  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MBB.findDebugLoc(MI);
  MachineFunction *MF = MBB.getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  Register Dst = MI.getOperand(0).getReg();
  unsigned Part = 0;
  MachineInstr *Split[2];

  // Emit one v_mov_b32_dpp per 32-bit half.
  for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
    auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
    if (Dst.isPhysical()) {
      MovDPP.addDef(RI.getSubReg(Dst, Sub));
    } else {
      // Pre-RA (SSA): create a fresh 32-bit temp; the halves are joined by
      // the REG_SEQUENCE below.
      assert(MRI.isSSA());
      auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
      MovDPP.addDef(Tmp);
    }

    for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
      const MachineOperand &SrcOp = MI.getOperand(I);
      assert(!SrcOp.isFPImm());
      if (SrcOp.isImm()) {
        // Select the 32-bit half of the immediate for this part.
        APInt Imm(64, SrcOp.getImm());
        Imm.ashrInPlace(Part * 32);
        MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
      } else {
        assert(SrcOp.isReg());
        Register Src = SrcOp.getReg();
        if (Src.isPhysical())
          MovDPP.addReg(RI.getSubReg(Src, Sub));
        else
          MovDPP.addReg(Src, SrcOp.isUndef() ? RegState::Undef : 0, Sub);
      }
    }

    // Copy the remaining DPP control immediates unchanged.
    for (unsigned I = 3; I < MI.getNumExplicitOperands(); ++I)
      MovDPP.addImm(MI.getOperand(I).getImm());

    Split[Part] = MovDPP;
    ++Part;
  }

  if (Dst.isVirtual())
    BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
      .addReg(Split[0]->getOperand(0).getReg())
      .addImm(AMDGPU::sub0)
      .addReg(Split[1]->getOperand(0).getReg())
      .addImm(AMDGPU::sub1);

  MI.eraseFromParent();
  return std::make_pair(Split[0], Split[1]);
}
2056 
2057 bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,
2058                                       MachineOperand &Src0,
2059                                       unsigned Src0OpName,
2060                                       MachineOperand &Src1,
2061                                       unsigned Src1OpName) const {
2062   MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
2063   if (!Src0Mods)
2064     return false;
2065 
2066   MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
2067   assert(Src1Mods &&
2068          "All commutable instructions have both src0 and src1 modifiers");
2069 
2070   int Src0ModsVal = Src0Mods->getImm();
2071   int Src1ModsVal = Src1Mods->getImm();
2072 
2073   Src1Mods->setImm(Src0ModsVal);
2074   Src0Mods->setImm(Src1ModsVal);
2075   return true;
2076 }
2077 
2078 static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
2079                                              MachineOperand &RegOp,
2080                                              MachineOperand &NonRegOp) {
2081   Register Reg = RegOp.getReg();
2082   unsigned SubReg = RegOp.getSubReg();
2083   bool IsKill = RegOp.isKill();
2084   bool IsDead = RegOp.isDead();
2085   bool IsUndef = RegOp.isUndef();
2086   bool IsDebug = RegOp.isDebug();
2087 
2088   if (NonRegOp.isImm())
2089     RegOp.ChangeToImmediate(NonRegOp.getImm());
2090   else if (NonRegOp.isFI())
2091     RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
2092   else if (NonRegOp.isGlobal()) {
2093     RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(),
2094                      NonRegOp.getTargetFlags());
2095   } else
2096     return nullptr;
2097 
2098   // Make sure we don't reinterpret a subreg index in the target flags.
2099   RegOp.setTargetFlags(NonRegOp.getTargetFlags());
2100 
2101   NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
2102   NonRegOp.setSubReg(SubReg);
2103 
2104   return &MI;
2105 }
2106 
// Commute the src0/src1 operands of MI in place (NewMI is unsupported and
// must be false). Returns the commuted instruction, or nullptr if the
// opcode has no commuted form or the swapped operands would be illegal.
// Source modifiers are swapped along with the operands.
MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
                                                  unsigned Src0Idx,
                                                  unsigned Src1Idx) const {
  assert(!NewMI && "this should never be used");

  unsigned Opc = MI.getOpcode();
  // Bail if there is no opcode encoding the commuted operand order.
  int CommutedOpcode = commuteOpcode(Opc);
  if (CommutedOpcode == -1)
    return nullptr;

  assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
           static_cast<int>(Src0Idx) &&
         AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
           static_cast<int>(Src1Idx) &&
         "inconsistency with findCommutedOpIndices");

  MachineOperand &Src0 = MI.getOperand(Src0Idx);
  MachineOperand &Src1 = MI.getOperand(Src1Idx);

  MachineInstr *CommutedMI = nullptr;
  if (Src0.isReg() && Src1.isReg()) {
    // Both sources are registers: only legal if src0 is acceptable in the
    // src1 slot (src1 is often more constrained, e.g. no SGPRs).
    if (isOperandLegal(MI, Src1Idx, &Src0)) {
      // Be sure to copy the source modifiers to the right place.
      CommutedMI
        = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
    }

  } else if (Src0.isReg() && !Src1.isReg()) {
    // src0 should always be able to support any operand type, so no need to
    // check operand legality.
    CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
  } else if (!Src0.isReg() && Src1.isReg()) {
    // Moving the non-register src0 into the src1 slot needs a legality check.
    if (isOperandLegal(MI, Src1Idx, &Src0))
      CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
  } else {
    // FIXME: Found two non registers to commute. This does happen.
    return nullptr;
  }

  if (CommutedMI) {
    // Swap the modifier immediates and switch to the commuted opcode.
    swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
                        Src1, AMDGPU::OpName::src1_modifiers);

    CommutedMI->setDesc(get(CommutedOpcode));
  }

  return CommutedMI;
}
2155 
// This needs to be implemented because the source modifiers may be inserted
// between the true commutable operands, and the base
// TargetInstrInfo::commuteInstruction uses it.
// Thin wrapper that forwards to the MCInstrDesc overload below.
bool SIInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
                                        unsigned &SrcOpIdx0,
                                        unsigned &SrcOpIdx1) const {
  return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
}
2164 
2165 bool SIInstrInfo::findCommutedOpIndices(MCInstrDesc Desc, unsigned &SrcOpIdx0,
2166                                         unsigned &SrcOpIdx1) const {
2167   if (!Desc.isCommutable())
2168     return false;
2169 
2170   unsigned Opc = Desc.getOpcode();
2171   int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2172   if (Src0Idx == -1)
2173     return false;
2174 
2175   int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
2176   if (Src1Idx == -1)
2177     return false;
2178 
2179   return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
2180 }
2181 
2182 bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
2183                                         int64_t BrOffset) const {
2184   // BranchRelaxation should never have to check s_setpc_b64 because its dest
2185   // block is unanalyzable.
2186   assert(BranchOp != AMDGPU::S_SETPC_B64);
2187 
2188   // Convert to dwords.
2189   BrOffset /= 4;
2190 
2191   // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
2192   // from the next instruction.
2193   BrOffset -= 1;
2194 
2195   return isIntN(BranchOffsetBits, BrOffset);
2196 }
2197 
2198 MachineBasicBlock *SIInstrInfo::getBranchDestBlock(
2199   const MachineInstr &MI) const {
2200   if (MI.getOpcode() == AMDGPU::S_SETPC_B64) {
2201     // This would be a difficult analysis to perform, but can always be legal so
2202     // there's no need to analyze it.
2203     return nullptr;
2204   }
2205 
2206   return MI.getOperand(0).getMBB();
2207 }
2208 
// Insert a long (indirect) unconditional branch to DestBB into the freshly
// created block MBB: s_getpc_b64 + 64-bit add of the signed offset +
// s_setpc_b64. The scratch SGPR pair is found with the register scavenger.
// Returns the total encoded size in bytes of the inserted sequence.
unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
                                           MachineBasicBlock &DestBB,
                                           const DebugLoc &DL,
                                           int64_t BrOffset,
                                           RegScavenger *RS) const {
  assert(RS && "RegScavenger required for long branching");
  assert(MBB.empty() &&
         "new block should be inserted for expanding unconditional branch");
  assert(MBB.pred_size() == 1);

  MachineFunction *MF = MBB.getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  // FIXME: Virtual register workaround for RegScavenger not working with empty
  // blocks.
  Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);

  auto I = MBB.end();

  // We need to compute the offset relative to the instruction immediately after
  // s_getpc_b64. Insert pc arithmetic code before last terminator.
  MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);

  // TODO: Handle > 32-bit block address.
  // The MBB operands carry long-branch target flags so the actual 32-bit
  // offset is filled in later; the branch direction selects add vs sub.
  if (BrOffset >= 0) {
    BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
      .addReg(PCReg, RegState::Define, AMDGPU::sub0)
      .addReg(PCReg, 0, AMDGPU::sub0)
      .addMBB(&DestBB, MO_LONG_BRANCH_FORWARD);
    BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
      .addReg(PCReg, RegState::Define, AMDGPU::sub1)
      .addReg(PCReg, 0, AMDGPU::sub1)
      .addImm(0);
  } else {
    // Backwards branch.
    BuildMI(MBB, I, DL, get(AMDGPU::S_SUB_U32))
      .addReg(PCReg, RegState::Define, AMDGPU::sub0)
      .addReg(PCReg, 0, AMDGPU::sub0)
      .addMBB(&DestBB, MO_LONG_BRANCH_BACKWARD);
    BuildMI(MBB, I, DL, get(AMDGPU::S_SUBB_U32))
      .addReg(PCReg, RegState::Define, AMDGPU::sub1)
      .addReg(PCReg, 0, AMDGPU::sub1)
      .addImm(0);
  }

  // Insert the indirect branch after the other terminator.
  BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
    .addReg(PCReg);

  // FIXME: If spilling is necessary, this will fail because this scavenger has
  // no emergency stack slots. It is non-trivial to spill in this situation,
  // because the restore code needs to be specially placed after the
  // jump. BranchRelaxation then needs to be made aware of the newly inserted
  // block.
  //
  // If a spill is needed for the pc register pair, we need to insert a spill
  // restore block right before the destination block, and insert a short branch
  // into the old destination block's fallthrough predecessor.
  // e.g.:
  //
  // s_cbranch_scc0 skip_long_branch:
  //
  // long_branch_bb:
  //   spill s[8:9]
  //   s_getpc_b64 s[8:9]
  //   s_add_u32 s8, s8, restore_bb
  //   s_addc_u32 s9, s9, 0
  //   s_setpc_b64 s[8:9]
  //
  // skip_long_branch:
  //   foo;
  //
  // .....
  //
  // dest_bb_fallthrough_predecessor:
  // bar;
  // s_branch dest_bb
  //
  // restore_bb:
  //  restore s[8:9]
  //  fallthrough dest_bb
  ///
  // dest_bb:
  //   buzz;

  // Replace the virtual placeholder register with a scavenged physical SGPR
  // pair that is free at the s_getpc_b64.
  RS->enterBasicBlockEnd(MBB);
  Register Scav = RS->scavengeRegisterBackwards(
    AMDGPU::SReg_64RegClass,
    MachineBasicBlock::iterator(GetPC), false, 0);
  MRI.replaceRegWith(PCReg, Scav);
  MRI.clearVirtRegs();
  RS->setRegUsed(Scav);

  // Size: s_getpc (4) + add/addc pair (8) + s_setpc (4) + 4.
  // NOTE(review): the final +4 presumably accounts for the offset being
  // relative to the instruction after s_getpc_b64 — confirm.
  return 4 + 8 + 4 + 4;
}
2304 
2305 unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
2306   switch (Cond) {
2307   case SIInstrInfo::SCC_TRUE:
2308     return AMDGPU::S_CBRANCH_SCC1;
2309   case SIInstrInfo::SCC_FALSE:
2310     return AMDGPU::S_CBRANCH_SCC0;
2311   case SIInstrInfo::VCCNZ:
2312     return AMDGPU::S_CBRANCH_VCCNZ;
2313   case SIInstrInfo::VCCZ:
2314     return AMDGPU::S_CBRANCH_VCCZ;
2315   case SIInstrInfo::EXECNZ:
2316     return AMDGPU::S_CBRANCH_EXECNZ;
2317   case SIInstrInfo::EXECZ:
2318     return AMDGPU::S_CBRANCH_EXECZ;
2319   default:
2320     llvm_unreachable("invalid branch predicate");
2321   }
2322 }
2323 
2324 SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
2325   switch (Opcode) {
2326   case AMDGPU::S_CBRANCH_SCC0:
2327     return SCC_FALSE;
2328   case AMDGPU::S_CBRANCH_SCC1:
2329     return SCC_TRUE;
2330   case AMDGPU::S_CBRANCH_VCCNZ:
2331     return VCCNZ;
2332   case AMDGPU::S_CBRANCH_VCCZ:
2333     return VCCZ;
2334   case AMDGPU::S_CBRANCH_EXECNZ:
2335     return EXECNZ;
2336   case AMDGPU::S_CBRANCH_EXECZ:
2337     return EXECZ;
2338   default:
2339     return INVALID_BR;
2340   }
2341 }
2342 
// Analyze the branch sequence starting at terminator I, following the
// TargetInstrInfo::analyzeBranch contract: fill TBB/FBB/Cond and return
// false on success, or return true if the sequence is unanalyzable.
bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator I,
                                    MachineBasicBlock *&TBB,
                                    MachineBasicBlock *&FBB,
                                    SmallVectorImpl<MachineOperand> &Cond,
                                    bool AllowModify) const {
  if (I->getOpcode() == AMDGPU::S_BRANCH) {
    // Unconditional Branch
    TBB = I->getOperand(0).getMBB();
    return false;
  }

  MachineBasicBlock *CondBB = nullptr;

  if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
    // Non-uniform branch: the condition is the pseudo's first operand.
    CondBB = I->getOperand(1).getMBB();
    Cond.push_back(I->getOperand(0));
  } else {
    // Uniform s_cbranch: encode the predicate as an immediate in Cond.
    BranchPredicate Pred = getBranchPredicate(I->getOpcode());
    if (Pred == INVALID_BR)
      return true;

    CondBB = I->getOperand(0).getMBB();
    Cond.push_back(MachineOperand::CreateImm(Pred));
    Cond.push_back(I->getOperand(1)); // Save the branch register.
  }
  ++I;

  if (I == MBB.end()) {
    // Conditional branch followed by fall-through.
    TBB = CondBB;
    return false;
  }

  if (I->getOpcode() == AMDGPU::S_BRANCH) {
    // Conditional branch followed by an unconditional branch.
    TBB = CondBB;
    FBB = I->getOperand(0).getMBB();
    return false;
  }

  // Anything else after the conditional branch is unanalyzable.
  return true;
}
2385 
// TargetInstrInfo hook: analyze the terminators of MBB. Skips the *_term
// exec-manipulation pseudos (which are terminators only for spill placement)
// before delegating the actual branch analysis to analyzeBranchImpl.
// Returns true if the block's terminators cannot be analyzed.
bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
                                MachineBasicBlock *&FBB,
                                SmallVectorImpl<MachineOperand> &Cond,
                                bool AllowModify) const {
  MachineBasicBlock::iterator I = MBB.getFirstTerminator();
  auto E = MBB.end();
  // No terminators means fall-through only; trivially analyzable.
  if (I == E)
    return false;

  // Skip over the instructions that are artificially terminators for special
  // exec management.
  while (I != E && !I->isBranch() && !I->isReturn()) {
    switch (I->getOpcode()) {
    case AMDGPU::S_MOV_B64_term:
    case AMDGPU::S_XOR_B64_term:
    case AMDGPU::S_OR_B64_term:
    case AMDGPU::S_ANDN2_B64_term:
    case AMDGPU::S_AND_B64_term:
    case AMDGPU::S_MOV_B32_term:
    case AMDGPU::S_XOR_B32_term:
    case AMDGPU::S_OR_B32_term:
    case AMDGPU::S_ANDN2_B32_term:
    case AMDGPU::S_AND_B32_term:
      break;
    case AMDGPU::SI_IF:
    case AMDGPU::SI_ELSE:
    case AMDGPU::SI_KILL_I1_TERMINATOR:
    case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
      // FIXME: It's messy that these need to be considered here at all.
      return true;
    default:
      llvm_unreachable("unexpected non-branch terminator inst");
    }

    ++I;
  }

  // Only exec-management terminators: behaves like fall-through.
  if (I == E)
    return false;

  return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
}
2428 
2429 unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
2430                                    int *BytesRemoved) const {
2431   MachineBasicBlock::iterator I = MBB.getFirstTerminator();
2432 
2433   unsigned Count = 0;
2434   unsigned RemovedSize = 0;
2435   while (I != MBB.end()) {
2436     MachineBasicBlock::iterator Next = std::next(I);
2437     RemovedSize += getInstSizeInBytes(*I);
2438     I->eraseFromParent();
2439     ++Count;
2440     I = Next;
2441   }
2442 
2443   if (BytesRemoved)
2444     *BytesRemoved = RemovedSize;
2445 
2446   return Count;
2447 }
2448 
2449 // Copy the flags onto the implicit condition register operand.
2450 static void preserveCondRegFlags(MachineOperand &CondReg,
2451                                  const MachineOperand &OrigCond) {
2452   CondReg.setIsUndef(OrigCond.isUndef());
2453   CondReg.setIsKill(OrigCond.isKill());
2454 }
2455 
2456 unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
2457                                    MachineBasicBlock *TBB,
2458                                    MachineBasicBlock *FBB,
2459                                    ArrayRef<MachineOperand> Cond,
2460                                    const DebugLoc &DL,
2461                                    int *BytesAdded) const {
2462   if (!FBB && Cond.empty()) {
2463     BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
2464       .addMBB(TBB);
2465     if (BytesAdded)
2466       *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
2467     return 1;
2468   }
2469 
2470   if(Cond.size() == 1 && Cond[0].isReg()) {
2471      BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO))
2472        .add(Cond[0])
2473        .addMBB(TBB);
2474      return 1;
2475   }
2476 
2477   assert(TBB && Cond[0].isImm());
2478 
2479   unsigned Opcode
2480     = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
2481 
2482   if (!FBB) {
2483     Cond[1].isUndef();
2484     MachineInstr *CondBr =
2485       BuildMI(&MBB, DL, get(Opcode))
2486       .addMBB(TBB);
2487 
2488     // Copy the flags onto the implicit condition register operand.
2489     preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
2490     fixImplicitOperands(*CondBr);
2491 
2492     if (BytesAdded)
2493       *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
2494     return 1;
2495   }
2496 
2497   assert(TBB && FBB);
2498 
2499   MachineInstr *CondBr =
2500     BuildMI(&MBB, DL, get(Opcode))
2501     .addMBB(TBB);
2502   fixImplicitOperands(*CondBr);
2503   BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
2504     .addMBB(FBB);
2505 
2506   MachineOperand &CondReg = CondBr->getOperand(1);
2507   CondReg.setIsUndef(Cond[1].isUndef());
2508   CondReg.setIsKill(Cond[1].isKill());
2509 
2510   if (BytesAdded)
2511     *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
2512 
2513   return 2;
2514 }
2515 
2516 bool SIInstrInfo::reverseBranchCondition(
2517   SmallVectorImpl<MachineOperand> &Cond) const {
2518   if (Cond.size() != 2) {
2519     return true;
2520   }
2521 
2522   if (Cond[0].isImm()) {
2523     Cond[0].setImm(-Cond[0].getImm());
2524     return false;
2525   }
2526 
2527   return true;
2528 }
2529 
2530 bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
2531                                   ArrayRef<MachineOperand> Cond,
2532                                   Register DstReg, Register TrueReg,
2533                                   Register FalseReg, int &CondCycles,
2534                                   int &TrueCycles, int &FalseCycles) const {
2535   switch (Cond[0].getImm()) {
2536   case VCCNZ:
2537   case VCCZ: {
2538     const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
2539     const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
2540     if (MRI.getRegClass(FalseReg) != RC)
2541       return false;
2542 
2543     int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;
2544     CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
2545 
2546     // Limit to equal cost for branch vs. N v_cndmask_b32s.
2547     return RI.hasVGPRs(RC) && NumInsts <= 6;
2548   }
2549   case SCC_TRUE:
2550   case SCC_FALSE: {
2551     // FIXME: We could insert for VGPRs if we could replace the original compare
2552     // with a vector one.
2553     const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
2554     const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
2555     if (MRI.getRegClass(FalseReg) != RC)
2556       return false;
2557 
2558     int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;
2559 
2560     // Multiples of 8 can do s_cselect_b64
2561     if (NumInsts % 2 == 0)
2562       NumInsts /= 2;
2563 
2564     CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
2565     return RI.isSGPRClass(RC);
2566   }
2567   default:
2568     return false;
2569   }
2570 }
2571 
// Materialize a select between TrueReg and FalseReg into DstReg based on the
// branch condition in Cond (as validated by canInsertSelect). Small cases use
// a single s_cselect/v_cndmask; wider values are decomposed per 32- or 64-bit
// piece into a REG_SEQUENCE.
void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I, const DebugLoc &DL,
                               Register DstReg, ArrayRef<MachineOperand> Cond,
                               Register TrueReg, Register FalseReg) const {
  BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
  // Canonicalize to the "true" predicates by swapping the operands; the enum
  // is laid out so negation flips the predicate sense.
  if (Pred == VCCZ || Pred == SCC_FALSE) {
    Pred = static_cast<BranchPredicate>(-Pred);
    std::swap(TrueReg, FalseReg);
  }

  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
  unsigned DstSize = RI.getRegSizeInBits(*DstRC);

  if (DstSize == 32) {
    // Single-instruction case.
    MachineInstr *Select;
    if (Pred == SCC_TRUE) {
      Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg)
        .addReg(TrueReg)
        .addReg(FalseReg);
    } else {
      // Instruction's operands are backwards from what is expected.
      Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg)
        .addReg(FalseReg)
        .addReg(TrueReg);
    }

    // Operand 3 is the implicit condition register (SCC or VCC).
    preserveCondRegFlags(Select->getOperand(3), Cond[1]);
    return;
  }

  if (DstSize == 64 && Pred == SCC_TRUE) {
    // 64-bit scalar select is a single instruction.
    MachineInstr *Select =
      BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
      .addReg(TrueReg)
      .addReg(FalseReg);

    preserveCondRegFlags(Select->getOperand(3), Cond[1]);
    return;
  }

  // Sub-register indices for decomposing a wide value into 32-bit pieces.
  static const int16_t Sub0_15[] = {
    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
    AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
    AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
    AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
  };

  // Sub-register indices for decomposing into 64-bit pieces (SALU pairs).
  static const int16_t Sub0_15_64[] = {
    AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
    AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
    AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
    AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
  };

  unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
  const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
  const int16_t *SubIndices = Sub0_15;
  int NElts = DstSize / 32;

  // 64-bit select is only available for SALU.
  // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
  if (Pred == SCC_TRUE) {
    if (NElts % 2) {
      SelOp = AMDGPU::S_CSELECT_B32;
      EltRC = &AMDGPU::SGPR_32RegClass;
    } else {
      SelOp = AMDGPU::S_CSELECT_B64;
      EltRC = &AMDGPU::SGPR_64RegClass;
      SubIndices = Sub0_15_64;
      NElts /= 2;
    }
  }

  // The per-piece selects feed a REG_SEQUENCE that reassembles DstReg.
  MachineInstrBuilder MIB = BuildMI(
    MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);

  // Insert the piece selects before the REG_SEQUENCE.
  I = MIB->getIterator();

  SmallVector<Register, 8> Regs;
  for (int Idx = 0; Idx != NElts; ++Idx) {
    Register DstElt = MRI.createVirtualRegister(EltRC);
    Regs.push_back(DstElt);

    unsigned SubIdx = SubIndices[Idx];

    MachineInstr *Select;
    if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
      // v_cndmask takes (false, true); s_cselect takes (true, false).
      Select =
        BuildMI(MBB, I, DL, get(SelOp), DstElt)
        .addReg(FalseReg, 0, SubIdx)
        .addReg(TrueReg, 0, SubIdx);
    } else {
      Select =
        BuildMI(MBB, I, DL, get(SelOp), DstElt)
        .addReg(TrueReg, 0, SubIdx)
        .addReg(FalseReg, 0, SubIdx);
    }

    preserveCondRegFlags(Select->getOperand(3), Cond[1]);
    fixImplicitOperands(*Select);

    MIB.addReg(DstElt)
       .addImm(SubIdx);
  }
}
2678 
2679 bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) const {
2680   switch (MI.getOpcode()) {
2681   case AMDGPU::V_MOV_B32_e32:
2682   case AMDGPU::V_MOV_B32_e64:
2683   case AMDGPU::V_MOV_B64_PSEUDO: {
2684     // If there are additional implicit register operands, this may be used for
2685     // register indexing so the source register operand isn't simply copied.
2686     unsigned NumOps = MI.getDesc().getNumOperands() +
2687       MI.getDesc().getNumImplicitUses();
2688 
2689     return MI.getNumOperands() == NumOps;
2690   }
2691   case AMDGPU::S_MOV_B32:
2692   case AMDGPU::S_MOV_B64:
2693   case AMDGPU::COPY:
2694   case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
2695   case AMDGPU::V_ACCVGPR_READ_B32_e64:
2696   case AMDGPU::V_ACCVGPR_MOV_B32:
2697     return true;
2698   default:
2699     return false;
2700   }
2701 }
2702 
2703 unsigned SIInstrInfo::getAddressSpaceForPseudoSourceKind(
2704     unsigned Kind) const {
2705   switch(Kind) {
2706   case PseudoSourceValue::Stack:
2707   case PseudoSourceValue::FixedStack:
2708     return AMDGPUAS::PRIVATE_ADDRESS;
2709   case PseudoSourceValue::ConstantPool:
2710   case PseudoSourceValue::GOT:
2711   case PseudoSourceValue::JumpTable:
2712   case PseudoSourceValue::GlobalValueCallEntry:
2713   case PseudoSourceValue::ExternalSymbolCallEntry:
2714   case PseudoSourceValue::TargetCustom:
2715     return AMDGPUAS::CONSTANT_ADDRESS;
2716   }
2717   return AMDGPUAS::FLAT_ADDRESS;
2718 }
2719 
2720 static void removeModOperands(MachineInstr &MI) {
2721   unsigned Opc = MI.getOpcode();
2722   int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc,
2723                                               AMDGPU::OpName::src0_modifiers);
2724   int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc,
2725                                               AMDGPU::OpName::src1_modifiers);
2726   int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc,
2727                                               AMDGPU::OpName::src2_modifiers);
2728 
2729   MI.RemoveOperand(Src2ModIdx);
2730   MI.RemoveOperand(Src1ModIdx);
2731   MI.RemoveOperand(Src0ModIdx);
2732 }
2733 
// Try to fold the 32-bit immediate materialized into Reg by DefMI directly
// into its single non-debug user UseMI: either rewrite a COPY into a
// move-immediate, or convert a MAD/MAC/FMA/FMAC into the madmk/madak (fmamk/
// fmaak) forms that embed a literal. On success the use is rewritten in place
// and DefMI may be erased; returns true iff the fold happened.
bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
                                Register Reg, MachineRegisterInfo *MRI) const {
  // Folding is only done when Reg has exactly one non-debug use.
  if (!MRI->hasOneNonDBGUse(Reg))
    return false;

  switch (DefMI.getOpcode()) {
  default:
    return false;
  case AMDGPU::S_MOV_B64:
    // TODO: We could fold 64-bit immediates, but this gets complicated
    // when there are sub-registers.
    return false;

  case AMDGPU::V_MOV_B32_e32:
  case AMDGPU::S_MOV_B32:
  case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
    break;
  }

  const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0);
  assert(ImmOp);
  // FIXME: We could handle FrameIndex values here.
  if (!ImmOp->isImm())
    return false;

  unsigned Opc = UseMI.getOpcode();
  if (Opc == AMDGPU::COPY) {
    // Rewrite the COPY into the appropriate move-immediate.
    Register DstReg = UseMI.getOperand(0).getReg();
    bool Is16Bit = getOpSize(UseMI, 0) == 2;
    bool isVGPRCopy = RI.isVGPR(*MRI, DstReg);
    unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
    APInt Imm(32, ImmOp->getImm());

    // Copying the high half of the immediate: shift it down first.
    if (UseMI.getOperand(1).getSubReg() == AMDGPU::hi16)
      Imm = Imm.ashr(16);

    if (RI.isAGPR(*MRI, DstReg)) {
      // AGPR writes only accept inline constants.
      if (!isInlineConstant(Imm))
        return false;
      NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
    }

    if (Is16Bit) {
       if (isVGPRCopy)
         return false; // Do not clobber vgpr_hi16

       if (DstReg.isVirtual() &&
           UseMI.getOperand(0).getSubReg() != AMDGPU::lo16)
         return false;

      // Widen a 16-bit SGPR copy to the full 32-bit register.
      UseMI.getOperand(0).setSubReg(0);
      if (DstReg.isPhysical()) {
        DstReg = RI.get32BitRegister(DstReg);
        UseMI.getOperand(0).setReg(DstReg);
      }
      assert(UseMI.getOperand(1).getReg().isVirtual());
    }

    UseMI.setDesc(get(NewOpc));
    UseMI.getOperand(1).ChangeToImmediate(Imm.getSExtValue());
    UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent());
    return true;
  }

  if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
      Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
      Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
      Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64) {
    // Don't fold if we are using source or output modifiers. The new VOP2
    // instructions don't have them.
    if (hasAnyModifiersSet(UseMI))
      return false;

    // If this is a free constant, there's no reason to do this.
    // TODO: We could fold this here instead of letting SIFoldOperands do it
    // later.
    MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0);

    // Any src operand can be used for the legality check.
    if (isInlineConstant(UseMI, *Src0, *ImmOp))
      return false;

    bool IsF32 = Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
                 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64;
    bool IsFMA = Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
                 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64;
    MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
    MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);

    // Multiplied part is the constant: Use v_madmk_{f16, f32}.
    // We should only expect these to be on src0 due to canonicalizations.
    if (Src0->isReg() && Src0->getReg() == Reg) {
      // madmk requires src1 and src2 to be VGPRs.
      if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
        return false;

      if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
        return false;

      unsigned NewOpc =
        IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32 : AMDGPU::V_FMAMK_F16)
              : (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16);
      if (pseudoToMCOpcode(NewOpc) == -1)
        return false;

      // We need to swap operands 0 and 1 since madmk constant is at operand 1.

      const int64_t Imm = ImmOp->getImm();

      // FIXME: This would be a lot easier if we could return a new instruction
      // instead of having to modify in place.

      // Remove these first since they are at the end.
      UseMI.RemoveOperand(
          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
      UseMI.RemoveOperand(
          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));

      // Move src1's register into the src0 slot...
      Register Src1Reg = Src1->getReg();
      unsigned Src1SubReg = Src1->getSubReg();
      Src0->setReg(Src1Reg);
      Src0->setSubReg(Src1SubReg);
      Src0->setIsKill(Src1->isKill());

      // MAC/FMAC tie src2 to the destination; untie before restructuring.
      if (Opc == AMDGPU::V_MAC_F32_e64 ||
          Opc == AMDGPU::V_MAC_F16_e64 ||
          Opc == AMDGPU::V_FMAC_F32_e64 ||
          Opc == AMDGPU::V_FMAC_F16_e64)
        UseMI.untieRegOperand(
            AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));

      // ...and place the literal where src1 was.
      Src1->ChangeToImmediate(Imm);

      removeModOperands(UseMI);
      UseMI.setDesc(get(NewOpc));

      bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
      if (DeleteDef)
        DefMI.eraseFromParent();

      return true;
    }

    // Added part is the constant: Use v_madak_{f16, f32}.
    if (Src2->isReg() && Src2->getReg() == Reg) {
      // Not allowed to use constant bus for another operand.
      // We can however allow an inline immediate as src0.
      bool Src0Inlined = false;
      if (Src0->isReg()) {
        // Try to inline constant if possible.
        // If the Def moves immediate and the use is single
        // We are saving VGPR here.
        MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
        if (Def && Def->isMoveImmediate() &&
          isInlineConstant(Def->getOperand(1)) &&
          MRI->hasOneUse(Src0->getReg())) {
          Src0->ChangeToImmediate(Def->getOperand(1).getImm());
          Src0Inlined = true;
        } else if ((Src0->getReg().isPhysical() &&
                    (ST.getConstantBusLimit(Opc) <= 1 &&
                     RI.isSGPRClass(RI.getPhysRegClass(Src0->getReg())))) ||
                   (Src0->getReg().isVirtual() &&
                    (ST.getConstantBusLimit(Opc) <= 1 &&
                     RI.isSGPRClass(MRI->getRegClass(Src0->getReg())))))
          return false;
          // VGPR is okay as Src0 - fallthrough
      }

      if (Src1->isReg() && !Src0Inlined ) {
        // We have one slot for inlinable constant so far - try to fill it
        MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
        if (Def && Def->isMoveImmediate() &&
            isInlineConstant(Def->getOperand(1)) &&
            MRI->hasOneUse(Src1->getReg()) &&
            commuteInstruction(UseMI)) {
            Src0->ChangeToImmediate(Def->getOperand(1).getImm());
        } else if ((Src1->getReg().isPhysical() &&
                    RI.isSGPRClass(RI.getPhysRegClass(Src1->getReg()))) ||
                   (Src1->getReg().isVirtual() &&
                    RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))))
          return false;
          // VGPR is okay as Src1 - fallthrough
      }

      unsigned NewOpc =
        IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32 : AMDGPU::V_FMAAK_F16)
              : (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16);
      if (pseudoToMCOpcode(NewOpc) == -1)
        return false;

      const int64_t Imm = ImmOp->getImm();

      // FIXME: This would be a lot easier if we could return a new instruction
      // instead of having to modify in place.

      // Remove these first since they are at the end.
      UseMI.RemoveOperand(
          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
      UseMI.RemoveOperand(
          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));

      // MAC/FMAC tie src2 to the destination; untie before restructuring.
      if (Opc == AMDGPU::V_MAC_F32_e64 ||
          Opc == AMDGPU::V_MAC_F16_e64 ||
          Opc == AMDGPU::V_FMAC_F32_e64 ||
          Opc == AMDGPU::V_FMAC_F16_e64)
        UseMI.untieRegOperand(
            AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));

      // ChangingToImmediate adds Src2 back to the instruction.
      Src2->ChangeToImmediate(Imm);

      // These come before src2.
      removeModOperands(UseMI);
      UseMI.setDesc(get(NewOpc));
      // It might happen that UseMI was commuted
      // and we now have SGPR as SRC1. If so 2 inlined
      // constant and SGPR are illegal.
      legalizeOperands(UseMI);

      bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
      if (DeleteDef)
        DefMI.eraseFromParent();

      return true;
    }
  }

  return false;
}
2962 
2963 static bool
2964 memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1,
2965                            ArrayRef<const MachineOperand *> BaseOps2) {
2966   if (BaseOps1.size() != BaseOps2.size())
2967     return false;
2968   for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
2969     if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
2970       return false;
2971   }
2972   return true;
2973 }
2974 
// Return true if the byte ranges [OffsetA, OffsetA+WidthA) and
// [OffsetB, OffsetB+WidthB) are disjoint: the lower-starting access must end
// at or before the higher one begins. On an offset tie, A is treated as the
// lower access.
static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
                                int WidthB, int OffsetB) {
  if (OffsetB < OffsetA)
    return OffsetB + WidthB <= OffsetA;
  return OffsetA + WidthA <= OffsetB;
}
2982 
2983 bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
2984                                                const MachineInstr &MIb) const {
2985   SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
2986   int64_t Offset0, Offset1;
2987   unsigned Dummy0, Dummy1;
2988   bool Offset0IsScalable, Offset1IsScalable;
2989   if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
2990                                      Dummy0, &RI) ||
2991       !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
2992                                      Dummy1, &RI))
2993     return false;
2994 
2995   if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
2996     return false;
2997 
2998   if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
2999     // FIXME: Handle ds_read2 / ds_write2.
3000     return false;
3001   }
3002   unsigned Width0 = MIa.memoperands().front()->getSize();
3003   unsigned Width1 = MIb.memoperands().front()->getSize();
3004   return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
3005 }
3006 
3007 bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
3008                                                   const MachineInstr &MIb) const {
3009   assert(MIa.mayLoadOrStore() &&
3010          "MIa must load from or modify a memory location");
3011   assert(MIb.mayLoadOrStore() &&
3012          "MIb must load from or modify a memory location");
3013 
3014   if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
3015     return false;
3016 
3017   // XXX - Can we relax this between address spaces?
3018   if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
3019     return false;
3020 
3021   // TODO: Should we check the address space from the MachineMemOperand? That
3022   // would allow us to distinguish objects we know don't alias based on the
3023   // underlying address space, even if it was lowered to a different one,
3024   // e.g. private accesses lowered to use MUBUF instructions on a scratch
3025   // buffer.
3026   if (isDS(MIa)) {
3027     if (isDS(MIb))
3028       return checkInstOffsetsDoNotOverlap(MIa, MIb);
3029 
3030     return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
3031   }
3032 
3033   if (isMUBUF(MIa) || isMTBUF(MIa)) {
3034     if (isMUBUF(MIb) || isMTBUF(MIb))
3035       return checkInstOffsetsDoNotOverlap(MIa, MIb);
3036 
3037     return !isFLAT(MIb) && !isSMRD(MIb);
3038   }
3039 
3040   if (isSMRD(MIa)) {
3041     if (isSMRD(MIb))
3042       return checkInstOffsetsDoNotOverlap(MIa, MIb);
3043 
3044     return !isFLAT(MIb) && !isMUBUF(MIb) && !isMTBUF(MIb);
3045   }
3046 
3047   if (isFLAT(MIa)) {
3048     if (isFLAT(MIb))
3049       return checkInstOffsetsDoNotOverlap(MIa, MIb);
3050 
3051     return false;
3052   }
3053 
3054   return false;
3055 }
3056 
3057 static int64_t getFoldableImm(const MachineOperand* MO) {
3058   if (!MO->isReg())
3059     return false;
3060   const MachineFunction *MF = MO->getParent()->getParent()->getParent();
3061   const MachineRegisterInfo &MRI = MF->getRegInfo();
3062   auto Def = MRI.getUniqueVRegDef(MO->getReg());
3063   if (Def && Def->getOpcode() == AMDGPU::V_MOV_B32_e32 &&
3064       Def->getOperand(1).isImm())
3065     return Def->getOperand(1).getImm();
3066   return AMDGPU::NoRegister;
3067 }
3068 
3069 static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI,
3070                                 MachineInstr &NewMI) {
3071   if (LV) {
3072     unsigned NumOps = MI.getNumOperands();
3073     for (unsigned I = 1; I < NumOps; ++I) {
3074       MachineOperand &Op = MI.getOperand(I);
3075       if (Op.isReg() && Op.isKill())
3076         LV->replaceKillInstruction(Op.getReg(), MI, NewMI);
3077     }
3078   }
3079 }
3080 
// Convert a two-address MAC/FMAC (or their e32 forms) into a three-address
// MAD/FMA, preferring the madak/madmk (fmaak/fmamk) literal forms when one
// source is a foldable immediate. Returns the new instruction, or nullptr if
// no conversion is possible.
MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
                                                 MachineInstr &MI,
                                                 LiveVariables *LV) const {
  unsigned Opc = MI.getOpcode();
  bool IsF16 = false;
  bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
               Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
               Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
  bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;

  switch (Opc) {
  default:
    return nullptr;
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_FMAC_F16_e64:
    IsF16 = true;
    LLVM_FALLTHROUGH;
  case AMDGPU::V_MAC_F32_e64:
  case AMDGPU::V_FMAC_F32_e64:
  case AMDGPU::V_FMAC_F64_e64:
    break;
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_FMAC_F16_e32:
    IsF16 = true;
    LLVM_FALLTHROUGH;
  case AMDGPU::V_MAC_F32_e32:
  case AMDGPU::V_FMAC_F32_e32:
  case AMDGPU::V_FMAC_F64_e32: {
    // e32 forms can't encode an arbitrary literal in src0; only accept a
    // register or an inline constant there.
    int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                             AMDGPU::OpName::src0);
    const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
    if (!Src0->isReg() && !Src0->isImm())
      return nullptr;

    if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
      return nullptr;

    break;
  }
  }

  const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
  const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
  const MachineOperand *Src0Mods =
    getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
  const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
  const MachineOperand *Src1Mods =
    getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
  const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
  const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
  const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
  MachineInstrBuilder MIB;

  // Literal forms (madak/madmk) have no modifier operands and no 64-bit
  // variant, so they are only usable when no modifiers are set.
  if (!Src0Mods && !Src1Mods && !Clamp && !Omod && !IsF64 &&
      // If we have an SGPR input, we will violate the constant bus restriction.
      (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
       !RI.isSGPRReg(MBB->getParent()->getRegInfo(), Src0->getReg()))) {
    // Addend is an immediate: dst = src0 * src1 + K (madak).
    if (auto Imm = getFoldableImm(Src2)) {
      unsigned NewOpc =
          IsFMA ? (IsF16 ? AMDGPU::V_FMAAK_F16 : AMDGPU::V_FMAAK_F32)
                : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32);
      if (pseudoToMCOpcode(NewOpc) != -1) {
        MIB = BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
                  .add(*Dst)
                  .add(*Src0)
                  .add(*Src1)
                  .addImm(Imm);
        updateLiveVariables(LV, MI, *MIB);
        return MIB;
      }
    }
    unsigned NewOpc = IsFMA
                          ? (IsF16 ? AMDGPU::V_FMAMK_F16 : AMDGPU::V_FMAMK_F32)
                          : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
    // Multiplicand src1 is an immediate: dst = src0 * K + src2 (madmk).
    if (auto Imm = getFoldableImm(Src1)) {
      if (pseudoToMCOpcode(NewOpc) != -1) {
        MIB = BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
                  .add(*Dst)
                  .add(*Src0)
                  .addImm(Imm)
                  .add(*Src2);
        updateLiveVariables(LV, MI, *MIB);
        return MIB;
      }
    }
    // Multiplicand src0 is an immediate: commute it into the madmk slot,
    // provided src1 is legal as the new src0.
    if (auto Imm = getFoldableImm(Src0)) {
      if (pseudoToMCOpcode(NewOpc) != -1 &&
          isOperandLegal(
              MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0),
              Src1)) {
        MIB = BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
                  .add(*Dst)
                  .add(*Src1)
                  .addImm(Imm)
                  .add(*Src2);
        updateLiveVariables(LV, MI, *MIB);
        return MIB;
      }
    }
  }

  // General case: full VOP3 MAD/FMA with explicit modifier operands.
  unsigned NewOpc = IsFMA ? (IsF16 ? AMDGPU::V_FMA_F16_e64
                                   : IsF64 ? AMDGPU::V_FMA_F64_e64
                                           : AMDGPU::V_FMA_F32_e64)
                          : (IsF16 ? AMDGPU::V_MAD_F16_e64 : AMDGPU::V_MAD_F32_e64);
  if (pseudoToMCOpcode(NewOpc) == -1)
    return nullptr;

  MIB = BuildMI(*MBB, MI, MI.getDebugLoc(), get(NewOpc))
            .add(*Dst)
            .addImm(Src0Mods ? Src0Mods->getImm() : 0)
            .add(*Src0)
            .addImm(Src1Mods ? Src1Mods->getImm() : 0)
            .add(*Src1)
            .addImm(0) // Src mods
            .add(*Src2)
            .addImm(Clamp ? Clamp->getImm() : 0)
            .addImm(Omod ? Omod->getImm() : 0);
  updateLiveVariables(LV, MI, *MIB);
  return MIB;
}
3202 
3203 // It's not generally safe to move VALU instructions across these since it will
3204 // start using the register as a base index rather than directly.
3205 // XXX - Why isn't hasSideEffects sufficient for these?
3206 static bool changesVGPRIndexingMode(const MachineInstr &MI) {
3207   switch (MI.getOpcode()) {
3208   case AMDGPU::S_SET_GPR_IDX_ON:
3209   case AMDGPU::S_SET_GPR_IDX_MODE:
3210   case AMDGPU::S_SET_GPR_IDX_OFF:
3211     return true;
3212   default:
3213     return false;
3214   }
3215 }
3216 
3217 bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
3218                                        const MachineBasicBlock *MBB,
3219                                        const MachineFunction &MF) const {
3220   // Skipping the check for SP writes in the base implementation. The reason it
3221   // was added was apparently due to compile time concerns.
3222   //
3223   // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
3224   // but is probably avoidable.
3225 
3226   // Copied from base implementation.
3227   // Terminators and labels can't be scheduled around.
3228   if (MI.isTerminator() || MI.isPosition())
3229     return true;
3230 
3231   // INLINEASM_BR can jump to another block
3232   if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
3233     return true;
3234 
3235   // Target-independent instructions do not have an implicit-use of EXEC, even
3236   // when they operate on VGPRs. Treating EXEC modifications as scheduling
3237   // boundaries prevents incorrect movements of such instructions.
3238   return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
3239          MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
3240          MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
3241          changesVGPRIndexingMode(MI);
3242 }
3243 
3244 bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
3245   return Opcode == AMDGPU::DS_ORDERED_COUNT ||
3246          Opcode == AMDGPU::DS_GWS_INIT ||
3247          Opcode == AMDGPU::DS_GWS_SEMA_V ||
3248          Opcode == AMDGPU::DS_GWS_SEMA_BR ||
3249          Opcode == AMDGPU::DS_GWS_SEMA_P ||
3250          Opcode == AMDGPU::DS_GWS_SEMA_RELEASE_ALL ||
3251          Opcode == AMDGPU::DS_GWS_BARRIER;
3252 }
3253 
3254 bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
3255   // Skip the full operand and register alias search modifiesRegister
3256   // does. There's only a handful of instructions that touch this, it's only an
3257   // implicit def, and doesn't alias any other registers.
3258   if (const MCPhysReg *ImpDef = MI.getDesc().getImplicitDefs()) {
3259     for (; ImpDef && *ImpDef; ++ImpDef) {
3260       if (*ImpDef == AMDGPU::MODE)
3261         return true;
3262     }
3263   }
3264 
3265   return false;
3266 }
3267 
3268 bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
3269   unsigned Opcode = MI.getOpcode();
3270 
3271   if (MI.mayStore() && isSMRD(MI))
3272     return true; // scalar store or atomic
3273 
3274   // This will terminate the function when other lanes may need to continue.
3275   if (MI.isReturn())
3276     return true;
3277 
3278   // These instructions cause shader I/O that may cause hardware lockups
3279   // when executed with an empty EXEC mask.
3280   //
3281   // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
3282   //       EXEC = 0, but checking for that case here seems not worth it
3283   //       given the typical code patterns.
3284   if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
3285       isEXP(Opcode) ||
3286       Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::S_TRAP ||
3287       Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_BARRIER)
3288     return true;
3289 
3290   if (MI.isCall() || MI.isInlineAsm())
3291     return true; // conservative assumption
3292 
3293   // A mode change is a scalar operation that influences vector instructions.
3294   if (modifiesModeRegister(MI))
3295     return true;
3296 
3297   // These are like SALU instructions in terms of effects, so it's questionable
3298   // whether we should return true for those.
3299   //
3300   // However, executing them with EXEC = 0 causes them to operate on undefined
3301   // data, which we avoid by returning true here.
3302   if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
3303       Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32)
3304     return true;
3305 
3306   return false;
3307 }
3308 
3309 bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI,
3310                               const MachineInstr &MI) const {
3311   if (MI.isMetaInstruction())
3312     return false;
3313 
3314   // This won't read exec if this is an SGPR->SGPR copy.
3315   if (MI.isCopyLike()) {
3316     if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
3317       return true;
3318 
3319     // Make sure this isn't copying exec as a normal operand
3320     return MI.readsRegister(AMDGPU::EXEC, &RI);
3321   }
3322 
3323   // Make a conservative assumption about the callee.
3324   if (MI.isCall())
3325     return true;
3326 
3327   // Be conservative with any unhandled generic opcodes.
3328   if (!isTargetSpecificOpcode(MI.getOpcode()))
3329     return true;
3330 
3331   return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
3332 }
3333 
3334 bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
3335   switch (Imm.getBitWidth()) {
3336   case 1: // This likely will be a condition code mask.
3337     return true;
3338 
3339   case 32:
3340     return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
3341                                         ST.hasInv2PiInlineImm());
3342   case 64:
3343     return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
3344                                         ST.hasInv2PiInlineImm());
3345   case 16:
3346     return ST.has16BitInsts() &&
3347            AMDGPU::isInlinableLiteral16(Imm.getSExtValue(),
3348                                         ST.hasInv2PiInlineImm());
3349   default:
3350     llvm_unreachable("invalid bitwidth");
3351   }
3352 }
3353 
// Returns true if the immediate operand \p MO can be encoded as an inline
// constant when used in an operand slot of type \p OperandType, i.e. it needs
// neither a literal encoding nor the constant bus.
bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
                                   uint8_t OperandType) const {
  // Only immediates whose operand slot is in the SRC operand-type range can
  // ever be inline constants.
  if (!MO.isImm() ||
      OperandType < AMDGPU::OPERAND_SRC_FIRST ||
      OperandType > AMDGPU::OPERAND_SRC_LAST)
    return false;

  // MachineOperand provides no way to tell the true operand size, since it only
  // records a 64-bit value. We need to know the size to determine if a 32-bit
  // floating point immediate bit pattern is legal for an integer immediate. It
  // would be for any 32-bit integer operand, but would not be for a 64-bit one.

  int64_t Imm = MO.getImm();
  switch (OperandType) {
  case AMDGPU::OPERAND_REG_IMM_INT32:
  case AMDGPU::OPERAND_REG_IMM_FP32:
  case AMDGPU::OPERAND_REG_INLINE_C_INT32:
  case AMDGPU::OPERAND_REG_INLINE_C_FP32:
  case AMDGPU::OPERAND_REG_IMM_V2FP32:
  case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
  case AMDGPU::OPERAND_REG_IMM_V2INT32:
  case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
  case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
  case AMDGPU::OPERAND_REG_INLINE_AC_FP32: {
    // 32-bit (and packed 32-bit) operands: check the truncated value.
    int32_t Trunc = static_cast<int32_t>(Imm);
    return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
  }
  case AMDGPU::OPERAND_REG_IMM_INT64:
  case AMDGPU::OPERAND_REG_IMM_FP64:
  case AMDGPU::OPERAND_REG_INLINE_C_INT64:
  case AMDGPU::OPERAND_REG_INLINE_C_FP64:
  case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
    return AMDGPU::isInlinableLiteral64(MO.getImm(),
                                        ST.hasInv2PiInlineImm());
  case AMDGPU::OPERAND_REG_IMM_INT16:
  case AMDGPU::OPERAND_REG_INLINE_C_INT16:
  case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
    // We would expect inline immediates to not be concerned with an integer/fp
    // distinction. However, in the case of 16-bit integer operations, the
    // "floating point" values appear to not work. It seems to read the low
    // 16-bits of 32-bit immediates, which happens to always work for the
    // integer values.
    //
    // See llvm bugzilla 46302.
    //
    // TODO: Theoretically we could use op-sel to use the high bits of the
    // 32-bit FP values.
    return AMDGPU::isInlinableIntLiteral(Imm);
  case AMDGPU::OPERAND_REG_IMM_V2INT16:
  case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
  case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
    // This suffers the same problem as the scalar 16-bit cases.
    return AMDGPU::isInlinableIntLiteralV216(Imm);
  case AMDGPU::OPERAND_REG_IMM_FP16:
  case AMDGPU::OPERAND_REG_INLINE_C_FP16:
  case AMDGPU::OPERAND_REG_INLINE_AC_FP16: {
    if (isInt<16>(Imm) || isUInt<16>(Imm)) {
      // A few special case instructions have 16-bit operands on subtargets
      // where 16-bit instructions are not legal.
      // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
      // constants in these cases
      int16_t Trunc = static_cast<int16_t>(Imm);
      return ST.has16BitInsts() &&
             AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm());
    }

    // Does not fit in 16 bits at all, so it cannot be a 16-bit inline value.
    return false;
  }
  case AMDGPU::OPERAND_REG_IMM_V2FP16:
  case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
  case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: {
    // Packed FP16: validate the full 32-bit pattern.
    uint32_t Trunc = static_cast<uint32_t>(Imm);
    return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm());
  }
  default:
    llvm_unreachable("invalid bitwidth");
  }
}
3432 
3433 bool SIInstrInfo::isLiteralConstantLike(const MachineOperand &MO,
3434                                         const MCOperandInfo &OpInfo) const {
3435   switch (MO.getType()) {
3436   case MachineOperand::MO_Register:
3437     return false;
3438   case MachineOperand::MO_Immediate:
3439     return !isInlineConstant(MO, OpInfo);
3440   case MachineOperand::MO_FrameIndex:
3441   case MachineOperand::MO_MachineBasicBlock:
3442   case MachineOperand::MO_ExternalSymbol:
3443   case MachineOperand::MO_GlobalAddress:
3444   case MachineOperand::MO_MCSymbol:
3445     return true;
3446   default:
3447     llvm_unreachable("unexpected operand type");
3448   }
3449 }
3450 
3451 static bool compareMachineOp(const MachineOperand &Op0,
3452                              const MachineOperand &Op1) {
3453   if (Op0.getType() != Op1.getType())
3454     return false;
3455 
3456   switch (Op0.getType()) {
3457   case MachineOperand::MO_Register:
3458     return Op0.getReg() == Op1.getReg();
3459   case MachineOperand::MO_Immediate:
3460     return Op0.getImm() == Op1.getImm();
3461   default:
3462     llvm_unreachable("Didn't expect to be comparing these operand types");
3463   }
3464 }
3465 
3466 bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
3467                                     const MachineOperand &MO) const {
3468   const MCInstrDesc &InstDesc = MI.getDesc();
3469   const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpNo];
3470 
3471   assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
3472 
3473   if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
3474     return true;
3475 
3476   if (OpInfo.RegClass < 0)
3477     return false;
3478 
3479   if (MO.isImm() && isInlineConstant(MO, OpInfo)) {
3480     if (isMAI(MI) && ST.hasMFMAInlineLiteralBug() &&
3481         OpNo ==(unsigned)AMDGPU::getNamedOperandIdx(MI.getOpcode(),
3482                                                     AMDGPU::OpName::src2))
3483       return false;
3484     return RI.opCanUseInlineConstant(OpInfo.OperandType);
3485   }
3486 
3487   if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
3488     return false;
3489 
3490   if (!isVOP3(MI) || !AMDGPU::isSISrcOperand(InstDesc, OpNo))
3491     return true;
3492 
3493   return ST.hasVOP3Literal();
3494 }
3495 
3496 bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
3497   // GFX90A does not have V_MUL_LEGACY_F32_e32.
3498   if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
3499     return false;
3500 
3501   int Op32 = AMDGPU::getVOPe32(Opcode);
3502   if (Op32 == -1)
3503     return false;
3504 
3505   return pseudoToMCOpcode(Op32) != -1;
3506 }
3507 
3508 bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
3509   // The src0_modifier operand is present on all instructions
3510   // that have modifiers.
3511 
3512   return AMDGPU::getNamedOperandIdx(Opcode,
3513                                     AMDGPU::OpName::src0_modifiers) != -1;
3514 }
3515 
3516 bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
3517                                   unsigned OpName) const {
3518   const MachineOperand *Mods = getNamedOperand(MI, OpName);
3519   return Mods && Mods->getImm();
3520 }
3521 
3522 bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
3523   return hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
3524          hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
3525          hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers) ||
3526          hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
3527          hasModifiersSet(MI, AMDGPU::OpName::omod);
3528 }
3529 
// Returns true if \p MI (a 64-bit encoded VALU instruction) can be rewritten
// with its 32-bit (e32) encoding. Checks opcode-specific restrictions on the
// third source operand, then the generic restrictions on src0/src1 and the
// output modifiers.
bool SIInstrInfo::canShrink(const MachineInstr &MI,
                            const MachineRegisterInfo &MRI) const {
  const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
  // Can't shrink instruction with three operands.
  // FIXME: v_cndmask_b32 has 3 operands and is shrinkable, but we need to add
  // a special case for it.  It can only be shrunk if the third operand
  // is vcc, and src0_modifiers and src1_modifiers are not set.
  // We should handle this the same way we handle vopc, by adding
  // a register allocation hint pre-regalloc and then do the shrinking
  // post-regalloc.
  if (Src2) {
    switch (MI.getOpcode()) {
      default: return false;

      case AMDGPU::V_ADDC_U32_e64:
      case AMDGPU::V_SUBB_U32_e64:
      case AMDGPU::V_SUBBREV_U32_e64: {
        // For these carry ops, src1 must be a VGPR for the shrunk form.
        const MachineOperand *Src1
          = getNamedOperand(MI, AMDGPU::OpName::src1);
        if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
          return false;
        // Additional verification is needed for sdst/src2.
        return true;
      }
      case AMDGPU::V_MAC_F32_e64:
      case AMDGPU::V_MAC_F16_e64:
      case AMDGPU::V_FMAC_F32_e64:
      case AMDGPU::V_FMAC_F16_e64:
      case AMDGPU::V_FMAC_F64_e64:
        // MAC/FMAC: src2 must be an unmodified VGPR; fall through to the
        // common src1/src0 checks below.
        if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
            hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
          return false;
        break;

      case AMDGPU::V_CNDMASK_B32_e64:
        // Allowed through to the common checks below.
        break;
    }
  }

  // src1 (when present) must be a VGPR without source modifiers.
  const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
  if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
               hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
    return false;

  // We don't need to check src0, all input types are legal, so just make sure
  // src0 isn't using any modifiers.
  if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
    return false;

  // Can it be shrunk to a valid 32 bit opcode?
  if (!hasVALU32BitEncoding(MI.getOpcode()))
    return false;

  // Check output modifiers; neither omod nor clamp may be set.
  return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
         !hasModifiersSet(MI, AMDGPU::OpName::clamp);
}
3587 
3588 // Set VCC operand with all flags from \p Orig, except for setting it as
3589 // implicit.
3590 static void copyFlagsToImplicitVCC(MachineInstr &MI,
3591                                    const MachineOperand &Orig) {
3592 
3593   for (MachineOperand &Use : MI.implicit_operands()) {
3594     if (Use.isUse() &&
3595         (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
3596       Use.setIsUndef(Orig.isUndef());
3597       Use.setIsKill(Orig.isKill());
3598       return;
3599     }
3600   }
3601 }
3602 
3603 MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
3604                                            unsigned Op32) const {
3605   MachineBasicBlock *MBB = MI.getParent();;
3606   MachineInstrBuilder Inst32 =
3607     BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32))
3608     .setMIFlags(MI.getFlags());
3609 
3610   // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
3611   // For VOPC instructions, this is replaced by an implicit def of vcc.
3612   int Op32DstIdx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::vdst);
3613   if (Op32DstIdx != -1) {
3614     // dst
3615     Inst32.add(MI.getOperand(0));
3616   } else {
3617     assert(((MI.getOperand(0).getReg() == AMDGPU::VCC) ||
3618             (MI.getOperand(0).getReg() == AMDGPU::VCC_LO)) &&
3619            "Unexpected case");
3620   }
3621 
3622   Inst32.add(*getNamedOperand(MI, AMDGPU::OpName::src0));
3623 
3624   const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
3625   if (Src1)
3626     Inst32.add(*Src1);
3627 
3628   const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
3629 
3630   if (Src2) {
3631     int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2);
3632     if (Op32Src2Idx != -1) {
3633       Inst32.add(*Src2);
3634     } else {
3635       // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
3636       // replaced with an implicit read of vcc or vcc_lo. The implicit read
3637       // of vcc was already added during the initial BuildMI, but we
3638       // 1) may need to change vcc to vcc_lo to preserve the original register
3639       // 2) have to preserve the original flags.
3640       fixImplicitOperands(*Inst32);
3641       copyFlagsToImplicitVCC(*Inst32, *Src2);
3642     }
3643   }
3644 
3645   return Inst32;
3646 }
3647 
3648 bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
3649                                   const MachineOperand &MO,
3650                                   const MCOperandInfo &OpInfo) const {
3651   // Literal constants use the constant bus.
3652   //if (isLiteralConstantLike(MO, OpInfo))
3653   // return true;
3654   if (MO.isImm())
3655     return !isInlineConstant(MO, OpInfo);
3656 
3657   if (!MO.isReg())
3658     return true; // Misc other operands like FrameIndex
3659 
3660   if (!MO.isUse())
3661     return false;
3662 
3663   if (MO.getReg().isVirtual())
3664     return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));
3665 
3666   // Null is free
3667   if (MO.getReg() == AMDGPU::SGPR_NULL)
3668     return false;
3669 
3670   // SGPRs use the constant bus
3671   if (MO.isImplicit()) {
3672     return MO.getReg() == AMDGPU::M0 ||
3673            MO.getReg() == AMDGPU::VCC ||
3674            MO.getReg() == AMDGPU::VCC_LO;
3675   } else {
3676     return AMDGPU::SReg_32RegClass.contains(MO.getReg()) ||
3677            AMDGPU::SReg_64RegClass.contains(MO.getReg());
3678   }
3679 }
3680 
3681 static Register findImplicitSGPRRead(const MachineInstr &MI) {
3682   for (const MachineOperand &MO : MI.implicit_operands()) {
3683     // We only care about reads.
3684     if (MO.isDef())
3685       continue;
3686 
3687     switch (MO.getReg()) {
3688     case AMDGPU::VCC:
3689     case AMDGPU::VCC_LO:
3690     case AMDGPU::VCC_HI:
3691     case AMDGPU::M0:
3692     case AMDGPU::FLAT_SCR:
3693       return MO.getReg();
3694 
3695     default:
3696       break;
3697     }
3698   }
3699 
3700   return AMDGPU::NoRegister;
3701 }
3702 
3703 static bool shouldReadExec(const MachineInstr &MI) {
3704   if (SIInstrInfo::isVALU(MI)) {
3705     switch (MI.getOpcode()) {
3706     case AMDGPU::V_READLANE_B32:
3707     case AMDGPU::V_WRITELANE_B32:
3708       return false;
3709     }
3710 
3711     return true;
3712   }
3713 
3714   if (MI.isPreISelOpcode() ||
3715       SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
3716       SIInstrInfo::isSALU(MI) ||
3717       SIInstrInfo::isSMRD(MI))
3718     return false;
3719 
3720   return true;
3721 }
3722 
3723 static bool isSubRegOf(const SIRegisterInfo &TRI,
3724                        const MachineOperand &SuperVec,
3725                        const MachineOperand &SubReg) {
3726   if (SubReg.getReg().isPhysical())
3727     return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
3728 
3729   return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
3730          SubReg.getReg() == SuperVec.getReg();
3731 }
3732 
3733 bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
3734                                     StringRef &ErrInfo) const {
3735   uint16_t Opcode = MI.getOpcode();
3736   if (SIInstrInfo::isGenericOpcode(MI.getOpcode()))
3737     return true;
3738 
3739   const MachineFunction *MF = MI.getParent()->getParent();
3740   const MachineRegisterInfo &MRI = MF->getRegInfo();
3741 
3742   int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
3743   int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
3744   int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
3745 
3746   // Make sure the number of operands is correct.
3747   const MCInstrDesc &Desc = get(Opcode);
3748   if (!Desc.isVariadic() &&
3749       Desc.getNumOperands() != MI.getNumExplicitOperands()) {
3750     ErrInfo = "Instruction has wrong number of operands.";
3751     return false;
3752   }
3753 
3754   if (MI.isInlineAsm()) {
3755     // Verify register classes for inlineasm constraints.
3756     for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
3757          I != E; ++I) {
3758       const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
3759       if (!RC)
3760         continue;
3761 
3762       const MachineOperand &Op = MI.getOperand(I);
3763       if (!Op.isReg())
3764         continue;
3765 
3766       Register Reg = Op.getReg();
3767       if (!Reg.isVirtual() && !RC->contains(Reg)) {
3768         ErrInfo = "inlineasm operand has incorrect register class.";
3769         return false;
3770       }
3771     }
3772 
3773     return true;
3774   }
3775 
3776   if (isMIMG(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
3777     ErrInfo = "missing memory operand from MIMG instruction.";
3778     return false;
3779   }
3780 
3781   // Make sure the register classes are correct.
3782   for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
3783     const MachineOperand &MO = MI.getOperand(i);
3784     if (MO.isFPImm()) {
3785       ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
3786                 "all fp values to integers.";
3787       return false;
3788     }
3789 
3790     int RegClass = Desc.OpInfo[i].RegClass;
3791 
3792     switch (Desc.OpInfo[i].OperandType) {
3793     case MCOI::OPERAND_REGISTER:
3794       if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
3795         ErrInfo = "Illegal immediate value for operand.";
3796         return false;
3797       }
3798       break;
3799     case AMDGPU::OPERAND_REG_IMM_INT32:
3800     case AMDGPU::OPERAND_REG_IMM_FP32:
3801       break;
3802     case AMDGPU::OPERAND_REG_INLINE_C_INT32:
3803     case AMDGPU::OPERAND_REG_INLINE_C_FP32:
3804     case AMDGPU::OPERAND_REG_INLINE_C_INT64:
3805     case AMDGPU::OPERAND_REG_INLINE_C_FP64:
3806     case AMDGPU::OPERAND_REG_INLINE_C_INT16:
3807     case AMDGPU::OPERAND_REG_INLINE_C_FP16:
3808     case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
3809     case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
3810     case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
3811     case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
3812     case AMDGPU::OPERAND_REG_INLINE_AC_FP64: {
3813       if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
3814         ErrInfo = "Illegal immediate value for operand.";
3815         return false;
3816       }
3817       break;
3818     }
3819     case MCOI::OPERAND_IMMEDIATE:
3820     case AMDGPU::OPERAND_KIMM32:
3821       // Check if this operand is an immediate.
3822       // FrameIndex operands will be replaced by immediates, so they are
3823       // allowed.
3824       if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
3825         ErrInfo = "Expected immediate, but got non-immediate";
3826         return false;
3827       }
3828       LLVM_FALLTHROUGH;
3829     default:
3830       continue;
3831     }
3832 
3833     if (!MO.isReg())
3834       continue;
3835     Register Reg = MO.getReg();
3836     if (!Reg)
3837       continue;
3838 
3839     // FIXME: Ideally we would have separate instruction definitions with the
3840     // aligned register constraint.
3841     // FIXME: We do not verify inline asm operands, but custom inline asm
3842     // verification is broken anyway
3843     if (ST.needsAlignedVGPRs()) {
3844       const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
3845       const bool IsVGPR = RI.hasVGPRs(RC);
3846       const bool IsAGPR = !IsVGPR && RI.hasAGPRs(RC);
3847       if ((IsVGPR || IsAGPR) && MO.getSubReg()) {
3848         const TargetRegisterClass *SubRC =
3849             RI.getSubRegClass(RC, MO.getSubReg());
3850         RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg());
3851         if (RC)
3852           RC = SubRC;
3853       }
3854 
3855       // Check that this is the aligned version of the class.
3856       if (!RC || !RI.isProperlyAlignedRC(*RC)) {
3857         ErrInfo = "Subtarget requires even aligned vector registers";
3858         return false;
3859       }
3860     }
3861 
3862     if (RegClass != -1) {
3863       if (Reg.isVirtual())
3864         continue;
3865 
3866       const TargetRegisterClass *RC = RI.getRegClass(RegClass);
3867       if (!RC->contains(Reg)) {
3868         ErrInfo = "Operand has incorrect register class.";
3869         return false;
3870       }
3871     }
3872   }
3873 
3874   // Verify SDWA
3875   if (isSDWA(MI)) {
3876     if (!ST.hasSDWA()) {
3877       ErrInfo = "SDWA is not supported on this target";
3878       return false;
3879     }
3880 
3881     int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
3882 
3883     const int OpIndicies[] = { DstIdx, Src0Idx, Src1Idx, Src2Idx };
3884 
3885     for (int OpIdx: OpIndicies) {
3886       if (OpIdx == -1)
3887         continue;
3888       const MachineOperand &MO = MI.getOperand(OpIdx);
3889 
3890       if (!ST.hasSDWAScalar()) {
3891         // Only VGPRS on VI
3892         if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
3893           ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
3894           return false;
3895         }
3896       } else {
3897         // No immediates on GFX9
3898         if (!MO.isReg()) {
3899           ErrInfo =
3900             "Only reg allowed as operands in SDWA instructions on GFX9+";
3901           return false;
3902         }
3903       }
3904     }
3905 
3906     if (!ST.hasSDWAOmod()) {
3907       // No omod allowed on VI
3908       const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
3909       if (OMod != nullptr &&
3910         (!OMod->isImm() || OMod->getImm() != 0)) {
3911         ErrInfo = "OMod not allowed in SDWA instructions on VI";
3912         return false;
3913       }
3914     }
3915 
3916     uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
3917     if (isVOPC(BasicOpcode)) {
3918       if (!ST.hasSDWASdst() && DstIdx != -1) {
3919         // Only vcc allowed as dst on VI for VOPC
3920         const MachineOperand &Dst = MI.getOperand(DstIdx);
3921         if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
3922           ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
3923           return false;
3924         }
3925       } else if (!ST.hasSDWAOutModsVOPC()) {
3926         // No clamp allowed on GFX9 for VOPC
3927         const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
3928         if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
3929           ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
3930           return false;
3931         }
3932 
3933         // No omod allowed on GFX9 for VOPC
3934         const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
3935         if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
3936           ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
3937           return false;
3938         }
3939       }
3940     }
3941 
3942     const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
3943     if (DstUnused && DstUnused->isImm() &&
3944         DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
3945       const MachineOperand &Dst = MI.getOperand(DstIdx);
3946       if (!Dst.isReg() || !Dst.isTied()) {
3947         ErrInfo = "Dst register should have tied register";
3948         return false;
3949       }
3950 
3951       const MachineOperand &TiedMO =
3952           MI.getOperand(MI.findTiedOperandIdx(DstIdx));
3953       if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
3954         ErrInfo =
3955             "Dst register should be tied to implicit use of preserved register";
3956         return false;
3957       } else if (TiedMO.getReg().isPhysical() &&
3958                  Dst.getReg() != TiedMO.getReg()) {
3959         ErrInfo = "Dst register should use same physical register as preserved";
3960         return false;
3961       }
3962     }
3963   }
3964 
3965   // Verify MIMG
3966   if (isMIMG(MI.getOpcode()) && !MI.mayStore()) {
3967     // Ensure that the return type used is large enough for all the options
3968     // being used TFE/LWE require an extra result register.
3969     const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
3970     if (DMask) {
3971       uint64_t DMaskImm = DMask->getImm();
3972       uint32_t RegCount =
3973           isGather4(MI.getOpcode()) ? 4 : countPopulation(DMaskImm);
3974       const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
3975       const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
3976       const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
3977 
3978       // Adjust for packed 16 bit values
3979       if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
3980         RegCount >>= 1;
3981 
3982       // Adjust if using LWE or TFE
3983       if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
3984         RegCount += 1;
3985 
3986       const uint32_t DstIdx =
3987           AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
3988       const MachineOperand &Dst = MI.getOperand(DstIdx);
3989       if (Dst.isReg()) {
3990         const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
3991         uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
3992         if (RegCount > DstSize) {
3993           ErrInfo = "MIMG instruction returns too many registers for dst "
3994                     "register class";
3995           return false;
3996         }
3997       }
3998     }
3999   }
4000 
4001   // Verify VOP*. Ignore multiple sgpr operands on writelane.
4002   if (Desc.getOpcode() != AMDGPU::V_WRITELANE_B32
4003       && (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI))) {
4004     // Only look at the true operands. Only a real operand can use the constant
4005     // bus, and we don't want to check pseudo-operands like the source modifier
4006     // flags.
4007     const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };
4008 
4009     unsigned ConstantBusCount = 0;
4010     bool UsesLiteral = false;
4011     const MachineOperand *LiteralVal = nullptr;
4012 
4013     if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1)
4014       ++ConstantBusCount;
4015 
4016     SmallVector<Register, 2> SGPRsUsed;
4017     Register SGPRUsed;
4018 
4019     for (int OpIdx : OpIndices) {
4020       if (OpIdx == -1)
4021         break;
4022       const MachineOperand &MO = MI.getOperand(OpIdx);
4023       if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) {
4024         if (MO.isReg()) {
4025           SGPRUsed = MO.getReg();
4026           if (llvm::all_of(SGPRsUsed, [SGPRUsed](unsigned SGPR) {
4027                 return SGPRUsed != SGPR;
4028               })) {
4029             ++ConstantBusCount;
4030             SGPRsUsed.push_back(SGPRUsed);
4031           }
4032         } else {
4033           if (!UsesLiteral) {
4034             ++ConstantBusCount;
4035             UsesLiteral = true;
4036             LiteralVal = &MO;
4037           } else if (!MO.isIdenticalTo(*LiteralVal)) {
4038             assert(isVOP3(MI));
4039             ErrInfo = "VOP3 instruction uses more than one literal";
4040             return false;
4041           }
4042         }
4043       }
4044     }
4045 
4046     SGPRUsed = findImplicitSGPRRead(MI);
4047     if (SGPRUsed != AMDGPU::NoRegister) {
4048       // Implicit uses may safely overlap true overands
4049       if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
4050             return !RI.regsOverlap(SGPRUsed, SGPR);
4051           })) {
4052         ++ConstantBusCount;
4053         SGPRsUsed.push_back(SGPRUsed);
4054       }
4055     }
4056 
4057     // v_writelane_b32 is an exception from constant bus restriction:
4058     // vsrc0 can be sgpr, const or m0 and lane select sgpr, m0 or inline-const
4059     if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
4060         Opcode != AMDGPU::V_WRITELANE_B32) {
4061       ErrInfo = "VOP* instruction violates constant bus restriction";
4062       return false;
4063     }
4064 
4065     if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
4066       ErrInfo = "VOP3 instruction uses literal";
4067       return false;
4068     }
4069   }
4070 
4071   // Special case for writelane - this can break the multiple constant bus rule,
4072   // but still can't use more than one SGPR register
4073   if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
4074     unsigned SGPRCount = 0;
4075     Register SGPRUsed = AMDGPU::NoRegister;
4076 
4077     for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx}) {
4078       if (OpIdx == -1)
4079         break;
4080 
4081       const MachineOperand &MO = MI.getOperand(OpIdx);
4082 
4083       if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) {
4084         if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
4085           if (MO.getReg() != SGPRUsed)
4086             ++SGPRCount;
4087           SGPRUsed = MO.getReg();
4088         }
4089       }
4090       if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
4091         ErrInfo = "WRITELANE instruction violates constant bus restriction";
4092         return false;
4093       }
4094     }
4095   }
4096 
4097   // Verify misc. restrictions on specific instructions.
4098   if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||
4099       Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {
4100     const MachineOperand &Src0 = MI.getOperand(Src0Idx);
4101     const MachineOperand &Src1 = MI.getOperand(Src1Idx);
4102     const MachineOperand &Src2 = MI.getOperand(Src2Idx);
4103     if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
4104       if (!compareMachineOp(Src0, Src1) &&
4105           !compareMachineOp(Src0, Src2)) {
4106         ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
4107         return false;
4108       }
4109     }
4110     if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
4111          SISrcMods::ABS) ||
4112         (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() &
4113          SISrcMods::ABS) ||
4114         (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
4115          SISrcMods::ABS)) {
4116       ErrInfo = "ABS not allowed in VOP3B instructions";
4117       return false;
4118     }
4119   }
4120 
4121   if (isSOP2(MI) || isSOPC(MI)) {
4122     const MachineOperand &Src0 = MI.getOperand(Src0Idx);
4123     const MachineOperand &Src1 = MI.getOperand(Src1Idx);
4124     unsigned Immediates = 0;
4125 
4126     if (!Src0.isReg() &&
4127         !isInlineConstant(Src0, Desc.OpInfo[Src0Idx].OperandType))
4128       Immediates++;
4129     if (!Src1.isReg() &&
4130         !isInlineConstant(Src1, Desc.OpInfo[Src1Idx].OperandType))
4131       Immediates++;
4132 
4133     if (Immediates > 1) {
4134       ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
4135       return false;
4136     }
4137   }
4138 
4139   if (isSOPK(MI)) {
4140     auto Op = getNamedOperand(MI, AMDGPU::OpName::simm16);
4141     if (Desc.isBranch()) {
4142       if (!Op->isMBB()) {
4143         ErrInfo = "invalid branch target for SOPK instruction";
4144         return false;
4145       }
4146     } else {
4147       uint64_t Imm = Op->getImm();
4148       if (sopkIsZext(MI)) {
4149         if (!isUInt<16>(Imm)) {
4150           ErrInfo = "invalid immediate for SOPK instruction";
4151           return false;
4152         }
4153       } else {
4154         if (!isInt<16>(Imm)) {
4155           ErrInfo = "invalid immediate for SOPK instruction";
4156           return false;
4157         }
4158       }
4159     }
4160   }
4161 
4162   if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
4163       Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
4164       Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
4165       Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
4166     const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
4167                        Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
4168 
4169     const unsigned StaticNumOps = Desc.getNumOperands() +
4170       Desc.getNumImplicitUses();
4171     const unsigned NumImplicitOps = IsDst ? 2 : 1;
4172 
4173     // Allow additional implicit operands. This allows a fixup done by the post
4174     // RA scheduler where the main implicit operand is killed and implicit-defs
4175     // are added for sub-registers that remain live after this instruction.
4176     if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
4177       ErrInfo = "missing implicit register operands";
4178       return false;
4179     }
4180 
4181     const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
4182     if (IsDst) {
4183       if (!Dst->isUse()) {
4184         ErrInfo = "v_movreld_b32 vdst should be a use operand";
4185         return false;
4186       }
4187 
4188       unsigned UseOpIdx;
4189       if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
4190           UseOpIdx != StaticNumOps + 1) {
4191         ErrInfo = "movrel implicit operands should be tied";
4192         return false;
4193       }
4194     }
4195 
4196     const MachineOperand &Src0 = MI.getOperand(Src0Idx);
4197     const MachineOperand &ImpUse
4198       = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
4199     if (!ImpUse.isReg() || !ImpUse.isUse() ||
4200         !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
4201       ErrInfo = "src0 should be subreg of implicit vector use";
4202       return false;
4203     }
4204   }
4205 
4206   // Make sure we aren't losing exec uses in the td files. This mostly requires
4207   // being careful when using let Uses to try to add other use registers.
4208   if (shouldReadExec(MI)) {
4209     if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
4210       ErrInfo = "VALU instruction does not implicitly read exec mask";
4211       return false;
4212     }
4213   }
4214 
4215   if (isSMRD(MI)) {
4216     if (MI.mayStore()) {
4217       // The register offset form of scalar stores may only use m0 as the
4218       // soffset register.
4219       const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soff);
4220       if (Soff && Soff->getReg() != AMDGPU::M0) {
4221         ErrInfo = "scalar stores must use m0 as offset register";
4222         return false;
4223       }
4224     }
4225   }
4226 
4227   if (isFLAT(MI) && !ST.hasFlatInstOffsets()) {
4228     const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
4229     if (Offset->getImm() != 0) {
4230       ErrInfo = "subtarget does not support offsets in flat instructions";
4231       return false;
4232     }
4233   }
4234 
4235   if (isMIMG(MI)) {
4236     const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
4237     if (DimOp) {
4238       int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
4239                                                  AMDGPU::OpName::vaddr0);
4240       int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
4241       const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode);
4242       const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4243           AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
4244       const AMDGPU::MIMGDimInfo *Dim =
4245           AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm());
4246 
4247       if (!Dim) {
4248         ErrInfo = "dim is out of range";
4249         return false;
4250       }
4251 
4252       bool IsA16 = false;
4253       if (ST.hasR128A16()) {
4254         const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128);
4255         IsA16 = R128A16->getImm() != 0;
4256       } else if (ST.hasGFX10A16()) {
4257         const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16);
4258         IsA16 = A16->getImm() != 0;
4259       }
4260 
4261       bool IsNSA = SRsrcIdx - VAddr0Idx > 1;
4262 
4263       unsigned AddrWords =
4264           AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16());
4265 
4266       unsigned VAddrWords;
4267       if (IsNSA) {
4268         VAddrWords = SRsrcIdx - VAddr0Idx;
4269       } else {
4270         const TargetRegisterClass *RC = getOpRegClass(MI, VAddr0Idx);
4271         VAddrWords = MRI.getTargetRegisterInfo()->getRegSizeInBits(*RC) / 32;
4272         if (AddrWords > 8)
4273           AddrWords = 16;
4274         else if (AddrWords > 5)
4275           AddrWords = 8;
4276       }
4277 
4278       if (VAddrWords != AddrWords) {
4279         LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
4280                           << " but got " << VAddrWords << "\n");
4281         ErrInfo = "bad vaddr size";
4282         return false;
4283       }
4284     }
4285   }
4286 
4287   const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
4288   if (DppCt) {
4289     using namespace AMDGPU::DPP;
4290 
4291     unsigned DC = DppCt->getImm();
4292     if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
4293         DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
4294         (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
4295         (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
4296         (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
4297         (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
4298         (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
4299       ErrInfo = "Invalid dpp_ctrl value";
4300       return false;
4301     }
4302     if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
4303         ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
4304       ErrInfo = "Invalid dpp_ctrl value: "
4305                 "wavefront shifts are not supported on GFX10+";
4306       return false;
4307     }
4308     if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
4309         ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
4310       ErrInfo = "Invalid dpp_ctrl value: "
4311                 "broadcasts are not supported on GFX10+";
4312       return false;
4313     }
4314     if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
4315         ST.getGeneration() < AMDGPUSubtarget::GFX10) {
4316       if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&
4317           DC <= DppCtrl::ROW_NEWBCAST_LAST &&
4318           !ST.hasGFX90AInsts()) {
4319         ErrInfo = "Invalid dpp_ctrl value: "
4320                   "row_newbroadcast/row_share is not supported before "
4321                   "GFX90A/GFX10";
4322         return false;
4323       } else if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
4324         ErrInfo = "Invalid dpp_ctrl value: "
4325                   "row_share and row_xmask are not supported before GFX10";
4326         return false;
4327       }
4328     }
4329 
4330     int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
4331     int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
4332 
4333     if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
4334         ((DstIdx >= 0 &&
4335           (Desc.OpInfo[DstIdx].RegClass == AMDGPU::VReg_64RegClassID ||
4336            Desc.OpInfo[DstIdx].RegClass == AMDGPU::VReg_64_Align2RegClassID)) ||
4337          ((Src0Idx >= 0 &&
4338            (Desc.OpInfo[Src0Idx].RegClass == AMDGPU::VReg_64RegClassID ||
4339             Desc.OpInfo[Src0Idx].RegClass ==
4340                 AMDGPU::VReg_64_Align2RegClassID)))) &&
4341         !AMDGPU::isLegal64BitDPPControl(DC)) {
4342       ErrInfo = "Invalid dpp_ctrl value: "
4343                 "64 bit dpp only support row_newbcast";
4344       return false;
4345     }
4346   }
4347 
4348   if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
4349     const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
4350     uint16_t DataNameIdx = isDS(Opcode) ? AMDGPU::OpName::data0
4351                                         : AMDGPU::OpName::vdata;
4352     const MachineOperand *Data = getNamedOperand(MI, DataNameIdx);
4353     const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1);
4354     if (Data && !Data->isReg())
4355       Data = nullptr;
4356 
4357     if (ST.hasGFX90AInsts()) {
4358       if (Dst && Data &&
4359           (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) {
4360         ErrInfo = "Invalid register class: "
4361                   "vdata and vdst should be both VGPR or AGPR";
4362         return false;
4363       }
4364       if (Data && Data2 &&
4365           (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) {
4366         ErrInfo = "Invalid register class: "
4367                   "both data operands should be VGPR or AGPR";
4368         return false;
4369       }
4370     } else {
4371       if ((Dst && RI.isAGPR(MRI, Dst->getReg())) ||
4372           (Data && RI.isAGPR(MRI, Data->getReg())) ||
4373           (Data2 && RI.isAGPR(MRI, Data2->getReg()))) {
4374         ErrInfo = "Invalid register class: "
4375                   "agpr loads and stores not supported on this GPU";
4376         return false;
4377       }
4378     }
4379   }
4380 
4381   if (ST.needsAlignedVGPRs() &&
4382       (MI.getOpcode() == AMDGPU::DS_GWS_INIT ||
4383        MI.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
4384        MI.getOpcode() == AMDGPU::DS_GWS_BARRIER)) {
4385     const MachineOperand *Op = getNamedOperand(MI, AMDGPU::OpName::data0);
4386     Register Reg = Op->getReg();
4387     bool Aligned = true;
4388     if (Reg.isPhysical()) {
4389       Aligned = !(RI.getHWRegIndex(Reg) & 1);
4390     } else {
4391       const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
4392       Aligned = RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
4393                 !(RI.getChannelFromSubReg(Op->getSubReg()) & 1);
4394     }
4395 
4396     if (!Aligned) {
4397       ErrInfo = "Subtarget requires even aligned vector registers "
4398                 "for DS_GWS instructions";
4399       return false;
4400     }
4401   }
4402 
4403   return true;
4404 }
4405 
// Map a scalar (SALU) opcode to the VALU opcode that should be used when the
// instruction has to be moved to the vector unit. Pseudo opcodes (COPY, PHI,
// REG_SEQUENCE, ...) map to themselves; opcodes with no direct vector
// replacement return AMDGPU::INSTRUCTION_LIST_END.
unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  default: return AMDGPU::INSTRUCTION_LIST_END;
  case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
  case AMDGPU::COPY: return AMDGPU::COPY;
  case AMDGPU::PHI: return AMDGPU::PHI;
  case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
  case AMDGPU::WQM: return AMDGPU::WQM;
  case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
  case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
  case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
  case AMDGPU::S_MOV_B32: {
    // A register source (or an AGPR destination) becomes a plain COPY;
    // only an immediate move needs the VALU mov encoding.
    const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
    return MI.getOperand(1).isReg() ||
           RI.isAGPR(MRI, MI.getOperand(0).getReg()) ?
           AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
  }
  case AMDGPU::S_ADD_I32:
    return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
  case AMDGPU::S_ADDC_U32:
    return AMDGPU::V_ADDC_U32_e32;
  case AMDGPU::S_SUB_I32:
    return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
    // FIXME: These are not consistently handled, and selected when the carry is
    // used.
  case AMDGPU::S_ADD_U32:
    return AMDGPU::V_ADD_CO_U32_e32;
  case AMDGPU::S_SUB_U32:
    return AMDGPU::V_SUB_CO_U32_e32;
  case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
  case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64;
  case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64;
  case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64;
  case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
  case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
  case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
  case AMDGPU::S_XNOR_B32:
    // V_XNOR only exists on subtargets with DL instructions.
    return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
  case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
  case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
  case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
  case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
  case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
  case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64;
  case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
  case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64;
  case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
  case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64;
  case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64;
  case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64;
  case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64;
  case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64;
  case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
  case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
  case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
  // NOTE(review): the 64-bit scalar NOT intentionally maps to the 32-bit VALU
  // opcode — presumably the caller splits the operation into two 32-bit
  // halves; confirm against the scalar-to-vector lowering code.
  case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
  // Scalar compares map to the VCC-writing e32 compare forms; SCC-based
  // branches below correspondingly become VCC-based branches.
  case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32;
  case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32;
  case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32;
  case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32;
  case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32;
  case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32;
  case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32;
  case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32;
  case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32;
  case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32;
  case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32;
  case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32;
  case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e32;
  case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e32;
  case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
  case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
  case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
  case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
  case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
  case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
  }
  llvm_unreachable(
      "Unexpected scalar opcode without corresponding vector one!");
}
4486 
4487 static unsigned adjustAllocatableRegClass(const GCNSubtarget &ST,
4488                                           const MachineRegisterInfo &MRI,
4489                                           const MCInstrDesc &TID,
4490                                           unsigned RCID,
4491                                           bool IsAllocatable) {
4492   if ((IsAllocatable || !ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
4493       (TID.mayLoad() || TID.mayStore() ||
4494       (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::MIMG)))) {
4495     switch (RCID) {
4496     case AMDGPU::AV_32RegClassID: return AMDGPU::VGPR_32RegClassID;
4497     case AMDGPU::AV_64RegClassID: return AMDGPU::VReg_64RegClassID;
4498     case AMDGPU::AV_96RegClassID: return AMDGPU::VReg_96RegClassID;
4499     case AMDGPU::AV_128RegClassID: return AMDGPU::VReg_128RegClassID;
4500     case AMDGPU::AV_160RegClassID: return AMDGPU::VReg_160RegClassID;
4501     default:
4502       break;
4503     }
4504   }
4505   return RCID;
4506 }
4507 
// Return the register class required by operand OpNum of the instruction
// described by TID, or nullptr for operands beyond the static operand list.
// AV_* (VGPR-or-AGPR) classes may be narrowed to VGPR-only classes via
// adjustAllocatableRegClass.
const TargetRegisterClass *SIInstrInfo::getRegClass(const MCInstrDesc &TID,
    unsigned OpNum, const TargetRegisterInfo *TRI,
    const MachineFunction &MF)
  const {
  if (OpNum >= TID.getNumOperands())
    return nullptr;
  auto RegClass = TID.OpInfo[OpNum].RegClass;
  bool IsAllocatable = false;
  if (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::FLAT)) {
    // vdst and vdata should be both VGPR or AGPR, same for the DS instructions
    // with two data operands. Request a register class constrained to VGPR
    // only if both operands are present, as Machine Copy Propagation cannot
    // check this constraint, and possibly other passes too.
    //
    // The check is limited to FLAT and DS because atomics in non-flat encoding
    // have their vdst and vdata tied to be the same register.
    const int VDstIdx = AMDGPU::getNamedOperandIdx(TID.Opcode,
                                                   AMDGPU::OpName::vdst);
    const int DataIdx = AMDGPU::getNamedOperandIdx(TID.Opcode,
        (TID.TSFlags & SIInstrFlags::DS) ? AMDGPU::OpName::data0
                                         : AMDGPU::OpName::vdata);
    if (DataIdx != -1) {
      // Keep the class allocatable when the instruction also has a vdst (or a
      // second DS data operand) that must stay paired with the data operand.
      IsAllocatable = VDstIdx != -1 ||
                      AMDGPU::getNamedOperandIdx(TID.Opcode,
                                                 AMDGPU::OpName::data1) != -1;
    }
  }
  RegClass = adjustAllocatableRegClass(ST, MF.getRegInfo(), TID, RegClass,
                                       IsAllocatable);
  return RI.getRegClass(RegClass);
}
4539 
4540 const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
4541                                                       unsigned OpNo) const {
4542   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
4543   const MCInstrDesc &Desc = get(MI.getOpcode());
4544   if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
4545       Desc.OpInfo[OpNo].RegClass == -1) {
4546     Register Reg = MI.getOperand(OpNo).getReg();
4547 
4548     if (Reg.isVirtual())
4549       return MRI.getRegClass(Reg);
4550     return RI.getPhysRegClass(Reg);
4551   }
4552 
4553   unsigned RCID = Desc.OpInfo[OpNo].RegClass;
4554   RCID = adjustAllocatableRegClass(ST, MRI, Desc, RCID, true);
4555   return RI.getRegClass(RCID);
4556 }
4557 
4558 void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
4559   MachineBasicBlock::iterator I = MI;
4560   MachineBasicBlock *MBB = MI.getParent();
4561   MachineOperand &MO = MI.getOperand(OpIdx);
4562   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
4563   unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass;
4564   const TargetRegisterClass *RC = RI.getRegClass(RCID);
4565   unsigned Size = RI.getRegSizeInBits(*RC);
4566   unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO : AMDGPU::V_MOV_B32_e32;
4567   if (MO.isReg())
4568     Opcode = AMDGPU::COPY;
4569   else if (RI.isSGPRClass(RC))
4570     Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
4571 
4572   const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
4573   const TargetRegisterClass *VRC64 = RI.getVGPR64Class();
4574   if (RI.getCommonSubClass(VRC64, VRC))
4575     VRC = VRC64;
4576   else
4577     VRC = &AMDGPU::VGPR_32RegClass;
4578 
4579   Register Reg = MRI.createVirtualRegister(VRC);
4580   DebugLoc DL = MBB->findDebugLoc(I);
4581   BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
4582   MO.ChangeToRegister(Reg, false);
4583 }
4584 
4585 unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI,
4586                                          MachineRegisterInfo &MRI,
4587                                          MachineOperand &SuperReg,
4588                                          const TargetRegisterClass *SuperRC,
4589                                          unsigned SubIdx,
4590                                          const TargetRegisterClass *SubRC)
4591                                          const {
4592   MachineBasicBlock *MBB = MI->getParent();
4593   DebugLoc DL = MI->getDebugLoc();
4594   Register SubReg = MRI.createVirtualRegister(SubRC);
4595 
4596   if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) {
4597     BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
4598       .addReg(SuperReg.getReg(), 0, SubIdx);
4599     return SubReg;
4600   }
4601 
4602   // Just in case the super register is itself a sub-register, copy it to a new
4603   // value so we don't need to worry about merging its subreg index with the
4604   // SubIdx passed to this function. The register coalescer should be able to
4605   // eliminate this extra copy.
4606   Register NewSuperReg = MRI.createVirtualRegister(SuperRC);
4607 
4608   BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg)
4609     .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg());
4610 
4611   BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
4612     .addReg(NewSuperReg, 0, SubIdx);
4613 
4614   return SubReg;
4615 }
4616 
4617 MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
4618   MachineBasicBlock::iterator MII,
4619   MachineRegisterInfo &MRI,
4620   MachineOperand &Op,
4621   const TargetRegisterClass *SuperRC,
4622   unsigned SubIdx,
4623   const TargetRegisterClass *SubRC) const {
4624   if (Op.isImm()) {
4625     if (SubIdx == AMDGPU::sub0)
4626       return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
4627     if (SubIdx == AMDGPU::sub1)
4628       return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
4629 
4630     llvm_unreachable("Unhandled register index for immediate");
4631   }
4632 
4633   unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
4634                                        SubIdx, SubRC);
4635   return MachineOperand::CreateReg(SubReg, false);
4636 }
4637 
4638 // Change the order of operands from (0, 1, 2) to (0, 2, 1)
4639 void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
4640   assert(Inst.getNumExplicitOperands() == 3);
4641   MachineOperand Op1 = Inst.getOperand(1);
4642   Inst.RemoveOperand(1);
4643   Inst.addOperand(Op1);
4644 }
4645 
4646 bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
4647                                     const MCOperandInfo &OpInfo,
4648                                     const MachineOperand &MO) const {
4649   if (!MO.isReg())
4650     return false;
4651 
4652   Register Reg = MO.getReg();
4653 
4654   const TargetRegisterClass *DRC = RI.getRegClass(OpInfo.RegClass);
4655   if (Reg.isPhysical())
4656     return DRC->contains(Reg);
4657 
4658   const TargetRegisterClass *RC = MRI.getRegClass(Reg);
4659 
4660   if (MO.getSubReg()) {
4661     const MachineFunction *MF = MO.getParent()->getParent()->getParent();
4662     const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
4663     if (!SuperRC)
4664       return false;
4665 
4666     DRC = RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg());
4667     if (!DRC)
4668       return false;
4669   }
4670   return RC->hasSuperClassEq(DRC);
4671 }
4672 
4673 bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
4674                                      const MCOperandInfo &OpInfo,
4675                                      const MachineOperand &MO) const {
4676   if (MO.isReg())
4677     return isLegalRegOperand(MRI, OpInfo, MO);
4678 
4679   // Handle non-register types that are treated like immediates.
4680   assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
4681   return true;
4682 }
4683 
// Check whether operand MO would be legal in slot OpIdx of MI. When MO is
// null, MI's current operand at OpIdx is checked. This enforces the
// subtarget's constant-bus limit, VOP3 literal rules, register-class
// legality, and the AGPR/VGPR pairing constraints on memory/DS instructions.
bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
                                 const MachineOperand *MO) const {
  const MachineFunction &MF = *MI.getParent()->getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  const MCInstrDesc &InstDesc = MI.getDesc();
  const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx];
  const TargetRegisterClass *DefinedRC =
      OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
  if (!MO)
    MO = &MI.getOperand(OpIdx);

  // Budgets consumed while scanning the other operands below.
  int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
  int VOP3LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
  if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) {
    // If MO itself is a VOP3 literal, it consumes the (at most one) literal
    // slot up front.
    if (isVOP3(MI) && isLiteralConstantLike(*MO, OpInfo) && !VOP3LiteralLimit--)
      return false;

    // Track SGPRs already counted so a register read twice is charged once.
    SmallDenseSet<RegSubRegPair> SGPRsUsed;
    if (MO->isReg())
      SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));

    // Charge every other operand's constant-bus/literal use against the
    // remaining budget; MO is only legal if the budget is not exhausted.
    for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
      if (i == OpIdx)
        continue;
      const MachineOperand &Op = MI.getOperand(i);
      if (Op.isReg()) {
        RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
        if (!SGPRsUsed.count(SGPR) &&
            usesConstantBus(MRI, Op, InstDesc.OpInfo[i])) {
          if (--ConstantBusLimit <= 0)
            return false;
          SGPRsUsed.insert(SGPR);
        }
      } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) {
        // A KIMM32 operand always occupies a constant-bus slot.
        if (--ConstantBusLimit <= 0)
          return false;
      } else if (isVOP3(MI) && AMDGPU::isSISrcOperand(InstDesc, i) &&
                 isLiteralConstantLike(Op, InstDesc.OpInfo[i])) {
        // A VOP3 literal consumes both the literal slot and a bus slot.
        if (!VOP3LiteralLimit--)
          return false;
        if (--ConstantBusLimit <= 0)
          return false;
      }
    }
  }

  if (MO->isReg()) {
    assert(DefinedRC);
    if (!isLegalRegOperand(MRI, OpInfo, *MO))
      return false;
    bool IsAGPR = RI.isAGPR(MRI, MO->getReg());
    if (IsAGPR && !ST.hasMAIInsts())
      return false;
    unsigned Opc = MI.getOpcode();
    // AGPRs on memory/DS instructions are only allowed once GFX90A reserved
    // registers are frozen.
    if (IsAGPR &&
        (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
        (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
      return false;
    // Atomics should have both vdst and vdata either vgpr or agpr.
    const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
    const int DataIdx = AMDGPU::getNamedOperandIdx(Opc,
        isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
    if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
        MI.getOperand(DataIdx).isReg() &&
        RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)
      return false;
    if ((int)OpIdx == DataIdx) {
      if (VDstIdx != -1 &&
          RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)
        return false;
      // DS instructions with 2 src operands also must have tied RC.
      const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc,
                                                      AMDGPU::OpName::data1);
      if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
          RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
        return false;
    }
    // v_accvgpr_write takes its source from a VGPR, never an SGPR.
    if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
        (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
        RI.isSGPRReg(MRI, MO->getReg()))
      return false;
    return true;
  }

  // Handle non-register types that are treated like immediates.
  assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());

  if (!DefinedRC) {
    // This operand expects an immediate.
    return true;
  }

  return isImmOperandLegal(MI, OpIdx, *MO);
}
4778 
4779 void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
4780                                        MachineInstr &MI) const {
4781   unsigned Opc = MI.getOpcode();
4782   const MCInstrDesc &InstrDesc = get(Opc);
4783 
4784   int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
4785   MachineOperand &Src0 = MI.getOperand(Src0Idx);
4786 
4787   int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
4788   MachineOperand &Src1 = MI.getOperand(Src1Idx);
4789 
4790   // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
4791   // we need to only have one constant bus use before GFX10.
4792   bool HasImplicitSGPR = findImplicitSGPRRead(MI) != AMDGPU::NoRegister;
4793   if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 &&
4794       Src0.isReg() && (RI.isSGPRReg(MRI, Src0.getReg()) ||
4795        isLiteralConstantLike(Src0, InstrDesc.OpInfo[Src0Idx])))
4796     legalizeOpWithMove(MI, Src0Idx);
4797 
4798   // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
4799   // both the value to write (src0) and lane select (src1).  Fix up non-SGPR
4800   // src0/src1 with V_READFIRSTLANE.
4801   if (Opc == AMDGPU::V_WRITELANE_B32) {
4802     const DebugLoc &DL = MI.getDebugLoc();
4803     if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
4804       Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4805       BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
4806           .add(Src0);
4807       Src0.ChangeToRegister(Reg, false);
4808     }
4809     if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
4810       Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4811       const DebugLoc &DL = MI.getDebugLoc();
4812       BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
4813           .add(Src1);
4814       Src1.ChangeToRegister(Reg, false);
4815     }
4816     return;
4817   }
4818 
4819   // No VOP2 instructions support AGPRs.
4820   if (Src0.isReg() && RI.isAGPR(MRI, Src0.getReg()))
4821     legalizeOpWithMove(MI, Src0Idx);
4822 
4823   if (Src1.isReg() && RI.isAGPR(MRI, Src1.getReg()))
4824     legalizeOpWithMove(MI, Src1Idx);
4825 
4826   // VOP2 src0 instructions support all operand types, so we don't need to check
4827   // their legality. If src1 is already legal, we don't need to do anything.
4828   if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1))
4829     return;
4830 
4831   // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
4832   // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
4833   // select is uniform.
4834   if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
4835       RI.isVGPR(MRI, Src1.getReg())) {
4836     Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4837     const DebugLoc &DL = MI.getDebugLoc();
4838     BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
4839         .add(Src1);
4840     Src1.ChangeToRegister(Reg, false);
4841     return;
4842   }
4843 
4844   // We do not use commuteInstruction here because it is too aggressive and will
4845   // commute if it is possible. We only want to commute here if it improves
4846   // legality. This can be called a fairly large number of times so don't waste
4847   // compile time pointlessly swapping and checking legality again.
4848   if (HasImplicitSGPR || !MI.isCommutable()) {
4849     legalizeOpWithMove(MI, Src1Idx);
4850     return;
4851   }
4852 
4853   // If src0 can be used as src1, commuting will make the operands legal.
4854   // Otherwise we have to give up and insert a move.
4855   //
4856   // TODO: Other immediate-like operand kinds could be commuted if there was a
4857   // MachineOperand::ChangeTo* for them.
4858   if ((!Src1.isImm() && !Src1.isReg()) ||
4859       !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) {
4860     legalizeOpWithMove(MI, Src1Idx);
4861     return;
4862   }
4863 
4864   int CommutedOpc = commuteOpcode(MI);
4865   if (CommutedOpc == -1) {
4866     legalizeOpWithMove(MI, Src1Idx);
4867     return;
4868   }
4869 
4870   MI.setDesc(get(CommutedOpc));
4871 
4872   Register Src0Reg = Src0.getReg();
4873   unsigned Src0SubReg = Src0.getSubReg();
4874   bool Src0Kill = Src0.isKill();
4875 
4876   if (Src1.isImm())
4877     Src0.ChangeToImmediate(Src1.getImm());
4878   else if (Src1.isReg()) {
4879     Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
4880     Src0.setSubReg(Src1.getSubReg());
4881   } else
4882     llvm_unreachable("Should only have register or immediate operands");
4883 
4884   Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
4885   Src1.setSubReg(Src0SubReg);
4886   fixImplicitOperands(MI);
4887 }
4888 
// Legalize VOP3 operands. All operand types are supported for any operand
// but only one literal constant and only starting from GFX10.
void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
                                       MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();

  int VOP3Idx[3] = {
    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
    AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
  };

  if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||
      Opc == AMDGPU::V_PERMLANEX16_B32_e64) {
    // src1 and src2 must be scalar: insert a V_READFIRSTLANE for any that is
    // not already in an SGPR class.
    MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
    MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
    const DebugLoc &DL = MI.getDebugLoc();
    if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
      Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
        .add(Src1);
      Src1.ChangeToRegister(Reg, false);
    }
    if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
      Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
      BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
        .add(Src2);
      Src2.ChangeToRegister(Reg, false);
    }
  }

  // Find the one SGPR operand we are allowed to use.
  int ConstantBusLimit = ST.getConstantBusLimit(Opc);
  int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
  SmallDenseSet<unsigned> SGPRsUsed;
  // If an SGPR is already used by the instruction, keep it and charge it
  // against the constant bus budget up front.
  Register SGPRReg = findUsedSGPR(MI, VOP3Idx);
  if (SGPRReg != AMDGPU::NoRegister) {
    SGPRsUsed.insert(SGPRReg);
    --ConstantBusLimit;
  }

  for (unsigned i = 0; i < 3; ++i) {
    int Idx = VOP3Idx[i];
    if (Idx == -1)
      // Sources are dense; no srcN means no further sources either.
      break;
    MachineOperand &MO = MI.getOperand(Idx);

    if (!MO.isReg()) {
      // Non-register operand. Only literal-like operands are charged against
      // the literal/constant-bus budgets; anything else is skipped.
      if (!isLiteralConstantLike(MO, get(Opc).OpInfo[Idx]))
        continue;

      if (LiteralLimit > 0 && ConstantBusLimit > 0) {
        --LiteralLimit;
        --ConstantBusLimit;
        continue;
      }

      // Budget exhausted: materialize the literal in a register instead.
      --LiteralLimit;
      --ConstantBusLimit;
      legalizeOpWithMove(MI, Idx);
      continue;
    }

    // An AGPR source that is not legal for this operand must be moved.
    if (RI.hasAGPRs(MRI.getRegClass(MO.getReg())) &&
        !isOperandLegal(MI, Idx, &MO)) {
      legalizeOpWithMove(MI, Idx);
      continue;
    }

    if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
      continue; // VGPRs are legal

    // We can use one SGPR in each VOP3 instruction prior to GFX10
    // and two starting from GFX10.
    if (SGPRsUsed.count(MO.getReg()))
      continue;
    if (ConstantBusLimit > 0) {
      SGPRsUsed.insert(MO.getReg());
      --ConstantBusLimit;
      continue;
    }

    // If we make it this far, then the operand is not legal and we must
    // legalize it.
    legalizeOpWithMove(MI, Idx);
  }
}
4977 
4978 Register SIInstrInfo::readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI,
4979                                          MachineRegisterInfo &MRI) const {
4980   const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
4981   const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
4982   Register DstReg = MRI.createVirtualRegister(SRC);
4983   unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
4984 
4985   if (RI.hasAGPRs(VRC)) {
4986     VRC = RI.getEquivalentVGPRClass(VRC);
4987     Register NewSrcReg = MRI.createVirtualRegister(VRC);
4988     BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
4989             get(TargetOpcode::COPY), NewSrcReg)
4990         .addReg(SrcReg);
4991     SrcReg = NewSrcReg;
4992   }
4993 
4994   if (SubRegs == 1) {
4995     BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
4996             get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
4997         .addReg(SrcReg);
4998     return DstReg;
4999   }
5000 
5001   SmallVector<unsigned, 8> SRegs;
5002   for (unsigned i = 0; i < SubRegs; ++i) {
5003     Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
5004     BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
5005             get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
5006         .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
5007     SRegs.push_back(SGPR);
5008   }
5009 
5010   MachineInstrBuilder MIB =
5011       BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
5012               get(AMDGPU::REG_SEQUENCE), DstReg);
5013   for (unsigned i = 0; i < SubRegs; ++i) {
5014     MIB.addReg(SRegs[i]);
5015     MIB.addImm(RI.getSubRegFromChannel(i));
5016   }
5017   return DstReg;
5018 }
5019 
5020 void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
5021                                        MachineInstr &MI) const {
5022 
5023   // If the pointer is store in VGPRs, then we need to move them to
5024   // SGPRs using v_readfirstlane.  This is safe because we only select
5025   // loads with uniform pointers to SMRD instruction so we know the
5026   // pointer value is uniform.
5027   MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
5028   if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
5029     Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
5030     SBase->setReg(SGPR);
5031   }
5032   MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soff);
5033   if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) {
5034     Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
5035     SOff->setReg(SGPR);
5036   }
5037 }
5038 
// Try to rewrite a segment-specific FLAT instruction whose saddr operand is
// not scalar into the vaddr-addressed variant of the same opcode, moving the
// pointer from saddr to vaddr in place. Returns true on success; returns
// false (leaving \p Inst unmodified) when no vaddr variant exists or the
// existing vaddr is not a known zero.
bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const {
  unsigned Opc = Inst.getOpcode();
  int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
  if (OldSAddrIdx < 0)
    return false;

  assert(isSegmentSpecificFLAT(Inst));

  // Look up the vaddr-based variant: global form first, then the scratch
  // SS -> SV mapping.
  int NewOpc = AMDGPU::getGlobalVaddrOp(Opc);
  if (NewOpc < 0)
    NewOpc = AMDGPU::getFlatScratchInstSVfromSS(Opc);
  if (NewOpc < 0)
    return false;

  MachineRegisterInfo &MRI = Inst.getMF()->getRegInfo();
  MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx);
  // An SGPR saddr is already legal; nothing to rewrite.
  if (RI.isSGPRReg(MRI, SAddr.getReg()))
    return false;

  int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr);
  if (NewVAddrIdx < 0)
    return false;

  int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);

  // Check vaddr, it shall be zero or absent.
  MachineInstr *VAddrDef = nullptr;
  if (OldVAddrIdx >= 0) {
    MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx);
    VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg());
    if (!VAddrDef || VAddrDef->getOpcode() != AMDGPU::V_MOV_B32_e32 ||
        !VAddrDef->getOperand(1).isImm() ||
        VAddrDef->getOperand(1).getImm() != 0)
      return false;
  }

  const MCInstrDesc &NewDesc = get(NewOpc);
  Inst.setDesc(NewDesc);

  // Callers expect the iterator to be valid after this call, so modify the
  // instruction in place.
  if (OldVAddrIdx == NewVAddrIdx) {
    MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx);
    // Clear use list from the old vaddr holding a zero register.
    MRI.removeRegOperandFromUseList(&NewVAddr);
    MRI.moveOperands(&NewVAddr, &SAddr, 1);
    Inst.RemoveOperand(OldSAddrIdx);
    // Update the use list with the pointer we have just moved from vaddr to
    // saddr position. Otherwise new vaddr will be missing from the use list.
    MRI.removeRegOperandFromUseList(&NewVAddr);
    MRI.addRegOperandToUseList(&NewVAddr);
  } else {
    assert(OldSAddrIdx == NewVAddrIdx);

    if (OldVAddrIdx >= 0) {
      int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc,
                                                 AMDGPU::OpName::vdst_in);

      // RemoveOperand doesn't try to fixup tied operand indexes as it goes, so
      // it asserts. Untie the operands for now and retie them afterwards.
      if (NewVDstIn != -1) {
        int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
        Inst.untieRegOperand(OldVDstIn);
      }

      Inst.RemoveOperand(OldVAddrIdx);

      if (NewVDstIn != -1) {
        int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
        Inst.tieOperands(NewVDst, NewVDstIn);
      }
    }
  }

  // The V_MOV_B32 that fed the old zero vaddr may now be dead; clean it up.
  if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg()))
    VAddrDef->eraseFromParent();

  return true;
}
5118 
5119 // FIXME: Remove this when SelectionDAG is obsoleted.
5120 void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI,
5121                                        MachineInstr &MI) const {
5122   if (!isSegmentSpecificFLAT(MI))
5123     return;
5124 
5125   // Fixup SGPR operands in VGPRs. We only select these when the DAG divergence
5126   // thinks they are uniform, so a readfirstlane should be valid.
5127   MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr);
5128   if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg())))
5129     return;
5130 
5131   if (moveFlatAddrToVGPR(MI))
5132     return;
5133 
5134   Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI);
5135   SAddr->setReg(ToSGPR);
5136 }
5137 
5138 void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
5139                                          MachineBasicBlock::iterator I,
5140                                          const TargetRegisterClass *DstRC,
5141                                          MachineOperand &Op,
5142                                          MachineRegisterInfo &MRI,
5143                                          const DebugLoc &DL) const {
5144   Register OpReg = Op.getReg();
5145   unsigned OpSubReg = Op.getSubReg();
5146 
5147   const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
5148       RI.getRegClassForReg(MRI, OpReg), OpSubReg);
5149 
5150   // Check if operand is already the correct register class.
5151   if (DstRC == OpRC)
5152     return;
5153 
5154   Register DstReg = MRI.createVirtualRegister(DstRC);
5155   MachineInstr *Copy =
5156       BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op);
5157 
5158   Op.setReg(DstReg);
5159   Op.setSubReg(0);
5160 
5161   MachineInstr *Def = MRI.getVRegDef(OpReg);
5162   if (!Def)
5163     return;
5164 
5165   // Try to eliminate the copy if it is copying an immediate value.
5166   if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
5167     FoldImmediate(*Copy, *Def, OpReg, &MRI);
5168 
5169   bool ImpDef = Def->isImplicitDef();
5170   while (!ImpDef && Def && Def->isCopy()) {
5171     if (Def->getOperand(1).getReg().isPhysical())
5172       break;
5173     Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
5174     ImpDef = Def && Def->isImplicitDef();
5175   }
5176   if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) &&
5177       !ImpDef)
5178     Copy->addOperand(MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
5179 }
5180 
// Emit the actual waterfall loop, executing the wrapped instruction for each
// unique value of \p Rsrc across all lanes. In the best case we execute 1
// iteration, in the worst case we execute 64 (once per lane).
static void
emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI,
                          MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
                          const DebugLoc &DL, MachineOperand &Rsrc) {
  MachineFunction &MF = *OrigBB.getParent();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  // Select the wave32 or wave64 flavor of the exec-mask opcodes.
  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  unsigned SaveExecOpc =
      ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
  unsigned XorTermOpc =
      ST.isWave32() ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
  unsigned AndOpc =
      ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
  const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);

  MachineBasicBlock::iterator I = LoopBB.begin();

  // SGPRs holding the readfirstlane'd dwords, and the accumulated
  // "this lane matches" condition.
  SmallVector<Register, 8> ReadlanePieces;
  Register CondReg = AMDGPU::NoRegister;

  Register VRsrc = Rsrc.getReg();
  unsigned VRsrcUndef = getUndefRegState(Rsrc.isUndef());

  unsigned RegSize = TRI->getRegSizeInBits(Rsrc.getReg(), MRI);
  unsigned NumSubRegs =  RegSize / 32;
  assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 && "Unhandled register size");

  // Process the descriptor two dwords at a time so each compare is 64-bit.
  for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {

    Register CurRegLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    Register CurRegHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);

    // Read the next variant <- also loop target.
    BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
            .addReg(VRsrc, VRsrcUndef, TRI->getSubRegFromChannel(Idx));

    // Read the high dword of this pair.
    BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi)
            .addReg(VRsrc, VRsrcUndef, TRI->getSubRegFromChannel(Idx + 1));

    ReadlanePieces.push_back(CurRegLo);
    ReadlanePieces.push_back(CurRegHi);

    // Comparison is to be done as 64-bit.
    Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
    BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg)
            .addReg(CurRegLo)
            .addImm(AMDGPU::sub0)
            .addReg(CurRegHi)
            .addImm(AMDGPU::sub1);

    // Compare the just-read SGPR pair against the per-lane VGPR values.
    Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
    auto Cmp =
        BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), NewCondReg)
            .addReg(CurReg);
    if (NumSubRegs <= 2)
      Cmp.addReg(VRsrc);
    else
      Cmp.addReg(VRsrc, VRsrcUndef, TRI->getSubRegFromChannel(Idx, 2));

    // Combine the comparison results with AND.
    if (CondReg == AMDGPU::NoRegister) // First.
      CondReg = NewCondReg;
    else { // If not the first, we create an AND.
      Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
      BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg)
              .addReg(CondReg)
              .addReg(NewCondReg);
      CondReg = AndReg;
    }
  } // End for loop.

  auto SRsrcRC = TRI->getEquivalentSGPRClass(MRI.getRegClass(VRsrc));
  Register SRsrc = MRI.createVirtualRegister(SRsrcRC);

  // Build scalar Rsrc.
  auto Merge = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SRsrc);
  unsigned Channel = 0;
  for (Register Piece : ReadlanePieces) {
    Merge.addReg(Piece)
         .addImm(TRI->getSubRegFromChannel(Channel++));
  }

  // Update Rsrc operand to use the SGPR Rsrc.
  Rsrc.setReg(SRsrc);
  Rsrc.setIsKill(true);

  Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
  MRI.setSimpleHint(SaveExec, CondReg);

  // Update EXEC to matching lanes, saving original to SaveExec.
  BuildMI(LoopBB, I, DL, TII.get(SaveExecOpc), SaveExec)
      .addReg(CondReg, RegState::Kill);

  // The original instruction is here; we insert the terminators after it.
  I = LoopBB.end();

  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
  BuildMI(LoopBB, I, DL, TII.get(XorTermOpc), Exec)
      .addReg(Exec)
      .addReg(SaveExec);

  BuildMI(LoopBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB);
}
5289 
// Build a waterfall loop around \p MI, replacing the VGPR \p Rsrc register
// with SGPRs by iterating over all unique values across all lanes.
// Returns the loop basic block that now contains \p MI.
// \p Begin and \p End optionally delimit the range of instructions moved
// into the loop; both default to just \p MI itself.
static MachineBasicBlock *
loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
                  MachineOperand &Rsrc, MachineDominatorTree *MDT,
                  MachineBasicBlock::iterator Begin = nullptr,
                  MachineBasicBlock::iterator End = nullptr) {
  MachineBasicBlock &MBB = *MI.getParent();
  MachineFunction &MF = *MBB.getParent();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  if (!Begin.isValid())
    Begin = &MI;
  if (!End.isValid()) {
    End = &MI;
    ++End;
  }
  const DebugLoc &DL = MI.getDebugLoc();
  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
  const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);

  Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);

  // Save the EXEC mask
  BuildMI(MBB, Begin, DL, TII.get(MovExecOpc), SaveExec).addReg(Exec);

  // Killed uses in the instruction we are waterfalling around will be
  // incorrect due to the added control-flow.
  MachineBasicBlock::iterator AfterMI = MI;
  ++AfterMI;
  for (auto I = Begin; I != AfterMI; I++) {
    for (auto &MO : I->uses()) {
      if (MO.isReg() && MO.isUse()) {
        MRI.clearKillFlags(MO.getReg());
      }
    }
  }

  // To insert the loop we need to split the block. Move everything after this
  // point to a new block, and insert a new empty block between the two.
  MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;

  MF.insert(MBBI, LoopBB);
  MF.insert(MBBI, RemainderBB);

  // LoopBB loops back to itself until all lanes are done, then falls through.
  LoopBB->addSuccessor(LoopBB);
  LoopBB->addSuccessor(RemainderBB);

  // Move Begin to MI to the LoopBB, and the remainder of the block to
  // RemainderBB.
  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
  RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end());
  LoopBB->splice(LoopBB->begin(), &MBB, Begin, MBB.end());

  MBB.addSuccessor(LoopBB);

  // Update dominators. We know that MBB immediately dominates LoopBB, that
  // LoopBB immediately dominates RemainderBB, and that RemainderBB immediately
  // dominates all of the successors transferred to it from MBB that MBB used
  // to properly dominate.
  if (MDT) {
    MDT->addNewBlock(LoopBB, &MBB);
    MDT->addNewBlock(RemainderBB, LoopBB);
    for (auto &Succ : RemainderBB->successors()) {
      if (MDT->properlyDominates(&MBB, Succ)) {
        MDT->changeImmediateDominator(Succ, RemainderBB);
      }
    }
  }

  // Emit the loop body that readfirstlanes Rsrc and masks EXEC.
  emitLoadSRsrcFromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, Rsrc);

  // Restore the EXEC mask
  MachineBasicBlock::iterator First = RemainderBB->begin();
  BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec);
  return LoopBB;
}
5373 
5374 // Extract pointer from Rsrc and return a zero-value Rsrc replacement.
5375 static std::tuple<unsigned, unsigned>
5376 extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) {
5377   MachineBasicBlock &MBB = *MI.getParent();
5378   MachineFunction &MF = *MBB.getParent();
5379   MachineRegisterInfo &MRI = MF.getRegInfo();
5380 
5381   // Extract the ptr from the resource descriptor.
5382   unsigned RsrcPtr =
5383       TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
5384                              AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
5385 
5386   // Create an empty resource descriptor
5387   Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5388   Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
5389   Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
5390   Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
5391   uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
5392 
5393   // Zero64 = 0
5394   BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
5395       .addImm(0);
5396 
5397   // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
5398   BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
5399       .addImm(RsrcDataFormat & 0xFFFFFFFF);
5400 
5401   // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
5402   BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
5403       .addImm(RsrcDataFormat >> 32);
5404 
5405   // NewSRsrc = {Zero64, SRsrcFormat}
5406   BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
5407       .addReg(Zero64)
5408       .addImm(AMDGPU::sub0_sub1)
5409       .addReg(SRsrcFormatLo)
5410       .addImm(AMDGPU::sub2)
5411       .addReg(SRsrcFormatHi)
5412       .addImm(AMDGPU::sub3);
5413 
5414   return std::make_tuple(RsrcPtr, NewSRsrc);
5415 }
5416 
5417 MachineBasicBlock *
5418 SIInstrInfo::legalizeOperands(MachineInstr &MI,
5419                               MachineDominatorTree *MDT) const {
5420   MachineFunction &MF = *MI.getParent()->getParent();
5421   MachineRegisterInfo &MRI = MF.getRegInfo();
5422   MachineBasicBlock *CreatedBB = nullptr;
5423 
5424   // Legalize VOP2
5425   if (isVOP2(MI) || isVOPC(MI)) {
5426     legalizeOperandsVOP2(MRI, MI);
5427     return CreatedBB;
5428   }
5429 
5430   // Legalize VOP3
5431   if (isVOP3(MI)) {
5432     legalizeOperandsVOP3(MRI, MI);
5433     return CreatedBB;
5434   }
5435 
5436   // Legalize SMRD
5437   if (isSMRD(MI)) {
5438     legalizeOperandsSMRD(MRI, MI);
5439     return CreatedBB;
5440   }
5441 
5442   // Legalize FLAT
5443   if (isFLAT(MI)) {
5444     legalizeOperandsFLAT(MRI, MI);
5445     return CreatedBB;
5446   }
5447 
5448   // Legalize REG_SEQUENCE and PHI
  // The register class of the operands must be the same type as the register
5450   // class of the output.
5451   if (MI.getOpcode() == AMDGPU::PHI) {
5452     const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
5453     for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
5454       if (!MI.getOperand(i).isReg() || !MI.getOperand(i).getReg().isVirtual())
5455         continue;
5456       const TargetRegisterClass *OpRC =
5457           MRI.getRegClass(MI.getOperand(i).getReg());
5458       if (RI.hasVectorRegisters(OpRC)) {
5459         VRC = OpRC;
5460       } else {
5461         SRC = OpRC;
5462       }
5463     }
5464 
    // If any of the operands are VGPR registers, then they all must be
    // VGPRs, otherwise we will create illegal VGPR->SGPR copies when
    // legalizing them.
5468     if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
5469       if (!VRC) {
5470         assert(SRC);
5471         if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) {
5472           VRC = &AMDGPU::VReg_1RegClass;
5473         } else
5474           VRC = RI.hasAGPRs(getOpRegClass(MI, 0))
5475                     ? RI.getEquivalentAGPRClass(SRC)
5476                     : RI.getEquivalentVGPRClass(SRC);
5477       } else {
5478           VRC = RI.hasAGPRs(getOpRegClass(MI, 0))
5479                     ? RI.getEquivalentAGPRClass(VRC)
5480                     : RI.getEquivalentVGPRClass(VRC);
5481       }
5482       RC = VRC;
5483     } else {
5484       RC = SRC;
5485     }
5486 
5487     // Update all the operands so they have the same type.
5488     for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
5489       MachineOperand &Op = MI.getOperand(I);
5490       if (!Op.isReg() || !Op.getReg().isVirtual())
5491         continue;
5492 
5493       // MI is a PHI instruction.
5494       MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
5495       MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
5496 
5497       // Avoid creating no-op copies with the same src and dst reg class.  These
5498       // confuse some of the machine passes.
5499       legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
5500     }
5501   }
5502 
5503   // REG_SEQUENCE doesn't really require operand legalization, but if one has a
5504   // VGPR dest type and SGPR sources, insert copies so all operands are
5505   // VGPRs. This seems to help operand folding / the register coalescer.
5506   if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
5507     MachineBasicBlock *MBB = MI.getParent();
5508     const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
5509     if (RI.hasVGPRs(DstRC)) {
5510       // Update all the operands so they are VGPR register classes. These may
5511       // not be the same register class because REG_SEQUENCE supports mixing
5512       // subregister index types e.g. sub0_sub1 + sub2 + sub3
5513       for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
5514         MachineOperand &Op = MI.getOperand(I);
5515         if (!Op.isReg() || !Op.getReg().isVirtual())
5516           continue;
5517 
5518         const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
5519         const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
5520         if (VRC == OpRC)
5521           continue;
5522 
5523         legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
5524         Op.setIsKill();
5525       }
5526     }
5527 
5528     return CreatedBB;
5529   }
5530 
5531   // Legalize INSERT_SUBREG
5532   // src0 must have the same register class as dst
5533   if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
5534     Register Dst = MI.getOperand(0).getReg();
5535     Register Src0 = MI.getOperand(1).getReg();
5536     const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
5537     const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
5538     if (DstRC != Src0RC) {
5539       MachineBasicBlock *MBB = MI.getParent();
5540       MachineOperand &Op = MI.getOperand(1);
5541       legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
5542     }
5543     return CreatedBB;
5544   }
5545 
5546   // Legalize SI_INIT_M0
5547   if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
5548     MachineOperand &Src = MI.getOperand(0);
5549     if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
5550       Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
5551     return CreatedBB;
5552   }
5553 
5554   // Legalize MIMG and MUBUF/MTBUF for shaders.
5555   //
5556   // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
5557   // scratch memory access. In both cases, the legalization never involves
5558   // conversion to the addr64 form.
5559   if (isMIMG(MI) || (AMDGPU::isGraphics(MF.getFunction().getCallingConv()) &&
5560                      (isMUBUF(MI) || isMTBUF(MI)))) {
5561     MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc);
5562     if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
5563       CreatedBB = loadSRsrcFromVGPR(*this, MI, *SRsrc, MDT);
5564 
5565     MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp);
5566     if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
5567       CreatedBB = loadSRsrcFromVGPR(*this, MI, *SSamp, MDT);
5568 
5569     return CreatedBB;
5570   }
5571 
5572   // Legalize SI_CALL
5573   if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
5574     MachineOperand *Dest = &MI.getOperand(0);
5575     if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) {
5576       // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN and
5577       // following copies, we also need to move copies from and to physical
5578       // registers into the loop block.
5579       unsigned FrameSetupOpcode = getCallFrameSetupOpcode();
5580       unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode();
5581 
5582       // Also move the copies to physical registers into the loop block
5583       MachineBasicBlock &MBB = *MI.getParent();
5584       MachineBasicBlock::iterator Start(&MI);
5585       while (Start->getOpcode() != FrameSetupOpcode)
5586         --Start;
5587       MachineBasicBlock::iterator End(&MI);
5588       while (End->getOpcode() != FrameDestroyOpcode)
5589         ++End;
5590       // Also include following copies of the return value
5591       ++End;
5592       while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() &&
5593              MI.definesRegister(End->getOperand(1).getReg()))
5594         ++End;
5595       CreatedBB = loadSRsrcFromVGPR(*this, MI, *Dest, MDT, Start, End);
5596     }
5597   }
5598 
5599   // Legalize MUBUF* instructions.
5600   int RsrcIdx =
5601       AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
5602   if (RsrcIdx != -1) {
5603     // We have an MUBUF instruction
5604     MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
5605     unsigned RsrcRC = get(MI.getOpcode()).OpInfo[RsrcIdx].RegClass;
5606     if (RI.getCommonSubClass(MRI.getRegClass(Rsrc->getReg()),
5607                              RI.getRegClass(RsrcRC))) {
5608       // The operands are legal.
      // FIXME: We may need to legalize operands besides srsrc.
5610       return CreatedBB;
5611     }
5612 
5613     // Legalize a VGPR Rsrc.
5614     //
5615     // If the instruction is _ADDR64, we can avoid a waterfall by extracting
5616     // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
5617     // a zero-value SRsrc.
5618     //
5619     // If the instruction is _OFFSET (both idxen and offen disabled), and we
5620     // support ADDR64 instructions, we can convert to ADDR64 and do the same as
5621     // above.
5622     //
5623     // Otherwise we are on non-ADDR64 hardware, and/or we have
5624     // idxen/offen/bothen and we fall back to a waterfall loop.
5625 
5626     MachineBasicBlock &MBB = *MI.getParent();
5627 
5628     MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
5629     if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
5630       // This is already an ADDR64 instruction so we need to add the pointer
5631       // extracted from the resource descriptor to the current value of VAddr.
5632       Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5633       Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5634       Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
5635 
5636       const auto *BoolXExecRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
5637       Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
5638       Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
5639 
5640       unsigned RsrcPtr, NewSRsrc;
5641       std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
5642 
5643       // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
5644       const DebugLoc &DL = MI.getDebugLoc();
5645       BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo)
5646         .addDef(CondReg0)
5647         .addReg(RsrcPtr, 0, AMDGPU::sub0)
5648         .addReg(VAddr->getReg(), 0, AMDGPU::sub0)
5649         .addImm(0);
5650 
5651       // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
5652       BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi)
5653         .addDef(CondReg1, RegState::Dead)
5654         .addReg(RsrcPtr, 0, AMDGPU::sub1)
5655         .addReg(VAddr->getReg(), 0, AMDGPU::sub1)
5656         .addReg(CondReg0, RegState::Kill)
5657         .addImm(0);
5658 
5659       // NewVaddr = {NewVaddrHi, NewVaddrLo}
5660       BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
5661           .addReg(NewVAddrLo)
5662           .addImm(AMDGPU::sub0)
5663           .addReg(NewVAddrHi)
5664           .addImm(AMDGPU::sub1);
5665 
5666       VAddr->setReg(NewVAddr);
5667       Rsrc->setReg(NewSRsrc);
5668     } else if (!VAddr && ST.hasAddr64()) {
      // This instruction is the _OFFSET variant, so we need to convert it to
      // ADDR64.
5671       assert(ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
5672              "FIXME: Need to emit flat atomics here");
5673 
5674       unsigned RsrcPtr, NewSRsrc;
5675       std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
5676 
5677       Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
5678       MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
5679       MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
5680       MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
5681       unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
5682 
      // Atomics with return have an additional tied operand and are
      // missing some of the special bits.
5685       MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
5686       MachineInstr *Addr64;
5687 
5688       if (!VDataIn) {
5689         // Regular buffer load / store.
5690         MachineInstrBuilder MIB =
5691             BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
5692                 .add(*VData)
5693                 .addReg(NewVAddr)
5694                 .addReg(NewSRsrc)
5695                 .add(*SOffset)
5696                 .add(*Offset);
5697 
5698         if (const MachineOperand *CPol =
5699                 getNamedOperand(MI, AMDGPU::OpName::cpol)) {
5700           MIB.addImm(CPol->getImm());
5701         }
5702 
5703         if (const MachineOperand *TFE =
5704                 getNamedOperand(MI, AMDGPU::OpName::tfe)) {
5705           MIB.addImm(TFE->getImm());
5706         }
5707 
5708         MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz));
5709 
5710         MIB.cloneMemRefs(MI);
5711         Addr64 = MIB;
5712       } else {
5713         // Atomics with return.
5714         Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
5715                      .add(*VData)
5716                      .add(*VDataIn)
5717                      .addReg(NewVAddr)
5718                      .addReg(NewSRsrc)
5719                      .add(*SOffset)
5720                      .add(*Offset)
5721                      .addImm(getNamedImmOperand(MI, AMDGPU::OpName::cpol))
5722                      .cloneMemRefs(MI);
5723       }
5724 
5725       MI.removeFromParent();
5726 
5727       // NewVaddr = {NewVaddrHi, NewVaddrLo}
5728       BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
5729               NewVAddr)
5730           .addReg(RsrcPtr, 0, AMDGPU::sub0)
5731           .addImm(AMDGPU::sub0)
5732           .addReg(RsrcPtr, 0, AMDGPU::sub1)
5733           .addImm(AMDGPU::sub1);
5734     } else {
5735       // This is another variant; legalize Rsrc with waterfall loop from VGPRs
5736       // to SGPRs.
5737       CreatedBB = loadSRsrcFromVGPR(*this, MI, *Rsrc, MDT);
5738       return CreatedBB;
5739     }
5740   }
5741   return CreatedBB;
5742 }
5743 
/// Move a SALU instruction (and, transitively, every instruction that
/// consumes its results) over to the VALU.
///
/// Starting from \p TopInst, a worklist is processed: each instruction is
/// either expanded by a special-case lowering (64-bit scalar ops, carry
/// pseudos, pack instructions, selects, ...) or converted in place to the
/// VALU opcode reported by getVALUOp(). Whenever a result register is
/// remapped to a VGPR-class vreg, the users of that register are queued so
/// they get moved as well.
///
/// \param TopInst the scalar instruction to start from.
/// \param MDT dominator tree, kept up to date when legalization creates
///        control flow (e.g. waterfall loops).
/// \return the last basic block created while legalizing operands if it now
///         contains \p TopInst's parent block, otherwise nullptr.
MachineBasicBlock *SIInstrInfo::moveToVALU(MachineInstr &TopInst,
                                           MachineDominatorTree *MDT) const {
  SetVectorType Worklist;
  Worklist.insert(&TopInst);
  MachineBasicBlock *CreatedBB = nullptr;
  MachineBasicBlock *CreatedBBTmp = nullptr;

  while (!Worklist.empty()) {
    MachineInstr &Inst = *Worklist.pop_back_val();
    MachineBasicBlock *MBB = Inst.getParent();
    MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();

    unsigned Opcode = Inst.getOpcode();
    unsigned NewOpcode = getVALUOp(Inst);

    // Handle some special cases
    switch (Opcode) {
    default:
      break;
    case AMDGPU::S_ADD_U64_PSEUDO:
    case AMDGPU::S_SUB_U64_PSEUDO:
      splitScalar64BitAddSub(Worklist, Inst, MDT);
      Inst.eraseFromParent();
      continue;
    case AMDGPU::S_ADD_I32:
    case AMDGPU::S_SUB_I32: {
      // FIXME: The u32 versions currently selected use the carry.
      bool Changed;
      std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
      if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp)
        CreatedBB = CreatedBBTmp;
      if (Changed)
        continue;

      // Default handling
      break;
    }
    case AMDGPU::S_AND_B64:
      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_OR_B64:
      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_XOR_B64:
      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_NAND_B64:
      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_NOR_B64:
      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_XNOR_B64:
      if (ST.hasDLInsts())
        splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
      else
        splitScalar64BitXnor(Worklist, Inst, MDT);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_ANDN2_B64:
      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_ORN2_B64:
      splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_BREV_B64:
      splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_NOT_B64:
      splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_BCNT1_I32_B64:
      splitScalar64BitBCNT(Worklist, Inst);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_BFE_I64:
      splitScalar64BitBFE(Worklist, Inst);
      Inst.eraseFromParent();
      continue;

    // Shifts: on subtargets with only the "rev" VALU forms, swap the operand
    // order to match the reversed opcode and fall through to the default
    // in-place conversion below.
    case AMDGPU::S_LSHL_B32:
      if (ST.hasOnlyRevVALUShifts()) {
        NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
        swapOperands(Inst);
      }
      break;
    case AMDGPU::S_ASHR_I32:
      if (ST.hasOnlyRevVALUShifts()) {
        NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
        swapOperands(Inst);
      }
      break;
    case AMDGPU::S_LSHR_B32:
      if (ST.hasOnlyRevVALUShifts()) {
        NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
        swapOperands(Inst);
      }
      break;
    case AMDGPU::S_LSHL_B64:
      if (ST.hasOnlyRevVALUShifts()) {
        NewOpcode = AMDGPU::V_LSHLREV_B64_e64;
        swapOperands(Inst);
      }
      break;
    case AMDGPU::S_ASHR_I64:
      if (ST.hasOnlyRevVALUShifts()) {
        NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
        swapOperands(Inst);
      }
      break;
    case AMDGPU::S_LSHR_B64:
      if (ST.hasOnlyRevVALUShifts()) {
        NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
        swapOperands(Inst);
      }
      break;

    case AMDGPU::S_ABS_I32:
      lowerScalarAbs(Worklist, Inst);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_CBRANCH_SCC0:
    case AMDGPU::S_CBRANCH_SCC1:
      // Clear unused bits of vcc
      if (ST.isWave32())
        BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B32),
                AMDGPU::VCC_LO)
            .addReg(AMDGPU::EXEC_LO)
            .addReg(AMDGPU::VCC_LO);
      else
        BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64),
                AMDGPU::VCC)
            .addReg(AMDGPU::EXEC)
            .addReg(AMDGPU::VCC);
      break;

    case AMDGPU::S_BFE_U64:
    case AMDGPU::S_BFM_B64:
      llvm_unreachable("Moving this op to VALU not implemented");

    case AMDGPU::S_PACK_LL_B32_B16:
    case AMDGPU::S_PACK_LH_B32_B16:
    case AMDGPU::S_PACK_HH_B32_B16:
      movePackToVALU(Worklist, MRI, Inst);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_XNOR_B32:
      lowerScalarXnor(Worklist, Inst);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_NAND_B32:
      splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_NOR_B32:
      splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_ANDN2_B32:
      splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
      Inst.eraseFromParent();
      continue;

    case AMDGPU::S_ORN2_B32:
      splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
      Inst.eraseFromParent();
      continue;

    // TODO: remove as soon as everything is ready
    // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
    // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
    // can only be selected from the uniform SDNode.
    case AMDGPU::S_ADD_CO_PSEUDO:
    case AMDGPU::S_SUB_CO_PSEUDO: {
      unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
                         ? AMDGPU::V_ADDC_U32_e64
                         : AMDGPU::V_SUBB_U32_e64;
      const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);

      // If the carry-in register can't be constrained to the lane-mask
      // class, copy it into a fresh register of that class first.
      Register CarryInReg = Inst.getOperand(4).getReg();
      if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
        Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
        BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
            .addReg(CarryInReg);
      }

      Register CarryOutReg = Inst.getOperand(1).getReg();

      Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
          MRI.getRegClass(Inst.getOperand(0).getReg())));
      MachineInstr *CarryOp =
          BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
              .addReg(CarryOutReg, RegState::Define)
              .add(Inst.getOperand(2))
              .add(Inst.getOperand(3))
              .addReg(CarryInReg)
              .addImm(0);
      CreatedBBTmp = legalizeOperands(*CarryOp);
      if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp)
        CreatedBB = CreatedBBTmp;
      MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
      addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
      Inst.eraseFromParent();
    }
      continue;
    case AMDGPU::S_UADDO_PSEUDO:
    case AMDGPU::S_USUBO_PSEUDO: {
      const DebugLoc &DL = Inst.getDebugLoc();
      MachineOperand &Dest0 = Inst.getOperand(0);
      MachineOperand &Dest1 = Inst.getOperand(1);
      MachineOperand &Src0 = Inst.getOperand(2);
      MachineOperand &Src1 = Inst.getOperand(3);

      unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
                         ? AMDGPU::V_ADD_CO_U32_e64
                         : AMDGPU::V_SUB_CO_U32_e64;
      const TargetRegisterClass *NewRC =
          RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
      Register DestReg = MRI.createVirtualRegister(NewRC);
      MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
                                   .addReg(Dest1.getReg(), RegState::Define)
                                   .add(Src0)
                                   .add(Src1)
                                   .addImm(0); // clamp bit

      CreatedBBTmp = legalizeOperands(*NewInstr, MDT);
      if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp)
        CreatedBB = CreatedBBTmp;

      MRI.replaceRegWith(Dest0.getReg(), DestReg);
      addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI,
                                   Worklist);
      Inst.eraseFromParent();
    }
      continue;

    case AMDGPU::S_CSELECT_B32:
    case AMDGPU::S_CSELECT_B64:
      lowerSelect(Worklist, Inst, MDT);
      Inst.eraseFromParent();
      continue;
    }

    if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
      // We cannot move this instruction to the VALU, so we should try to
      // legalize its operands instead.
      CreatedBBTmp = legalizeOperands(Inst, MDT);
      if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp)
        CreatedBB = CreatedBBTmp;
      continue;
    }

    // Use the new VALU Opcode.
    const MCInstrDesc &NewDesc = get(NewOpcode);
    Inst.setDesc(NewDesc);

    // Remove any references to SCC. Vector instructions can't read from it, and
    // We're just about to add the implicit use / defs of VCC, and we don't want
    // both.
    for (unsigned i = Inst.getNumOperands() - 1; i > 0; --i) {
      MachineOperand &Op = Inst.getOperand(i);
      if (Op.isReg() && Op.getReg() == AMDGPU::SCC) {
        // Only propagate through live-def of SCC.
        if (Op.isDef() && !Op.isDead())
          addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
        if (Op.isUse())
          addSCCDefsToVALUWorklist(Op, Worklist);
        Inst.RemoveOperand(i);
      }
    }

    if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
      // We are converting these to a BFE, so we need to add the missing
      // operands for the size and offset.
      unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
      Inst.addOperand(MachineOperand::CreateImm(0));
      Inst.addOperand(MachineOperand::CreateImm(Size));

    } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
      // The VALU version adds the second operand to the result, so insert an
      // extra 0 operand.
      Inst.addOperand(MachineOperand::CreateImm(0));
    }

    Inst.addImplicitDefUseOperands(*Inst.getParent()->getParent());
    fixImplicitOperands(Inst);

    if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
      const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
      // If we need to move this to VGPRs, we need to unpack the second operand
      // back into the 2 separate ones for bit offset and width.
      assert(OffsetWidthOp.isImm() &&
             "Scalar BFE is only implemented for constant width and offset");
      uint32_t Imm = OffsetWidthOp.getImm();

      uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
      uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
      Inst.RemoveOperand(2);                     // Remove old immediate.
      Inst.addOperand(MachineOperand::CreateImm(Offset));
      Inst.addOperand(MachineOperand::CreateImm(BitWidth));
    }

    // Remap the destination register to a fresh VGPR-class vreg so the users
    // of the old SGPR result can be queued for conversion as well.
    bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef();
    unsigned NewDstReg = AMDGPU::NoRegister;
    if (HasDst) {
      Register DstReg = Inst.getOperand(0).getReg();
      // Physical register defs can't be remapped; leave the instruction as-is.
      if (DstReg.isPhysical())
        continue;

      // Update the destination register class.
      const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
      if (!NewDstRC)
        continue;

      if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
          NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
        // Instead of creating a copy where src and dst are the same register
        // class, we just replace all uses of dst with src.  These kinds of
        // copies interfere with the heuristics MachineSink uses to decide
        // whether or not to split a critical edge.  Since the pass assumes
        // that copies will end up as machine instructions and not be
        // eliminated.
        addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
        MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
        MRI.clearKillFlags(Inst.getOperand(1).getReg());
        Inst.getOperand(0).setReg(DstReg);

        // Make sure we don't leave around a dead VGPR->SGPR copy. Normally
        // these are deleted later, but at -O0 it would leave a suspicious
        // looking illegal copy of an undef register.
        for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I)
          Inst.RemoveOperand(I);
        Inst.setDesc(get(AMDGPU::IMPLICIT_DEF));
        continue;
      }

      NewDstReg = MRI.createVirtualRegister(NewDstRC);
      MRI.replaceRegWith(DstReg, NewDstReg);
    }

    // Legalize the operands
    CreatedBBTmp = legalizeOperands(Inst, MDT);
    if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp)
      CreatedBB = CreatedBBTmp;

    if (HasDst)
     addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
  }
  return CreatedBB;
}
6119 
6120 // Add/sub require special handling to deal with carry outs.
6121 std::pair<bool, MachineBasicBlock *>
6122 SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
6123                               MachineDominatorTree *MDT) const {
6124   if (ST.hasAddNoCarry()) {
6125     // Assume there is no user of scc since we don't select this in that case.
6126     // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
6127     // is used.
6128 
6129     MachineBasicBlock &MBB = *Inst.getParent();
6130     MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
6131 
6132     Register OldDstReg = Inst.getOperand(0).getReg();
6133     Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6134 
6135     unsigned Opc = Inst.getOpcode();
6136     assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
6137 
6138     unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
6139       AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
6140 
6141     assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
6142     Inst.RemoveOperand(3);
6143 
6144     Inst.setDesc(get(NewOpc));
6145     Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
6146     Inst.addImplicitDefUseOperands(*MBB.getParent());
6147     MRI.replaceRegWith(OldDstReg, ResultReg);
6148     MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT);
6149 
6150     addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
6151     return std::make_pair(true, NewBB);
6152   }
6153 
6154   return std::make_pair(false, nullptr);
6155 }
6156 
/// Lower S_CSELECT_B32/B64 to the VALU as a V_CNDMASK_B32.
///
/// The scalar select reads SCC. This scans backwards from \p Inst for the
/// instruction defining SCC; when that def is a plain COPY into SCC, the
/// copy's source register is used as the condition instead of SCC itself.
/// A select of (-1, 0) on such a copied condition is just the condition
/// value, so it is replaced without emitting any new instruction.
///
/// \param Worklist worklist of instructions still to be moved to the VALU;
///        users of the rewritten result are added to it.
/// \param Inst the S_CSELECT instruction (erased by the caller afterwards).
/// \param MDT dominator tree forwarded to operand legalization.
void SIInstrInfo::lowerSelect(SetVectorType &Worklist, MachineInstr &Inst,
                              MachineDominatorTree *MDT) const {

  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineBasicBlock::iterator MII = Inst;
  DebugLoc DL = Inst.getDebugLoc();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src0 = Inst.getOperand(1);
  MachineOperand &Src1 = Inst.getOperand(2);
  MachineOperand &Cond = Inst.getOperand(3);

  Register SCCSource = Cond.getReg();
  // Find SCC def, and if that is a copy (SCC = COPY reg) then use reg instead.
  if (!Cond.isUndef()) {
    for (MachineInstr &CandI :
         make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)),
                    Inst.getParent()->rend())) {
      // Stop at the first instruction (walking backwards) that defines SCC.
      if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) !=
          -1) {
        if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
          SCCSource = CandI.getOperand(1).getReg();
        }
        break;
      }
    }
  }

  // If this is a trivial select where the condition is effectively not SCC
  // (SCCSource is a source of copy to SCC), then the select is semantically
  // equivalent to copying SCCSource. Hence, there is no need to create
  // V_CNDMASK, we can just use that and bail out.
  if ((SCCSource != AMDGPU::SCC) && Src0.isImm() && (Src0.getImm() == -1) &&
      Src1.isImm() && (Src1.getImm() == 0)) {
    MRI.replaceRegWith(Dest.getReg(), SCCSource);
    return;
  }

  const TargetRegisterClass *TC = ST.getWavefrontSize() == 64
                                      ? &AMDGPU::SReg_64_XEXECRegClass
                                      : &AMDGPU::SReg_32_XM0_XEXECRegClass;
  Register CopySCC = MRI.createVirtualRegister(TC);

  if (SCCSource == AMDGPU::SCC) {
    // Insert a trivial select instead of creating a copy, because a copy from
    // SCC would semantically mean just copying a single bit, but we may need
    // the result to be a vector condition mask that needs preserving.
    unsigned Opcode = (ST.getWavefrontSize() == 64) ? AMDGPU::S_CSELECT_B64
                                                    : AMDGPU::S_CSELECT_B32;
    auto NewSelect =
        BuildMI(MBB, MII, DL, get(Opcode), CopySCC).addImm(-1).addImm(0);
    // Mirror the original condition's undef flag onto operand 3 of the new
    // select (presumably its implicit SCC use -- confirm against the opcode).
    NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
  } else {
    BuildMI(MBB, MII, DL, get(AMDGPU::COPY), CopySCC).addReg(SCCSource);
  }

  Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  auto UpdatedInst =
      BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), ResultReg)
          .addImm(0)
          .add(Src1) // False
          .addImm(0)
          .add(Src0) // True
          .addReg(CopySCC);

  MRI.replaceRegWith(Dest.getReg(), ResultReg);
  legalizeOperands(*UpdatedInst, MDT);
  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}
6228 
6229 void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist,
6230                                  MachineInstr &Inst) const {
6231   MachineBasicBlock &MBB = *Inst.getParent();
6232   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
6233   MachineBasicBlock::iterator MII = Inst;
6234   DebugLoc DL = Inst.getDebugLoc();
6235 
6236   MachineOperand &Dest = Inst.getOperand(0);
6237   MachineOperand &Src = Inst.getOperand(1);
6238   Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6239   Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6240 
6241   unsigned SubOp = ST.hasAddNoCarry() ?
6242     AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32;
6243 
6244   BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
6245     .addImm(0)
6246     .addReg(Src.getReg());
6247 
6248   BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
6249     .addReg(Src.getReg())
6250     .addReg(TmpReg);
6251 
6252   MRI.replaceRegWith(Dest.getReg(), ResultReg);
6253   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
6254 }
6255 
// Lower a scalar XNOR. When the subtarget has V_XNOR (hasDLInsts), emit it
// directly with VGPR-legalized operands; otherwise expand to a scalar
// NOT + XOR pair and queue the new instructions on the worklist so a later
// pass over the worklist can move them to the VALU if needed.
void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist,
                                  MachineInstr &Inst) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineBasicBlock::iterator MII = Inst;
  const DebugLoc &DL = Inst.getDebugLoc();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src0 = Inst.getOperand(1);
  MachineOperand &Src1 = Inst.getOperand(2);

  if (ST.hasDLInsts()) {
    Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
    legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);

    BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
      .add(Src0)
      .add(Src1);

    MRI.replaceRegWith(Dest.getReg(), NewDest);
    addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
  } else {
    // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
    // invert either source and then perform the XOR. If either source is a
    // scalar register, then we can leave the inversion on the scalar unit to
    // achieve a better distribution of scalar and vector instructions.
    bool Src0IsSGPR = Src0.isReg() &&
                      RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
    bool Src1IsSGPR = Src1.isReg() &&
                      RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
    MachineInstr *Xor;
    Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

    // Build a pair of scalar instructions and add them to the work list.
    // The next iteration over the work list will lower these to the vector
    // unit as necessary.
    if (Src0IsSGPR) {
      BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0);
      Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
      .addReg(Temp)
      .add(Src1);
    } else if (Src1IsSGPR) {
      BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1);
      Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
      .add(Src0)
      .addReg(Temp);
    } else {
      // Neither source is an SGPR: XOR first, then NOT the result. The NOT
      // also needs to be queued for possible lowering in this case.
      Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
        .add(Src0)
        .add(Src1);
      MachineInstr *Not =
          BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp);
      Worklist.insert(Not);
    }

    MRI.replaceRegWith(Dest.getReg(), NewDest);

    Worklist.insert(Xor);

    addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
  }
}
6320 
6321 void SIInstrInfo::splitScalarNotBinop(SetVectorType &Worklist,
6322                                       MachineInstr &Inst,
6323                                       unsigned Opcode) const {
6324   MachineBasicBlock &MBB = *Inst.getParent();
6325   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
6326   MachineBasicBlock::iterator MII = Inst;
6327   const DebugLoc &DL = Inst.getDebugLoc();
6328 
6329   MachineOperand &Dest = Inst.getOperand(0);
6330   MachineOperand &Src0 = Inst.getOperand(1);
6331   MachineOperand &Src1 = Inst.getOperand(2);
6332 
6333   Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6334   Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6335 
6336   MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
6337     .add(Src0)
6338     .add(Src1);
6339 
6340   MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
6341     .addReg(Interm);
6342 
6343   Worklist.insert(&Op);
6344   Worklist.insert(&Not);
6345 
6346   MRI.replaceRegWith(Dest.getReg(), NewDest);
6347   addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
6348 }
6349 
6350 void SIInstrInfo::splitScalarBinOpN2(SetVectorType& Worklist,
6351                                      MachineInstr &Inst,
6352                                      unsigned Opcode) const {
6353   MachineBasicBlock &MBB = *Inst.getParent();
6354   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
6355   MachineBasicBlock::iterator MII = Inst;
6356   const DebugLoc &DL = Inst.getDebugLoc();
6357 
6358   MachineOperand &Dest = Inst.getOperand(0);
6359   MachineOperand &Src0 = Inst.getOperand(1);
6360   MachineOperand &Src1 = Inst.getOperand(2);
6361 
6362   Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6363   Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6364 
6365   MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
6366     .add(Src1);
6367 
6368   MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
6369     .add(Src0)
6370     .addReg(Interm);
6371 
6372   Worklist.insert(&Not);
6373   Worklist.insert(&Op);
6374 
6375   MRI.replaceRegWith(Dest.getReg(), NewDest);
6376   addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
6377 }
6378 
// Split a 64-bit scalar unary operation into two 32-bit operations on the
// sub0/sub1 halves and recombine the results with a REG_SEQUENCE. If
// \p Swap is set, the low and high result halves are exchanged.
void SIInstrInfo::splitScalar64BitUnaryOp(
    SetVectorType &Worklist, MachineInstr &Inst,
    unsigned Opcode, bool Swap) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src0 = Inst.getOperand(1);
  DebugLoc DL = Inst.getDebugLoc();

  MachineBasicBlock::iterator MII = Inst;

  const MCInstrDesc &InstDesc = get(Opcode);
  // An immediate source has no register class; fall back to SGPR_32.
  const TargetRegisterClass *Src0RC = Src0.isReg() ?
    MRI.getRegClass(Src0.getReg()) :
    &AMDGPU::SGPR_32RegClass;

  const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);

  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub0, Src0SubRC);

  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
  const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
  const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);

  Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
  MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);

  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub1, Src0SubRC);

  Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
  MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);

  if (Swap)
    std::swap(DestSub0, DestSub1);

  Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
    .addReg(DestSub0)
    .addImm(AMDGPU::sub0)
    .addReg(DestSub1)
    .addImm(AMDGPU::sub1);

  MRI.replaceRegWith(Dest.getReg(), FullDestReg);

  Worklist.insert(&LoHalf);
  Worklist.insert(&HiHalf);

  // We don't need to legalizeOperands here because for a single operand, src0
  // will support any kind of input.

  // Move all users of this moved value.
  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}
6435 
// Split S_ADD_U64_PSEUDO / S_SUB_U64_PSEUDO into 32-bit VALU add/sub with
// carry: the low half defines a carry register that the high half consumes.
void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist,
                                         MachineInstr &Inst,
                                         MachineDominatorTree *MDT) const {
  bool IsAdd = (Inst.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);

  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);

  Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
  Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  Register CarryReg = MRI.createVirtualRegister(CarryRC);
  Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src0 = Inst.getOperand(1);
  MachineOperand &Src1 = Inst.getOperand(2);
  const DebugLoc &DL = Inst.getDebugLoc();
  MachineBasicBlock::iterator MII = Inst;

  const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
  const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
  const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
  const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);

  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub0, Src0SubRC);
  MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
                                                       AMDGPU::sub0, Src1SubRC);


  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub1, Src0SubRC);
  MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
                                                       AMDGPU::sub1, Src1SubRC);

  // Low half: produces the carry consumed by the high half.
  unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
  MachineInstr *LoHalf =
    BuildMI(MBB, MII, DL, get(LoOpc), DestSub0)
    .addReg(CarryReg, RegState::Define)
    .add(SrcReg0Sub0)
    .add(SrcReg1Sub0)
    .addImm(0); // clamp bit

  // High half: carry-in from the low half; its own carry-out is dead.
  unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
  MachineInstr *HiHalf =
    BuildMI(MBB, MII, DL, get(HiOpc), DestSub1)
    .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
    .add(SrcReg0Sub1)
    .add(SrcReg1Sub1)
    .addReg(CarryReg, RegState::Kill)
    .addImm(0); // clamp bit

  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
    .addReg(DestSub0)
    .addImm(AMDGPU::sub0)
    .addReg(DestSub1)
    .addImm(AMDGPU::sub1);

  MRI.replaceRegWith(Dest.getReg(), FullDestReg);

  // Try to legalize the operands in case we need to swap the order to keep it
  // valid.
  legalizeOperands(*LoHalf, MDT);
  legalizeOperands(*HiHalf, MDT);

  // Move all users of this moved value.
  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}
6507 
// Split a 64-bit scalar binary operation into two 32-bit operations on the
// sub0/sub1 halves of each source and recombine with a REG_SEQUENCE.
void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist,
                                           MachineInstr &Inst, unsigned Opcode,
                                           MachineDominatorTree *MDT) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src0 = Inst.getOperand(1);
  MachineOperand &Src1 = Inst.getOperand(2);
  DebugLoc DL = Inst.getDebugLoc();

  MachineBasicBlock::iterator MII = Inst;

  const MCInstrDesc &InstDesc = get(Opcode);
  // Immediate sources have no register class; fall back to SGPR_32.
  const TargetRegisterClass *Src0RC = Src0.isReg() ?
    MRI.getRegClass(Src0.getReg()) :
    &AMDGPU::SGPR_32RegClass;

  const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
  const TargetRegisterClass *Src1RC = Src1.isReg() ?
    MRI.getRegClass(Src1.getReg()) :
    &AMDGPU::SGPR_32RegClass;

  const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);

  MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub0, Src0SubRC);
  MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
                                                       AMDGPU::sub0, Src1SubRC);
  MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                       AMDGPU::sub1, Src0SubRC);
  MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
                                                       AMDGPU::sub1, Src1SubRC);

  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
  const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
  const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);

  Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
  MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
                              .add(SrcReg0Sub0)
                              .add(SrcReg1Sub0);

  Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
  MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
                              .add(SrcReg0Sub1)
                              .add(SrcReg1Sub1);

  Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
    .addReg(DestSub0)
    .addImm(AMDGPU::sub0)
    .addReg(DestSub1)
    .addImm(AMDGPU::sub1);

  MRI.replaceRegWith(Dest.getReg(), FullDestReg);

  Worklist.insert(&LoHalf);
  Worklist.insert(&HiHalf);

  // Move all users of this moved value.
  addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}
6571 
// Lower a 64-bit scalar XNOR as S_NOT_B64 of one source followed by
// S_XOR_B64. The NOT is applied to Src0 when Src0 is an SGPR, otherwise to
// Src1; only the XOR is queued for further lowering.
void SIInstrInfo::splitScalar64BitXnor(SetVectorType &Worklist,
                                       MachineInstr &Inst,
                                       MachineDominatorTree *MDT) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src0 = Inst.getOperand(1);
  MachineOperand &Src1 = Inst.getOperand(2);
  const DebugLoc &DL = Inst.getDebugLoc();

  MachineBasicBlock::iterator MII = Inst;

  const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());

  Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);

  MachineOperand* Op0;
  MachineOperand* Op1;

  // Prefer inverting the SGPR source so the NOT can stay on the scalar unit.
  if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
    Op0 = &Src0;
    Op1 = &Src1;
  } else {
    Op0 = &Src1;
    Op1 = &Src0;
  }

  BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
    .add(*Op0);

  Register NewDest = MRI.createVirtualRegister(DestRC);

  MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
    .addReg(Interm)
    .add(*Op1);

  MRI.replaceRegWith(Dest.getReg(), NewDest);

  Worklist.insert(&Xor);
}
6613 
// Split a 64-bit scalar bit-count into two V_BCNT_U32_B32 instructions:
// count the low half, then accumulate the high half's count on top of it.
void SIInstrInfo::splitScalar64BitBCNT(
    SetVectorType &Worklist, MachineInstr &Inst) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  MachineBasicBlock::iterator MII = Inst;
  const DebugLoc &DL = Inst.getDebugLoc();

  MachineOperand &Dest = Inst.getOperand(0);
  MachineOperand &Src = Inst.getOperand(1);

  const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
  // An immediate source has no register class; fall back to SGPR_32.
  const TargetRegisterClass *SrcRC = Src.isReg() ?
    MRI.getRegClass(Src.getReg()) :
    &AMDGPU::SGPR_32RegClass;

  Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0);

  MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
                                                      AMDGPU::sub0, SrcSubRC);
  MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
                                                      AMDGPU::sub1, SrcSubRC);

  BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);

  BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);

  MRI.replaceRegWith(Dest.getReg(), ResultReg);

  // We don't need to legalize operands here. src0 for either instruction can be
  // an SGPR, and the second input is unused or determined here.
  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}
6650 
// Split a 64-bit S_BFE_I64 sign-extend-in-register (offset 0, width <= 32)
// into 32-bit VALU operations: extract/sign-extend the low half and fill
// the high half with the sign bit via an arithmetic shift by 31.
void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist,
                                      MachineInstr &Inst) const {
  MachineBasicBlock &MBB = *Inst.getParent();
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
  MachineBasicBlock::iterator MII = Inst;
  const DebugLoc &DL = Inst.getDebugLoc();

  MachineOperand &Dest = Inst.getOperand(0);
  uint32_t Imm = Inst.getOperand(2).getImm();
  uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
  uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].

  (void) Offset; // Only used by the assert below.

  // Only sext_inreg cases handled.
  assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
         Offset == 0 && "Not implemented");

  if (BitWidth < 32) {
    Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);

    // Sign-extend the low BitWidth bits of the low half.
    BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo)
        .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
        .addImm(0)
        .addImm(BitWidth);

    // Broadcast the sign bit into the high half.
    BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
      .addImm(31)
      .addReg(MidRegLo);

    BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
      .addReg(MidRegLo)
      .addImm(AMDGPU::sub0)
      .addReg(MidRegHi)
      .addImm(AMDGPU::sub1);

    MRI.replaceRegWith(Dest.getReg(), ResultReg);
    addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
    return;
  }

  // BitWidth == 32: the low half is unchanged; only compute the sign-filled
  // high half.
  MachineOperand &Src = Inst.getOperand(1);
  Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);

  BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
    .addImm(31)
    .addReg(Src.getReg(), 0, AMDGPU::sub0);

  BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
    .addReg(Src.getReg(), 0, AMDGPU::sub0)
    .addImm(AMDGPU::sub0)
    .addReg(TmpReg)
    .addImm(AMDGPU::sub1);

  MRI.replaceRegWith(Dest.getReg(), ResultReg);
  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}
6711 
// Queue every user of \p DstReg whose relevant operand class still lacks
// vector registers, so those users are also moved to the VALU.
void SIInstrInfo::addUsersToMoveToVALUWorklist(
  Register DstReg,
  MachineRegisterInfo &MRI,
  SetVectorType &Worklist) const {
  for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
         E = MRI.use_end(); I != E;) {
    MachineInstr &UseMI = *I->getParent();

    unsigned OpNo = 0;

    // For these pass-through pseudos, check the class of the result
    // (operand 0); for anything else, check the class constraint of the
    // using operand itself.
    switch (UseMI.getOpcode()) {
    case AMDGPU::COPY:
    case AMDGPU::WQM:
    case AMDGPU::SOFT_WQM:
    case AMDGPU::STRICT_WWM:
    case AMDGPU::STRICT_WQM:
    case AMDGPU::REG_SEQUENCE:
    case AMDGPU::PHI:
    case AMDGPU::INSERT_SUBREG:
      break;
    default:
      OpNo = I.getOperandNo();
      break;
    }

    if (!RI.hasVectorRegisters(getOpRegClass(UseMI, OpNo))) {
      Worklist.insert(&UseMI);

      // Skip the remaining uses belonging to this same instruction; it has
      // already been queued once.
      do {
        ++I;
      } while (I != E && I->getParent() == &UseMI);
    } else {
      ++I;
    }
  }
}
6748 
// Lower an S_PACK_* pseudo to VALU instructions that assemble two 16-bit
// values into one 32-bit VGPR.
void SIInstrInfo::movePackToVALU(SetVectorType &Worklist,
                                 MachineRegisterInfo &MRI,
                                 MachineInstr &Inst) const {
  Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  MachineBasicBlock *MBB = Inst.getParent();
  MachineOperand &Src0 = Inst.getOperand(1);
  MachineOperand &Src1 = Inst.getOperand(2);
  const DebugLoc &DL = Inst.getDebugLoc();

  switch (Inst.getOpcode()) {
  case AMDGPU::S_PACK_LL_B32_B16: {
    // Result = (Src0 & 0xffff) | (Src1 << 16).
    Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
    // 0.
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
      .addImm(0xffff);

    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
      .addReg(ImmReg, RegState::Kill)
      .add(Src0);

    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
      .add(Src1)
      .addImm(16)
      .addReg(TmpReg, RegState::Kill);
    break;
  }
  case AMDGPU::S_PACK_LH_B32_B16: {
    // BFI with a 0xffff mask selects the low half from Src0 and the high
    // half from Src1.
    Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
      .addImm(0xffff);
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32_e64), ResultReg)
      .addReg(ImmReg, RegState::Kill)
      .add(Src0)
      .add(Src1);
    break;
  }
  case AMDGPU::S_PACK_HH_B32_B16: {
    // Result = (Src0 >> 16) | (Src1 & 0xffff0000).
    Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
      .addImm(16)
      .add(Src0);
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
      .addImm(0xffff0000);
    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32_e64), ResultReg)
      .add(Src1)
      .addReg(ImmReg, RegState::Kill)
      .addReg(TmpReg, RegState::Kill);
    break;
  }
  default:
    llvm_unreachable("unhandled s_pack_* instruction");
  }

  MachineOperand &Dest = Inst.getOperand(0);
  MRI.replaceRegWith(Dest.getReg(), ResultReg);
  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}
6810 
// Queue the users of the SCC value defined by \p SCCDefInst so they can be
// rewritten when the defining instruction is moved to the VALU. COPYs from
// SCC are deleted after their interesting users are retargeted to VCC.
void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
                                               MachineInstr &SCCDefInst,
                                               SetVectorType &Worklist) const {
  bool SCCUsedImplicitly = false;

  // Ensure that def inst defines SCC, which is still live.
  assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
         !Op.isDead() && Op.getParent() == &SCCDefInst);
  SmallVector<MachineInstr *, 4> CopyToDelete;
  // This assumes that all the users of SCC are in the same block
  // as the SCC def.
  for (MachineInstr &MI : // Skip the def inst itself.
       make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
                  SCCDefInst.getParent()->end())) {
    // Check if SCC is used first.
    if (MI.findRegisterUseOperandIdx(AMDGPU::SCC, false, &RI) != -1) {
      if (MI.isCopy()) {
        MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
        Register DestReg = MI.getOperand(0).getReg();

        // Retarget users of the copied SCC value directly to VCC.
        for (auto &User : MRI.use_nodbg_instructions(DestReg)) {
          if ((User.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO) ||
              (User.getOpcode() == AMDGPU::S_SUB_CO_PSEUDO)) {
            User.getOperand(4).setReg(RI.getVCC());
            Worklist.insert(&User);
          } else if (User.getOpcode() == AMDGPU::V_CNDMASK_B32_e64) {
            User.getOperand(5).setReg(RI.getVCC());
            // No need to add to Worklist.
          }
        }
        CopyToDelete.push_back(&MI);
      } else {
        if (MI.getOpcode() == AMDGPU::S_CSELECT_B32 ||
            MI.getOpcode() == AMDGPU::S_CSELECT_B64) {
          // This is an implicit use of SCC and it is really expected by
          // the SCC users to handle.
          // We cannot preserve the edge to the user so add the explicit
          // copy: SCC = COPY VCC.
          // The copy will be cleaned up during the processing of the user
          // in lowerSelect.
          SCCUsedImplicitly = true;
        }

        Worklist.insert(&MI);
      }
    }
    // Exit if we find another SCC def.
    if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) != -1)
      break;
  }
  for (auto &Copy : CopyToDelete)
    Copy->eraseFromParent();

  if (SCCUsedImplicitly) {
    BuildMI(*SCCDefInst.getParent(), std::next(SCCDefInst.getIterator()),
            SCCDefInst.getDebugLoc(), get(AMDGPU::COPY), AMDGPU::SCC)
        .addReg(RI.getVCC());
  }
}
6870 
// Instructions that use SCC may be converted to VALU instructions. When that
// happens, the SCC register is changed to VCC_LO. The instruction that defines
// SCC must be changed to an instruction that defines VCC. This function makes
// sure that the instruction that defines SCC is added to the moveToVALU
// worklist.
void SIInstrInfo::addSCCDefsToVALUWorklist(MachineOperand &Op,
                                           SetVectorType &Worklist) const {
  assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isUse());

  MachineInstr *SCCUseInst = Op.getParent();
  // Look for a preceding instruction that either defines VCC or SCC. If VCC
  // then there is nothing to do because the defining instruction has been
  // converted to a VALU already. If SCC then that instruction needs to be
  // converted to a VALU.
  for (MachineInstr &MI :
       make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst)),
                  SCCUseInst->getParent()->rend())) {
    if (MI.modifiesRegister(AMDGPU::VCC, &RI))
      break;
    if (MI.definesRegister(AMDGPU::SCC, &RI)) {
      Worklist.insert(&MI);
      break;
    }
  }
}
6896 
// Compute the vector (VGPR or AGPR) register class the destination of
// \p Inst should use once the instruction is moved to the VALU. Returns
// nullptr when no retyping is needed or no equivalent class exists.
const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
  const MachineInstr &Inst) const {
  const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);

  switch (Inst.getOpcode()) {
  // For target instructions, getOpRegClass just returns the virtual register
  // class associated with the operand, so we need to find an equivalent VGPR
  // register class in order to move the instruction to the VALU.
  case AMDGPU::COPY:
  case AMDGPU::PHI:
  case AMDGPU::REG_SEQUENCE:
  case AMDGPU::INSERT_SUBREG:
  case AMDGPU::WQM:
  case AMDGPU::SOFT_WQM:
  case AMDGPU::STRICT_WWM:
  case AMDGPU::STRICT_WQM: {
    const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
    if (RI.hasAGPRs(SrcRC)) {
      // AGPR source: the destination is already AGPR-capable, or must become
      // so for the pseudos that merely forward their inputs.
      if (RI.hasAGPRs(NewDstRC))
        return nullptr;

      switch (Inst.getOpcode()) {
      case AMDGPU::PHI:
      case AMDGPU::REG_SEQUENCE:
      case AMDGPU::INSERT_SUBREG:
        NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);
        break;
      default:
        NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
      }

      if (!NewDstRC)
        return nullptr;
    } else {
      if (RI.hasVGPRs(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
        return nullptr;

      NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
      if (!NewDstRC)
        return nullptr;
    }

    return NewDstRC;
  }
  default:
    return NewDstRC;
  }
}
6945 
// Find the one SGPR operand we are allowed to use (the constant bus permits
// only a single SGPR read). Returns that SGPR, or NoRegister if the operands
// impose no such constraint.
Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
                                   int OpIndices[3]) const {
  const MCInstrDesc &Desc = MI.getDesc();

  // Find the one SGPR operand we are allowed to use.
  //
  // First we need to consider the instruction's operand requirements before
  // legalizing. Some operands are required to be SGPRs, such as implicit uses
  // of VCC, but we are still bound by the constant bus requirement to only use
  // one.
  //
  // If the operand's class is an SGPR, we can never move it.

  Register SGPRReg = findImplicitSGPRRead(MI);
  if (SGPRReg != AMDGPU::NoRegister)
    return SGPRReg;

  Register UsedSGPRs[3] = { AMDGPU::NoRegister };
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();

  for (unsigned i = 0; i < 3; ++i) {
    int Idx = OpIndices[i];
    if (Idx == -1)
      break;

    const MachineOperand &MO = MI.getOperand(Idx);
    if (!MO.isReg())
      continue;

    // Is this operand statically required to be an SGPR based on the operand
    // constraints?
    const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass);
    bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
    if (IsRequiredSGPR)
      return MO.getReg();

    // If this could be a VGPR or an SGPR, Check the dynamic register class.
    Register Reg = MO.getReg();
    const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
    if (RI.isSGPRClass(RegRC))
      UsedSGPRs[i] = Reg;
  }

  // We don't have a required SGPR operand, so we have a bit more freedom in
  // selecting operands to move.

  // Try to select the most used SGPR. If an SGPR is equal to one of the
  // others, we choose that.
  //
  // e.g.
  // V_FMA_F32 v0, s0, s0, s0 -> No moves
  // V_FMA_F32 v0, s0, s1, s0 -> Move s1

  // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
  // prefer those.

  if (UsedSGPRs[0] != AMDGPU::NoRegister) {
    if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
      SGPRReg = UsedSGPRs[0];
  }

  if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) {
    if (UsedSGPRs[1] == UsedSGPRs[2])
      SGPRReg = UsedSGPRs[1];
  }

  return SGPRReg;
}
7015 
7016 MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
7017                                              unsigned OperandName) const {
7018   int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
7019   if (Idx == -1)
7020     return nullptr;
7021 
7022   return &MI.getOperand(Idx);
7023 }
7024 
7025 uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
7026   if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
7027     return (AMDGPU::MTBUFFormat::UFMT_32_FLOAT << 44) |
7028            (1ULL << 56) | // RESOURCE_LEVEL = 1
7029            (3ULL << 60); // OOB_SELECT = 3
7030   }
7031 
7032   uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
7033   if (ST.isAmdHsaOS()) {
7034     // Set ATC = 1. GFX9 doesn't have this bit.
7035     if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS)
7036       RsrcDataFormat |= (1ULL << 56);
7037 
7038     // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
7039     // BTW, it disables TC L2 and therefore decreases performance.
7040     if (ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS)
7041       RsrcDataFormat |= (2ULL << 59);
7042   }
7043 
7044   return RsrcDataFormat;
7045 }
7046 
// Build words 2-3 of the scratch buffer resource descriptor: the default
// data format, TID_ENABLE, a maximal size field, plus generation-specific
// element-size and index-stride settings.
uint64_t SIInstrInfo::getScratchRsrcWords23() const {
  uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
                    AMDGPU::RSRC_TID_ENABLE |
                    0xffffffff; // Size;

  // GFX9 doesn't have ELEMENT_SIZE.
  if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1;
    Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
  }

  // IndexStride = 64 / 32.
  uint64_t IndexStride = ST.getWavefrontSize() == 64 ? 3 : 2;
  Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;

  // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
  // Clear them unless we want a huge stride.
  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
      ST.getGeneration() <= AMDGPUSubtarget::GFX9)
    Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;

  return Rsrc23;
}
7070 
7071 bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
7072   unsigned Opc = MI.getOpcode();
7073 
7074   return isSMRD(Opc);
7075 }
7076 
7077 bool SIInstrInfo::isHighLatencyDef(int Opc) const {
7078   return get(Opc).mayLoad() &&
7079          (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc));
7080 }
7081 
7082 unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI,
7083                                     int &FrameIndex) const {
7084   const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
7085   if (!Addr || !Addr->isFI())
7086     return AMDGPU::NoRegister;
7087 
7088   assert(!MI.memoperands_empty() &&
7089          (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
7090 
7091   FrameIndex = Addr->getIndex();
7092   return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
7093 }
7094 
7095 unsigned SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
7096                                         int &FrameIndex) const {
7097   const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
7098   assert(Addr && Addr->isFI());
7099   FrameIndex = Addr->getIndex();
7100   return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
7101 }
7102 
7103 unsigned SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
7104                                           int &FrameIndex) const {
7105   if (!MI.mayLoad())
7106     return AMDGPU::NoRegister;
7107 
7108   if (isMUBUF(MI) || isVGPRSpill(MI))
7109     return isStackAccess(MI, FrameIndex);
7110 
7111   if (isSGPRSpill(MI))
7112     return isSGPRStackAccess(MI, FrameIndex);
7113 
7114   return AMDGPU::NoRegister;
7115 }
7116 
7117 unsigned SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
7118                                          int &FrameIndex) const {
7119   if (!MI.mayStore())
7120     return AMDGPU::NoRegister;
7121 
7122   if (isMUBUF(MI) || isVGPRSpill(MI))
7123     return isStackAccess(MI, FrameIndex);
7124 
7125   if (isSGPRSpill(MI))
7126     return isSGPRStackAccess(MI, FrameIndex);
7127 
7128   return AMDGPU::NoRegister;
7129 }
7130 
7131 unsigned SIInstrInfo::getInstBundleSize(const MachineInstr &MI) const {
7132   unsigned Size = 0;
7133   MachineBasicBlock::const_instr_iterator I = MI.getIterator();
7134   MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
7135   while (++I != E && I->isInsideBundle()) {
7136     assert(!I->isBundle() && "No nested bundle!");
7137     Size += getInstSizeInBytes(*I);
7138   }
7139 
7140   return Size;
7141 }
7142 
7143 unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
7144   unsigned Opc = MI.getOpcode();
7145   const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);
7146   unsigned DescSize = Desc.getSize();
7147 
7148   // If we have a definitive size, we can use it. Otherwise we need to inspect
7149   // the operands to know the size.
7150   if (isFixedSize(MI)) {
7151     unsigned Size = DescSize;
7152 
7153     // If we hit the buggy offset, an extra nop will be inserted in MC so
7154     // estimate the worst case.
7155     if (MI.isBranch() && ST.hasOffset3fBug())
7156       Size += 4;
7157 
7158     return Size;
7159   }
7160 
7161   // 4-byte instructions may have a 32-bit literal encoded after them. Check
7162   // operands that coud ever be literals.
7163   if (isVALU(MI) || isSALU(MI)) {
7164     int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
7165     if (Src0Idx == -1)
7166       return DescSize; // No operands.
7167 
7168     if (isLiteralConstantLike(MI.getOperand(Src0Idx), Desc.OpInfo[Src0Idx]))
7169       return isVOP3(MI) ? 12 : (DescSize + 4);
7170 
7171     int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
7172     if (Src1Idx == -1)
7173       return DescSize;
7174 
7175     if (isLiteralConstantLike(MI.getOperand(Src1Idx), Desc.OpInfo[Src1Idx]))
7176       return isVOP3(MI) ? 12 : (DescSize + 4);
7177 
7178     int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
7179     if (Src2Idx == -1)
7180       return DescSize;
7181 
7182     if (isLiteralConstantLike(MI.getOperand(Src2Idx), Desc.OpInfo[Src2Idx]))
7183       return isVOP3(MI) ? 12 : (DescSize + 4);
7184 
7185     return DescSize;
7186   }
7187 
7188   // Check whether we have extra NSA words.
7189   if (isMIMG(MI)) {
7190     int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
7191     if (VAddr0Idx < 0)
7192       return 8;
7193 
7194     int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
7195     return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
7196   }
7197 
7198   switch (Opc) {
7199   case TargetOpcode::BUNDLE:
7200     return getInstBundleSize(MI);
7201   case TargetOpcode::INLINEASM:
7202   case TargetOpcode::INLINEASM_BR: {
7203     const MachineFunction *MF = MI.getParent()->getParent();
7204     const char *AsmStr = MI.getOperand(0).getSymbolName();
7205     return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST);
7206   }
7207   default:
7208     if (MI.isMetaInstruction())
7209       return 0;
7210     return DescSize;
7211   }
7212 }
7213 
7214 bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {
7215   if (!isFLAT(MI))
7216     return false;
7217 
7218   if (MI.memoperands_empty())
7219     return true;
7220 
7221   for (const MachineMemOperand *MMO : MI.memoperands()) {
7222     if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
7223       return true;
7224   }
7225   return false;
7226 }
7227 
7228 bool SIInstrInfo::isNonUniformBranchInstr(MachineInstr &Branch) const {
7229   return Branch.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO;
7230 }
7231 
// Rewrite the divergent conditional branch terminating \p IfEntry into the
// structured SI_IF / SI_END_CF pseudo pair: SI_IF replaces the branch at the
// end of \p IfEntry, and the matching SI_END_CF is placed after the PHIs of
// the merge block \p IfEnd.
void SIInstrInfo::convertNonUniformIfRegion(MachineBasicBlock *IfEntry,
                                            MachineBasicBlock *IfEnd) const {
  MachineBasicBlock::iterator TI = IfEntry->getFirstTerminator();
  assert(TI != IfEntry->end());

  MachineInstr *Branch = &(*TI);
  MachineFunction *MF = IfEntry->getParent();
  MachineRegisterInfo &MRI = IfEntry->getParent()->getRegInfo();

  if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
    // SI_IF defines a fresh bool register that the matching SI_END_CF
    // consumes; the branch's original operands are carried over.
    Register DstReg = MRI.createVirtualRegister(RI.getBoolRC());
    MachineInstr *SIIF =
        BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg)
            .add(Branch->getOperand(0))
            .add(Branch->getOperand(1));
    MachineInstr *SIEND =
        BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_END_CF))
            .addReg(DstReg);

    // Replace the old branch with SI_IF and close the region in IfEnd,
    // skipping over its PHI nodes.
    IfEntry->erase(TI);
    IfEntry->insert(IfEntry->end(), SIIF);
    IfEnd->insert(IfEnd->getFirstNonPHI(), SIEND);
  }
}
7256 
// Rewrite the divergent backedge branch at the end of \p LoopEnd into the
// structured SI_IF_BREAK / SI_LOOP pseudo pair, threading the break mask
// through a PHI inserted at the top of the loop header \p LoopEntry.
void SIInstrInfo::convertNonUniformLoopRegion(
    MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const {
  MachineBasicBlock::iterator TI = LoopEnd->getFirstTerminator();
  // We expect 2 terminators, one conditional and one unconditional.
  assert(TI != LoopEnd->end());

  MachineInstr *Branch = &(*TI);
  MachineFunction *MF = LoopEnd->getParent();
  MachineRegisterInfo &MRI = LoopEnd->getParent()->getRegInfo();

  if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {

    // DstReg is the accumulated break mask (the header PHI's result);
    // BackEdgeReg is its updated value defined by SI_IF_BREAK below.
    Register DstReg = MRI.createVirtualRegister(RI.getBoolRC());
    Register BackEdgeReg = MRI.createVirtualRegister(RI.getBoolRC());
    MachineInstrBuilder HeaderPHIBuilder =
        BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg);
    for (MachineBasicBlock::pred_iterator PI = LoopEntry->pred_begin(),
                                          E = LoopEntry->pred_end();
         PI != E; ++PI) {
      if (*PI == LoopEnd) {
        // Incoming along the backedge: the updated mask.
        HeaderPHIBuilder.addReg(BackEdgeReg);
      } else {
        // Incoming from outside the loop: seed the mask with zero,
        // materialized just before the predecessor's terminators.
        MachineBasicBlock *PMBB = *PI;
        Register ZeroReg = MRI.createVirtualRegister(RI.getBoolRC());
        materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(),
                             ZeroReg, 0);
        HeaderPHIBuilder.addReg(ZeroReg);
      }
      HeaderPHIBuilder.addMBB(*PI);
    }
    MachineInstr *HeaderPhi = HeaderPHIBuilder;
    MachineInstr *SIIFBREAK = BuildMI(*(MF), Branch->getDebugLoc(),
                                      get(AMDGPU::SI_IF_BREAK), BackEdgeReg)
                                  .addReg(DstReg)
                                  .add(Branch->getOperand(0));
    MachineInstr *SILOOP =
        BuildMI(*(MF), Branch->getDebugLoc(), get(AMDGPU::SI_LOOP))
            .addReg(BackEdgeReg)
            .addMBB(LoopEntry);

    // Install the header PHI and replace the old branch with the
    // break/loop pseudo pair.
    LoopEntry->insert(LoopEntry->begin(), HeaderPhi);
    LoopEnd->erase(TI);
    LoopEnd->insert(LoopEnd->end(), SIIFBREAK);
    LoopEnd->insert(LoopEnd->end(), SILOOP);
  }
}
7303 
7304 ArrayRef<std::pair<int, const char *>>
7305 SIInstrInfo::getSerializableTargetIndices() const {
7306   static const std::pair<int, const char *> TargetIndices[] = {
7307       {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
7308       {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
7309       {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
7310       {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
7311       {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
7312   return makeArrayRef(TargetIndices);
7313 }
7314 
7315 /// This is used by the post-RA scheduler (SchedulePostRAList.cpp).  The
7316 /// post-RA version of misched uses CreateTargetMIHazardRecognizer.
7317 ScheduleHazardRecognizer *
7318 SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
7319                                             const ScheduleDAG *DAG) const {
7320   return new GCNHazardRecognizer(DAG->MF);
7321 }
7322 
/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
/// pass. The caller takes ownership of the returned recognizer.
ScheduleHazardRecognizer *
SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const {
  return new GCNHazardRecognizer(MF);
}
7329 
7330 std::pair<unsigned, unsigned>
7331 SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
7332   return std::make_pair(TF & MO_MASK, TF & ~MO_MASK);
7333 }
7334 
7335 ArrayRef<std::pair<unsigned, const char *>>
7336 SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
7337   static const std::pair<unsigned, const char *> TargetFlags[] = {
7338     { MO_GOTPCREL, "amdgpu-gotprel" },
7339     { MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" },
7340     { MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" },
7341     { MO_REL32_LO, "amdgpu-rel32-lo" },
7342     { MO_REL32_HI, "amdgpu-rel32-hi" },
7343     { MO_ABS32_LO, "amdgpu-abs32-lo" },
7344     { MO_ABS32_HI, "amdgpu-abs32-hi" },
7345   };
7346 
7347   return makeArrayRef(TargetFlags);
7348 }
7349 
7350 bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const {
7351   return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
7352          MI.modifiesRegister(AMDGPU::EXEC, &RI);
7353 }
7354 
7355 MachineInstrBuilder
7356 SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
7357                            MachineBasicBlock::iterator I,
7358                            const DebugLoc &DL,
7359                            Register DestReg) const {
7360   if (ST.hasAddNoCarry())
7361     return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
7362 
7363   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7364   Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
7365   MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC());
7366 
7367   return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
7368            .addReg(UnusedCarry, RegState::Define | RegState::Dead);
7369 }
7370 
7371 MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
7372                                                MachineBasicBlock::iterator I,
7373                                                const DebugLoc &DL,
7374                                                Register DestReg,
7375                                                RegScavenger &RS) const {
7376   if (ST.hasAddNoCarry())
7377     return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg);
7378 
7379   // If available, prefer to use vcc.
7380   Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC)
7381                              ? Register(RI.getVCC())
7382                              : RS.scavengeRegister(RI.getBoolRC(), I, 0, false);
7383 
7384   // TODO: Users need to deal with this.
7385   if (!UnusedCarry.isValid())
7386     return MachineInstrBuilder();
7387 
7388   return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
7389            .addReg(UnusedCarry, RegState::Define | RegState::Dead);
7390 }
7391 
7392 bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
7393   switch (Opcode) {
7394   case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
7395   case AMDGPU::SI_KILL_I1_TERMINATOR:
7396     return true;
7397   default:
7398     return false;
7399   }
7400 }
7401 
7402 const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) const {
7403   switch (Opcode) {
7404   case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
7405     return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
7406   case AMDGPU::SI_KILL_I1_PSEUDO:
7407     return get(AMDGPU::SI_KILL_I1_TERMINATOR);
7408   default:
7409     llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
7410   }
7411 }
7412 
7413 void SIInstrInfo::fixImplicitOperands(MachineInstr &MI) const {
7414   if (!ST.isWave32())
7415     return;
7416 
7417   for (auto &Op : MI.implicit_operands()) {
7418     if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
7419       Op.setReg(AMDGPU::VCC_LO);
7420   }
7421 }
7422 
7423 bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
7424   if (!isSMRD(MI))
7425     return false;
7426 
7427   // Check that it is using a buffer resource.
7428   int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
7429   if (Idx == -1) // e.g. s_memtime
7430     return false;
7431 
7432   const auto RCID = MI.getDesc().OpInfo[Idx].RegClass;
7433   return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
7434 }
7435 
7436 // Depending on the used address space and instructions, some immediate offsets
7437 // are allowed and some are not.
7438 // In general, flat instruction offsets can only be non-negative, global and
7439 // scratch instruction offsets can also be negative.
7440 //
7441 // There are several bugs related to these offsets:
7442 // On gfx10.1, flat instructions that go into the global address space cannot
7443 // use an offset.
7444 //
7445 // For scratch instructions, the address can be either an SGPR or a VGPR.
7446 // The following offsets can be used, depending on the architecture (x means
7447 // cannot be used):
7448 // +----------------------------+------+------+
7449 // | Address-Mode               | SGPR | VGPR |
7450 // +----------------------------+------+------+
7451 // | gfx9                       |      |      |
7452 // | negative, 4-aligned offset | x    | ok   |
7453 // | negative, unaligned offset | x    | ok   |
7454 // +----------------------------+------+------+
7455 // | gfx10                      |      |      |
7456 // | negative, 4-aligned offset | ok   | ok   |
7457 // | negative, unaligned offset | ok   | x    |
7458 // +----------------------------+------+------+
7459 // | gfx10.3                    |      |      |
7460 // | negative, 4-aligned offset | ok   | ok   |
7461 // | negative, unaligned offset | ok   | ok   |
7462 // +----------------------------+------+------+
7463 //
7464 // This function ignores the addressing mode, so if an offset cannot be used in
7465 // one addressing mode, it is considered illegal.
bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
                                    uint64_t FlatVariant) const {
  // TODO: Should 0 be special cased?
  if (!ST.hasFlatInstOffsets())
    return false;

  // Subtargets with the flat-segment-offset bug cannot use an offset on FLAT
  // instructions that may address the flat/global segments.
  if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT &&
      (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
       AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
    return false;

  // Only the global/scratch variants have signed offsets, and scratch loses
  // them again on subtargets with the negative-scratch-offset bug.
  bool Signed = FlatVariant != SIInstrFlags::FLAT;
  if (ST.hasNegativeScratchOffsetBug() &&
      FlatVariant == SIInstrFlags::FlatScratch)
    Signed = false;
  // Some subtargets additionally reject negative scratch offsets that are
  // not 4-aligned (see the table above).
  if (ST.hasNegativeUnalignedScratchOffsetBug() &&
      FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
      (Offset % 4) != 0) {
    return false;
  }

  // Finally, the offset must fit in the encodable field width.
  unsigned N = AMDGPU::getNumFlatOffsetBits(ST, Signed);
  return Signed ? isIntN(N, Offset) : isUIntN(N, Offset);
}
7490 
// See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what not.
//
// Split \p COffsetVal into {ImmField, RemainderOffset} such that ImmField is
// a legal immediate offset for \p FlatVariant and
// ImmField + RemainderOffset == COffsetVal.
std::pair<int64_t, int64_t>
SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
                             uint64_t FlatVariant) const {
  int64_t RemainderOffset = COffsetVal;
  int64_t ImmField = 0;
  // Mirror the signedness rules of isLegalFLATOffset.
  bool Signed = FlatVariant != SIInstrFlags::FLAT;
  if (ST.hasNegativeScratchOffsetBug() &&
      FlatVariant == SIInstrFlags::FlatScratch)
    Signed = false;

  const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST, Signed);
  if (Signed) {
    // Use signed division by a power of two to truncate towards 0.
    int64_t D = 1LL << (NumBits - 1);
    RemainderOffset = (COffsetVal / D) * D;
    ImmField = COffsetVal - RemainderOffset;

    if (ST.hasNegativeUnalignedScratchOffsetBug() &&
        FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 &&
        (ImmField % 4) != 0) {
      // Make ImmField a multiple of 4
      RemainderOffset += ImmField % 4;
      ImmField -= ImmField % 4;
    }
  } else if (COffsetVal >= 0) {
    // Unsigned case: the low NumBits of the offset go into the immediate.
    // (Negative unsigned offsets fall through and leave ImmField == 0.)
    ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits);
    RemainderOffset = COffsetVal - ImmField;
  }

  assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant));
  assert(RemainderOffset + ImmField == COffsetVal);
  return {ImmField, RemainderOffset};
}
7525 
// This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td
// (these values are passed to AMDGPU::getMCOpcode to select the encoding
// table for a pseudo instruction).
enum SIEncodingFamily {
  SI = 0,
  VI = 1,
  SDWA = 2,
  SDWA9 = 3,
  GFX80 = 4,
  GFX9 = 5,
  GFX10 = 6,
  SDWA10 = 7,
  GFX90A = 8
};
7538 
7539 static SIEncodingFamily subtargetEncodingFamily(const GCNSubtarget &ST) {
7540   switch (ST.getGeneration()) {
7541   default:
7542     break;
7543   case AMDGPUSubtarget::SOUTHERN_ISLANDS:
7544   case AMDGPUSubtarget::SEA_ISLANDS:
7545     return SIEncodingFamily::SI;
7546   case AMDGPUSubtarget::VOLCANIC_ISLANDS:
7547   case AMDGPUSubtarget::GFX9:
7548     return SIEncodingFamily::VI;
7549   case AMDGPUSubtarget::GFX10:
7550     return SIEncodingFamily::GFX10;
7551   }
7552   llvm_unreachable("Unknown subtarget generation!");
7553 }
7554 
// True for MC opcodes that exist only for assembler/disassembler support and
// must never be produced by code generation.
bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
  switch(MCOp) {
  // These opcodes use indirect register addressing so
  // they need special handling by codegen (currently missing).
  // Therefore it is too risky to allow these opcodes
  // to be selected by dpp combiner or sdwa peepholer.
  case AMDGPU::V_MOVRELS_B32_dpp_gfx10:
  case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:
  case AMDGPU::V_MOVRELD_B32_dpp_gfx10:
  case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:
  case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:
  case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:
  case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:
  case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:
    return true;
  default:
    return false;
  }
}
7574 
7575 int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
7576   SIEncodingFamily Gen = subtargetEncodingFamily(ST);
7577 
7578   if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 &&
7579     ST.getGeneration() == AMDGPUSubtarget::GFX9)
7580     Gen = SIEncodingFamily::GFX9;
7581 
7582   // Adjust the encoding family to GFX80 for D16 buffer instructions when the
7583   // subtarget has UnpackedD16VMem feature.
7584   // TODO: remove this when we discard GFX80 encoding.
7585   if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
7586     Gen = SIEncodingFamily::GFX80;
7587 
7588   if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {
7589     switch (ST.getGeneration()) {
7590     default:
7591       Gen = SIEncodingFamily::SDWA;
7592       break;
7593     case AMDGPUSubtarget::GFX9:
7594       Gen = SIEncodingFamily::SDWA9;
7595       break;
7596     case AMDGPUSubtarget::GFX10:
7597       Gen = SIEncodingFamily::SDWA10;
7598       break;
7599     }
7600   }
7601 
7602   int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
7603 
7604   // -1 means that Opcode is already a native instruction.
7605   if (MCOp == -1)
7606     return Opcode;
7607 
7608   if (ST.hasGFX90AInsts()) {
7609     uint16_t NMCOp = (uint16_t)-1;
7610       NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX90A);
7611     if (NMCOp == (uint16_t)-1)
7612       NMCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX9);
7613     if (NMCOp != (uint16_t)-1)
7614       MCOp = NMCOp;
7615   }
7616 
7617   // (uint16_t)-1 means that Opcode is a pseudo instruction that has
7618   // no encoding in the given subtarget generation.
7619   if (MCOp == (uint16_t)-1)
7620     return -1;
7621 
7622   if (isAsmOnlyOpcode(MCOp))
7623     return -1;
7624 
7625   return MCOp;
7626 }
7627 
7628 static
7629 TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd) {
7630   assert(RegOpnd.isReg());
7631   return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
7632                              getRegSubRegPair(RegOpnd);
7633 }
7634 
7635 TargetInstrInfo::RegSubRegPair
7636 llvm::getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg) {
7637   assert(MI.isRegSequence());
7638   for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
7639     if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
7640       auto &RegOp = MI.getOperand(1 + 2 * I);
7641       return getRegOrUndef(RegOp);
7642     }
7643   return TargetInstrInfo::RegSubRegPair();
7644 }
7645 
// Try to find the definition of reg:subreg in subreg-manipulation pseudos
// Following a subreg of reg:subreg isn't supported.
// On success, \p RSR is updated in place to the source feeding the requested
// subregister and true is returned; false means \p MI is not a pseudo this
// helper can look through.
static bool followSubRegDef(MachineInstr &MI,
                            TargetInstrInfo::RegSubRegPair &RSR) {
  if (!RSR.SubReg)
    return false;
  switch (MI.getOpcode()) {
  default: break;
  case AMDGPU::REG_SEQUENCE:
    RSR = getRegSequenceSubReg(MI, RSR.SubReg);
    return true;
  // EXTRACT_SUBREG isn't supported as this would follow a subreg of subreg
  case AMDGPU::INSERT_SUBREG:
    if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
      // inserted the subreg we're looking for
      RSR = getRegOrUndef(MI.getOperand(2));
    else { // the subreg in the rest of the reg
      auto R1 = getRegOrUndef(MI.getOperand(1));
      if (R1.SubReg) // subreg of subreg isn't supported
        return false;
      RSR.Reg = R1.Reg;
    }
    return true;
  }
  return false;
}
7672 
// Walk the SSA def chain of P.Reg:P.SubReg through copies, V_MOV, and the
// subreg-manipulation pseudos handled by followSubRegDef, returning the last
// defining instruction reached, or nullptr if the chain ends in an undef or
// a non-virtual register.
MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
                                     MachineRegisterInfo &MRI) {
  assert(MRI.isSSA());
  if (!P.Reg.isVirtual())
    return nullptr;

  auto RSR = P;
  auto *DefInst = MRI.getVRegDef(RSR.Reg);
  while (auto *MI = DefInst) {
    // DefInst is re-assigned below only when the chain can be followed
    // further; otherwise the loop reports MI.
    DefInst = nullptr;
    switch (MI->getOpcode()) {
    case AMDGPU::COPY:
    case AMDGPU::V_MOV_B32_e32: {
      auto &Op1 = MI->getOperand(1);
      if (Op1.isReg() && Op1.getReg().isVirtual()) {
        if (Op1.isUndef())
          return nullptr;
        RSR = getRegSubRegPair(Op1);
        DefInst = MRI.getVRegDef(RSR.Reg);
      }
      break;
    }
    default:
      if (followSubRegDef(*MI, RSR)) {
        if (!RSR.Reg)
          return nullptr;
        DefInst = MRI.getVRegDef(RSR.Reg);
      }
    }
    if (!DefInst)
      return MI;
  }
  return nullptr;
}
7707 
7708 bool llvm::execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI,
7709                                       Register VReg,
7710                                       const MachineInstr &DefMI,
7711                                       const MachineInstr &UseMI) {
7712   assert(MRI.isSSA() && "Must be run on SSA");
7713 
7714   auto *TRI = MRI.getTargetRegisterInfo();
7715   auto *DefBB = DefMI.getParent();
7716 
7717   // Don't bother searching between blocks, although it is possible this block
7718   // doesn't modify exec.
7719   if (UseMI.getParent() != DefBB)
7720     return true;
7721 
7722   const int MaxInstScan = 20;
7723   int NumInst = 0;
7724 
7725   // Stop scan at the use.
7726   auto E = UseMI.getIterator();
7727   for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
7728     if (I->isDebugInstr())
7729       continue;
7730 
7731     if (++NumInst > MaxInstScan)
7732       return true;
7733 
7734     if (I->modifiesRegister(AMDGPU::EXEC, TRI))
7735       return true;
7736   }
7737 
7738   return false;
7739 }
7740 
// Conservatively determine whether EXEC may change between \p DefMI and any
// use of \p VReg. Answers true ("maybe") when a use is outside the defining
// block or any scan limit is exceeded.
bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
                                         Register VReg,
                                         const MachineInstr &DefMI) {
  assert(MRI.isSSA() && "Must be run on SSA");

  auto *TRI = MRI.getTargetRegisterInfo();
  auto *DefBB = DefMI.getParent();

  // First count the (non-debug) uses, bailing out early if there are too
  // many or any lies in another block.
  const int MaxUseScan = 10;
  int NumUse = 0;

  for (auto &Use : MRI.use_nodbg_operands(VReg)) {
    auto &UseInst = *Use.getParent();
    // Don't bother searching between blocks, although it is possible this block
    // doesn't modify exec.
    if (UseInst.getParent() != DefBB)
      return true;

    if (++NumUse > MaxUseScan)
      return true;
  }

  if (NumUse == 0)
    return false;

  const int MaxInstScan = 20;
  int NumInst = 0;

  // Stop scan when we have seen all the uses.
  for (auto I = std::next(DefMI.getIterator()); ; ++I) {
    // All uses are in this block, so we must find them before falling off
    // the end.
    assert(I != DefBB->end());

    if (I->isDebugInstr())
      continue;

    if (++NumInst > MaxInstScan)
      return true;

    for (const MachineOperand &Op : I->operands()) {
      // We don't check reg masks here as they're used only on calls:
      // 1. EXEC is only considered const within one BB
      // 2. Call should be a terminator instruction if present in a BB

      if (!Op.isReg())
        continue;

      Register Reg = Op.getReg();
      if (Op.isUse()) {
        // NumUse hitting zero means every use was seen before any EXEC write.
        if (Reg == VReg && --NumUse == 0)
          return false;
      } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC))
        return true;
    }
  }
}
7796 
MachineInstr *SIInstrInfo::createPHIDestinationCopy(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator LastPHIIt,
    const DebugLoc &DL, Register Src, Register Dst) const {
  // If some non-PHI instruction near the top of the block already reads Dst,
  // place the copy directly before that reader instead of after all PHIs.
  // Note the do-while deliberately examines the first instruction even when
  // it equals LastPHIIt.
  auto Cur = MBB.begin();
  if (Cur != MBB.end())
    do {
      if (!Cur->isPHI() && Cur->readsRegister(Dst))
        return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src);
      ++Cur;
    } while (Cur != MBB.end() && Cur != LastPHIIt);

  // Otherwise fall back to the generic placement after the last PHI.
  return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src,
                                                   Dst);
}
7811 
7812 MachineInstr *SIInstrInfo::createPHISourceCopy(
7813     MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt,
7814     const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {
7815   if (InsPt != MBB.end() &&
7816       (InsPt->getOpcode() == AMDGPU::SI_IF ||
7817        InsPt->getOpcode() == AMDGPU::SI_ELSE ||
7818        InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
7819       InsPt->definesRegister(Src)) {
7820     InsPt++;
7821     return BuildMI(MBB, InsPt, DL,
7822                    get(ST.isWave32() ? AMDGPU::S_MOV_B32_term
7823                                      : AMDGPU::S_MOV_B64_term),
7824                    Dst)
7825         .addReg(Src, 0, SrcSubReg)
7826         .addReg(AMDGPU::EXEC, RegState::Implicit);
7827   }
7828   return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
7829                                               Dst);
7830 }
7831 
// True when the subtarget runs 32-lane wavefronts.
bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
7833 
7834 MachineInstr *SIInstrInfo::foldMemoryOperandImpl(
7835     MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
7836     MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
7837     VirtRegMap *VRM) const {
7838   // This is a bit of a hack (copied from AArch64). Consider this instruction:
7839   //
7840   //   %0:sreg_32 = COPY $m0
7841   //
7842   // We explicitly chose SReg_32 for the virtual register so such a copy might
7843   // be eliminated by RegisterCoalescer. However, that may not be possible, and
7844   // %0 may even spill. We can't spill $m0 normally (it would require copying to
7845   // a numbered SGPR anyway), and since it is in the SReg_32 register class,
7846   // TargetInstrInfo::foldMemoryOperand() is going to try.
7847   // A similar issue also exists with spilling and reloading $exec registers.
7848   //
7849   // To prevent that, constrain the %0 register class here.
7850   if (MI.isFullCopy()) {
7851     Register DstReg = MI.getOperand(0).getReg();
7852     Register SrcReg = MI.getOperand(1).getReg();
7853     if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
7854         (DstReg.isVirtual() != SrcReg.isVirtual())) {
7855       MachineRegisterInfo &MRI = MF.getRegInfo();
7856       Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
7857       const TargetRegisterClass *RC = MRI.getRegClass(VirtReg);
7858       if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) {
7859         MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
7860         return nullptr;
7861       } else if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) {
7862         MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass);
7863         return nullptr;
7864       }
7865     }
7866   }
7867 
7868   return nullptr;
7869 }
7870 
7871 unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
7872                                       const MachineInstr &MI,
7873                                       unsigned *PredCost) const {
7874   if (MI.isBundle()) {
7875     MachineBasicBlock::const_instr_iterator I(MI.getIterator());
7876     MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
7877     unsigned Lat = 0, Count = 0;
7878     for (++I; I != E && I->isBundledWithPred(); ++I) {
7879       ++Count;
7880       Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I));
7881     }
7882     return Lat + Count - 1;
7883   }
7884 
7885   return SchedModel.computeInstrLatency(&MI);
7886 }
7887 
// Encode the shader type value used by ds_ordered_count lowering (see the
// fatal error below) from the function's calling convention.
unsigned SIInstrInfo::getDSShaderTypeValue(const MachineFunction &MF) {
  switch (MF.getFunction().getCallingConv()) {
  case CallingConv::AMDGPU_PS:
    return 1;
  case CallingConv::AMDGPU_VS:
    return 2;
  case CallingConv::AMDGPU_GS:
    return 3;
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_ES:
    report_fatal_error("ds_ordered_count unsupported for this calling conv");
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::C:
  case CallingConv::Fast:
  default:
    // Assume other calling conventions are various compute callable functions
    return 0;
  }
}
7909