1 //===- SIInstrInfo.cpp - SI Instruction Information  ----------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// \brief SI Implementation of TargetInstrInfo.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "SIInstrInfo.h"
16 #include "AMDGPU.h"
17 #include "AMDGPUSubtarget.h"
18 #include "GCNHazardRecognizer.h"
19 #include "SIDefines.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "SIRegisterInfo.h"
22 #include "Utils/AMDGPUBaseInfo.h"
23 #include "llvm/ADT/APInt.h"
24 #include "llvm/ADT/ArrayRef.h"
25 #include "llvm/ADT/SmallVector.h"
26 #include "llvm/ADT/StringRef.h"
27 #include "llvm/ADT/iterator_range.h"
28 #include "llvm/Analysis/AliasAnalysis.h"
29 #include "llvm/Analysis/MemoryLocation.h"
30 #include "llvm/CodeGen/MachineBasicBlock.h"
31 #include "llvm/CodeGen/MachineFrameInfo.h"
32 #include "llvm/CodeGen/MachineFunction.h"
33 #include "llvm/CodeGen/MachineInstr.h"
34 #include "llvm/CodeGen/MachineInstrBuilder.h"
35 #include "llvm/CodeGen/MachineInstrBundle.h"
36 #include "llvm/CodeGen/MachineMemOperand.h"
37 #include "llvm/CodeGen/MachineOperand.h"
38 #include "llvm/CodeGen/MachineRegisterInfo.h"
39 #include "llvm/CodeGen/MachineValueType.h"
40 #include "llvm/CodeGen/RegisterScavenging.h"
41 #include "llvm/CodeGen/ScheduleDAG.h"
42 #include "llvm/CodeGen/SelectionDAGNodes.h"
43 #include "llvm/IR/DebugLoc.h"
44 #include "llvm/IR/DiagnosticInfo.h"
45 #include "llvm/IR/Function.h"
46 #include "llvm/IR/InlineAsm.h"
47 #include "llvm/IR/LLVMContext.h"
48 #include "llvm/MC/MCInstrDesc.h"
49 #include "llvm/Support/Casting.h"
50 #include "llvm/Support/CommandLine.h"
51 #include "llvm/Support/Compiler.h"
52 #include "llvm/Support/ErrorHandling.h"
53 #include "llvm/Support/MathExtras.h"
54 #include "llvm/Target/TargetMachine.h"
55 #include "llvm/Target/TargetOpcodes.h"
56 #include "llvm/Target/TargetRegisterInfo.h"
57 #include <cassert>
58 #include <cstdint>
59 #include <iterator>
60 #include <utility>
61 
62 using namespace llvm;
63 
64 // Must be at least 4 to be able to branch over minimum unconditional branch
65 // code. This is only for making it possible to write reasonably small tests for
66 // long branches.
67 static cl::opt<unsigned>
68 BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
69                  cl::desc("Restrict range of branch instructions (DEBUG)"));
70 
71 SIInstrInfo::SIInstrInfo(const SISubtarget &ST)
72   : AMDGPUInstrInfo(ST), RI(ST), ST(ST) {}
73 
74 //===----------------------------------------------------------------------===//
75 // TargetInstrInfo callbacks
76 //===----------------------------------------------------------------------===//
77 
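/// \brief Returns the number of operands of \p Node, not counting any trailing
/// glue operands.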
78 static unsigned getNumOperandsNoGlue(SDNode *Node) {
79   unsigned N = Node->getNumOperands();
80   while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
81     --N;
82   return N;
83 }
84 
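/// \brief Returns the chain operand of \p Load, which is expected to be the
/// last non-glue operand.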
85 static SDValue findChainOperand(SDNode *Load) {
86   SDValue LastOp = Load->getOperand(getNumOperandsNoGlue(Load) - 1);
87   assert(LastOp.getValueType() == MVT::Other && "Chain missing from load node");
88   return LastOp;
89 }
90 
/// \brief Returns true if both nodes have the same value for the given
///        operand \p OpName, or if neither node has this operand.
93 static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
94   unsigned Opc0 = N0->getMachineOpcode();
95   unsigned Opc1 = N1->getMachineOpcode();
96 
97   int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
98   int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
99 
100   if (Op0Idx == -1 && Op1Idx == -1)
101     return true;
102 
104   if ((Op0Idx == -1 && Op1Idx != -1) ||
105       (Op1Idx == -1 && Op0Idx != -1))
106     return false;
107 
108   // getNamedOperandIdx returns the index for the MachineInstr's operands,
109   // which includes the result as the first operand. We are indexing into the
110   // MachineSDNode's operands, so we need to skip the result operand to get
111   // the real index.
112   --Op0Idx;
113   --Op1Idx;
114 
115   return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
116 }
117 
118 bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
119                                                     AliasAnalysis *AA) const {
  // TODO: The generic check fails for VALU instructions that should be
  // rematerializable due to implicit reads of exec. We really want all of the
  // generic logic for this except for the implicit exec check.
123   switch (MI.getOpcode()) {
124   case AMDGPU::V_MOV_B32_e32:
125   case AMDGPU::V_MOV_B32_e64:
126   case AMDGPU::V_MOV_B64_PSEUDO:
127     return true;
128   default:
129     return false;
130   }
131 }
132 
133 bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
134                                           int64_t &Offset0,
135                                           int64_t &Offset1) const {
136   if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
137     return false;
138 
139   unsigned Opc0 = Load0->getMachineOpcode();
140   unsigned Opc1 = Load1->getMachineOpcode();
141 
142   // Make sure both are actually loads.
143   if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
144     return false;
145 
146   if (isDS(Opc0) && isDS(Opc1)) {
147 
148     // FIXME: Handle this case:
149     if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
150       return false;
151 
152     // Check base reg.
153     if (Load0->getOperand(1) != Load1->getOperand(1))
154       return false;
155 
156     // Check chain.
157     if (findChainOperand(Load0) != findChainOperand(Load1))
158       return false;
159 
160     // Skip read2 / write2 variants for simplicity.
    // TODO: We should report true if the used offsets are adjacent (excluding
    // the st64 versions).
163     if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::data1) != -1 ||
164         AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::data1) != -1)
165       return false;
166 
167     Offset0 = cast<ConstantSDNode>(Load0->getOperand(2))->getZExtValue();
168     Offset1 = cast<ConstantSDNode>(Load1->getOperand(2))->getZExtValue();
169     return true;
170   }
171 
172   if (isSMRD(Opc0) && isSMRD(Opc1)) {
173     // Skip time and cache invalidation instructions.
174     if (AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::sbase) == -1 ||
175         AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::sbase) == -1)
176       return false;
177 
178     assert(getNumOperandsNoGlue(Load0) == getNumOperandsNoGlue(Load1));
179 
180     // Check base reg.
181     if (Load0->getOperand(0) != Load1->getOperand(0))
182       return false;
183 
184     const ConstantSDNode *Load0Offset =
185         dyn_cast<ConstantSDNode>(Load0->getOperand(1));
186     const ConstantSDNode *Load1Offset =
187         dyn_cast<ConstantSDNode>(Load1->getOperand(1));
188 
189     if (!Load0Offset || !Load1Offset)
190       return false;
191 
192     // Check chain.
193     if (findChainOperand(Load0) != findChainOperand(Load1))
194       return false;
195 
196     Offset0 = Load0Offset->getZExtValue();
197     Offset1 = Load1Offset->getZExtValue();
198     return true;
199   }
200 
201   // MUBUF and MTBUF can access the same addresses.
202   if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
203 
204     // MUBUF and MTBUF have vaddr at different indices.
205     if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
206         findChainOperand(Load0) != findChainOperand(Load1) ||
207         !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
208         !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
209       return false;
210 
211     int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
212     int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
213 
214     if (OffIdx0 == -1 || OffIdx1 == -1)
215       return false;
216 
    // getNamedOperandIdx returns the index for MachineInstrs.  Since they
    // include the output in the operand list, but SDNodes don't, we need to
    // subtract one from the index.
220     --OffIdx0;
221     --OffIdx1;
222 
223     SDValue Off0 = Load0->getOperand(OffIdx0);
224     SDValue Off1 = Load1->getOperand(OffIdx1);
225 
226     // The offset might be a FrameIndexSDNode.
227     if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
228       return false;
229 
230     Offset0 = cast<ConstantSDNode>(Off0)->getZExtValue();
231     Offset1 = cast<ConstantSDNode>(Off1)->getZExtValue();
232     return true;
233   }
234 
235   return false;
236 }
237 
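// Returns true for the DS read2st64 / write2st64 opcodes, whose offsets are
// scaled by a stride of 64 elements.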
238 static bool isStride64(unsigned Opc) {
239   switch (Opc) {
240   case AMDGPU::DS_READ2ST64_B32:
241   case AMDGPU::DS_READ2ST64_B64:
242   case AMDGPU::DS_WRITE2ST64_B32:
243   case AMDGPU::DS_WRITE2ST64_B64:
244     return true;
245   default:
246     return false;
247   }
248 }
249 
250 bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
251                                         int64_t &Offset,
252                                         const TargetRegisterInfo *TRI) const {
253   unsigned Opc = LdSt.getOpcode();
254 
255   if (isDS(LdSt)) {
256     const MachineOperand *OffsetImm =
257         getNamedOperand(LdSt, AMDGPU::OpName::offset);
258     if (OffsetImm) {
259       // Normal, single offset LDS instruction.
260       const MachineOperand *AddrReg =
261           getNamedOperand(LdSt, AMDGPU::OpName::addr);
262 
263       BaseReg = AddrReg->getReg();
264       Offset = OffsetImm->getImm();
265       return true;
266     }
267 
    // The two-offset instructions use offset0 and offset1 instead. We can
    // treat these as a load with a single offset if the two offsets are
    // consecutive. We will use this for some partially aligned loads.
271     const MachineOperand *Offset0Imm =
272         getNamedOperand(LdSt, AMDGPU::OpName::offset0);
273     const MachineOperand *Offset1Imm =
274         getNamedOperand(LdSt, AMDGPU::OpName::offset1);
275 
276     uint8_t Offset0 = Offset0Imm->getImm();
277     uint8_t Offset1 = Offset1Imm->getImm();
278 
279     if (Offset1 > Offset0 && Offset1 - Offset0 == 1) {
      // Each of these offsets is in element-sized units, so we need to convert
      // to the byte size of the individual reads.
282 
283       unsigned EltSize;
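      // For a read2 the destination register covers both elements, so the
      // element size in bytes is the destination register size in bits
      // divided by 16.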
284       if (LdSt.mayLoad())
285         EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
286       else {
287         assert(LdSt.mayStore());
288         int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
289         EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
290       }
291 
292       if (isStride64(Opc))
293         EltSize *= 64;
294 
295       const MachineOperand *AddrReg =
296           getNamedOperand(LdSt, AMDGPU::OpName::addr);
297       BaseReg = AddrReg->getReg();
298       Offset = EltSize * Offset0;
299       return true;
300     }
301 
302     return false;
303   }
304 
305   if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
306     const MachineOperand *SOffset = getNamedOperand(LdSt, AMDGPU::OpName::soffset);
307     if (SOffset && SOffset->isReg())
308       return false;
309 
310     const MachineOperand *AddrReg =
311         getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
312     if (!AddrReg)
313       return false;
314 
315     const MachineOperand *OffsetImm =
316         getNamedOperand(LdSt, AMDGPU::OpName::offset);
317     BaseReg = AddrReg->getReg();
318     Offset = OffsetImm->getImm();
319 
320     if (SOffset) // soffset can be an inline immediate.
321       Offset += SOffset->getImm();
322 
323     return true;
324   }
325 
326   if (isSMRD(LdSt)) {
327     const MachineOperand *OffsetImm =
328         getNamedOperand(LdSt, AMDGPU::OpName::offset);
329     if (!OffsetImm)
330       return false;
331 
332     const MachineOperand *SBaseReg =
333         getNamedOperand(LdSt, AMDGPU::OpName::sbase);
334     BaseReg = SBaseReg->getReg();
335     Offset = OffsetImm->getImm();
336     return true;
337   }
338 
339   if (isFLAT(LdSt)) {
340     const MachineOperand *VAddr = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
341     if (VAddr) {
342       // Can't analyze 2 offsets.
343       if (getNamedOperand(LdSt, AMDGPU::OpName::saddr))
344         return false;
345 
346       BaseReg = VAddr->getReg();
347     } else {
      // Scratch instructions have either vaddr or saddr.
349       BaseReg = getNamedOperand(LdSt, AMDGPU::OpName::saddr)->getReg();
350     }
351 
352     Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
353     return true;
354   }
355 
356   return false;
357 }
358 
359 bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt,
360                                       MachineInstr &SecondLdSt,
361                                       unsigned NumLoads) const {
362   const MachineOperand *FirstDst = nullptr;
363   const MachineOperand *SecondDst = nullptr;
364 
365   if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) ||
366       (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt)) ||
367       (isFLAT(FirstLdSt) && isFLAT(SecondLdSt))) {
368     FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata);
369     if (!FirstDst)
370       FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
371     SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata);
372     if (!SecondDst)
373       SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
374   } else if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) {
375     FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst);
376     SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst);
377   } else if (isDS(FirstLdSt) && isDS(SecondLdSt)) {
378     FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
379     SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
380   }
381 
382   if (!FirstDst || !SecondDst)
383     return false;
384 
385   // Try to limit clustering based on the total number of bytes loaded
386   // rather than the number of instructions.  This is done to help reduce
387   // register pressure.  The method used is somewhat inexact, though,
388   // because it assumes that all loads in the cluster will load the
389   // same number of bytes as FirstLdSt.
390 
391   // The unit of this value is bytes.
392   // FIXME: This needs finer tuning.
393   unsigned LoadClusterThreshold = 16;
394 
395   const MachineRegisterInfo &MRI =
396       FirstLdSt.getParent()->getParent()->getRegInfo();
397   const TargetRegisterClass *DstRC = MRI.getRegClass(FirstDst->getReg());
398 
399   return (NumLoads * (RI.getRegSizeInBits(*DstRC) / 8)) <= LoadClusterThreshold;
400 }
401 
402 static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
403                               MachineBasicBlock::iterator MI,
404                               const DebugLoc &DL, unsigned DestReg,
405                               unsigned SrcReg, bool KillSrc) {
406   MachineFunction *MF = MBB.getParent();
407   DiagnosticInfoUnsupported IllegalCopy(*MF->getFunction(),
408                                         "illegal SGPR to VGPR copy",
409                                         DL, DS_Error);
410   LLVMContext &C = MF->getFunction()->getContext();
411   C.diagnose(IllegalCopy);
412 
413   BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
414     .addReg(SrcReg, getKillRegState(KillSrc));
415 }
416 
417 void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
418                               MachineBasicBlock::iterator MI,
419                               const DebugLoc &DL, unsigned DestReg,
420                               unsigned SrcReg, bool KillSrc) const {
421   const TargetRegisterClass *RC = RI.getPhysRegClass(DestReg);
422 
423   if (RC == &AMDGPU::VGPR_32RegClass) {
424     assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
425            AMDGPU::SReg_32RegClass.contains(SrcReg));
426     BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
427       .addReg(SrcReg, getKillRegState(KillSrc));
428     return;
429   }
430 
431   if (RC == &AMDGPU::SReg_32_XM0RegClass ||
432       RC == &AMDGPU::SReg_32RegClass) {
433     if (SrcReg == AMDGPU::SCC) {
434       BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
435           .addImm(-1)
436           .addImm(0);
437       return;
438     }
439 
440     if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
441       reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
442       return;
443     }
444 
445     BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
446             .addReg(SrcReg, getKillRegState(KillSrc));
447     return;
448   }
449 
450   if (RC == &AMDGPU::SReg_64RegClass) {
451     if (DestReg == AMDGPU::VCC) {
452       if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
453         BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
454           .addReg(SrcReg, getKillRegState(KillSrc));
455       } else {
        // FIXME: Hack until VReg_1 is removed.
457         assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
458         BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
459           .addImm(0)
460           .addReg(SrcReg, getKillRegState(KillSrc));
461       }
462 
463       return;
464     }
465 
466     if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
467       reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
468       return;
469     }
470 
471     BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
472             .addReg(SrcReg, getKillRegState(KillSrc));
473     return;
474   }
475 
476   if (DestReg == AMDGPU::SCC) {
477     assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
478     BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
479       .addReg(SrcReg, getKillRegState(KillSrc))
480       .addImm(0);
481     return;
482   }
483 
484   unsigned EltSize = 4;
485   unsigned Opcode = AMDGPU::V_MOV_B32_e32;
486   if (RI.isSGPRClass(RC)) {
487     if (RI.getRegSizeInBits(*RC) > 32) {
      Opcode = AMDGPU::S_MOV_B64;
489       EltSize = 8;
490     } else {
491       Opcode = AMDGPU::S_MOV_B32;
492       EltSize = 4;
493     }
494 
495     if (!RI.isSGPRClass(RI.getPhysRegClass(SrcReg))) {
496       reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
497       return;
498     }
499   }
500 
501   ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
502   bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
503 
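  // If the source and destination overlap, copy the sub-registers in an order
  // that does not clobber source components before they are read.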
504   for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
505     unsigned SubIdx;
506     if (Forward)
507       SubIdx = SubIndices[Idx];
508     else
509       SubIdx = SubIndices[SubIndices.size() - Idx - 1];
510 
511     MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
512       get(Opcode), RI.getSubReg(DestReg, SubIdx));
513 
514     Builder.addReg(RI.getSubReg(SrcReg, SubIdx));
515 
516     if (Idx == 0)
517       Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
518 
519     bool UseKill = KillSrc && Idx == SubIndices.size() - 1;
520     Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
521   }
522 }
523 
524 int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
525   int NewOpc;
526 
527   // Try to map original to commuted opcode
528   NewOpc = AMDGPU::getCommuteRev(Opcode);
529   if (NewOpc != -1)
530     // Check if the commuted (REV) opcode exists on the target.
531     return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
532 
533   // Try to map commuted to original opcode
534   NewOpc = AMDGPU::getCommuteOrig(Opcode);
535   if (NewOpc != -1)
536     // Check if the original (non-REV) opcode exists on the target.
537     return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
538 
539   return Opcode;
540 }
541 
542 void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB,
543                                        MachineBasicBlock::iterator MI,
544                                        const DebugLoc &DL, unsigned DestReg,
545                                        int64_t Value) const {
546   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
547   const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg);
548   if (RegClass == &AMDGPU::SReg_32RegClass ||
549       RegClass == &AMDGPU::SGPR_32RegClass ||
550       RegClass == &AMDGPU::SReg_32_XM0RegClass ||
551       RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) {
552     BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
553       .addImm(Value);
554     return;
555   }
556 
557   if (RegClass == &AMDGPU::SReg_64RegClass ||
558       RegClass == &AMDGPU::SGPR_64RegClass ||
559       RegClass == &AMDGPU::SReg_64_XEXECRegClass) {
560     BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
561       .addImm(Value);
562     return;
563   }
564 
565   if (RegClass == &AMDGPU::VGPR_32RegClass) {
566     BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
567       .addImm(Value);
568     return;
569   }
570   if (RegClass == &AMDGPU::VReg_64RegClass) {
571     BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg)
572       .addImm(Value);
573     return;
574   }
575 
576   unsigned EltSize = 4;
577   unsigned Opcode = AMDGPU::V_MOV_B32_e32;
578   if (RI.isSGPRClass(RegClass)) {
579     if (RI.getRegSizeInBits(*RegClass) > 32) {
      Opcode = AMDGPU::S_MOV_B64;
581       EltSize = 8;
582     } else {
583       Opcode = AMDGPU::S_MOV_B32;
584       EltSize = 4;
585     }
586   }
587 
588   ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize);
589   for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
590     int64_t IdxValue = Idx == 0 ? Value : 0;
591 
    MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
      get(Opcode), RI.getSubReg(DestReg, SubIndices[Idx]));
594     Builder.addImm(IdxValue);
595   }
596 }
597 
598 const TargetRegisterClass *
599 SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
600   return &AMDGPU::VGPR_32RegClass;
601 }
602 
603 void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
604                                      MachineBasicBlock::iterator I,
605                                      const DebugLoc &DL, unsigned DstReg,
606                                      ArrayRef<MachineOperand> Cond,
607                                      unsigned TrueReg,
608                                      unsigned FalseReg) const {
609   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
610   assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
611          "Not a VGPR32 reg");
612 
613   if (Cond.size() == 1) {
614     BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
615       .addReg(FalseReg)
616       .addReg(TrueReg)
617       .add(Cond[0]);
618   } else if (Cond.size() == 2) {
619     assert(Cond[0].isImm() && "Cond[0] is not an immediate");
620     switch (Cond[0].getImm()) {
621     case SIInstrInfo::SCC_TRUE: {
622       unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
623       BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
624         .addImm(-1)
625         .addImm(0);
626       BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
627         .addReg(FalseReg)
628         .addReg(TrueReg)
629         .addReg(SReg);
630       break;
631     }
632     case SIInstrInfo::SCC_FALSE: {
633       unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
634       BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
635         .addImm(0)
636         .addImm(-1);
637       BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
638         .addReg(FalseReg)
639         .addReg(TrueReg)
640         .addReg(SReg);
641       break;
642     }
643     case SIInstrInfo::VCCNZ: {
644       MachineOperand RegOp = Cond[1];
645       RegOp.setImplicit(false);
646       BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
647           .addReg(FalseReg)
648           .addReg(TrueReg)
649           .add(RegOp);
650       break;
651     }
652     case SIInstrInfo::VCCZ: {
653       MachineOperand RegOp = Cond[1];
654       RegOp.setImplicit(false);
655       BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
656           .addReg(TrueReg)
657           .addReg(FalseReg)
658           .add(RegOp);
659       break;
660     }
661     case SIInstrInfo::EXECNZ: {
662       unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
663       unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
664       BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
665         .addImm(0);
666       BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
667         .addImm(-1)
668         .addImm(0);
669       BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
670         .addReg(FalseReg)
671         .addReg(TrueReg)
672         .addReg(SReg);
673       break;
674     }
675     case SIInstrInfo::EXECZ: {
676       unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
677       unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
678       BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
679         .addImm(0);
680       BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
681         .addImm(0)
682         .addImm(-1);
683       BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
684         .addReg(FalseReg)
685         .addReg(TrueReg)
686         .addReg(SReg);
687       llvm_unreachable("Unhandled branch predicate EXECZ");
688       break;
689     }
690     default:
691       llvm_unreachable("invalid branch predicate");
692     }
693   } else {
694     llvm_unreachable("Can only handle Cond size 1 or 2");
695   }
696 }
697 
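// Emit a V_CMP_EQ comparing \p SrcReg with the immediate \p Value and return
// the SReg_64 virtual register holding the result mask.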
698 unsigned SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
699                                MachineBasicBlock::iterator I,
700                                const DebugLoc &DL,
701                                unsigned SrcReg, int Value) const {
702   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
703   unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
704   BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
705     .addImm(Value)
706     .addReg(SrcReg);
707 
708   return Reg;
709 }
710 
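// Emit a V_CMP_NE comparing \p SrcReg with the immediate \p Value and return
// the SReg_64 virtual register holding the result mask.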
711 unsigned SIInstrInfo::insertNE(MachineBasicBlock *MBB,
712                                MachineBasicBlock::iterator I,
713                                const DebugLoc &DL,
714                                unsigned SrcReg, int Value) const {
715   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
716   unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
717   BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
718     .addImm(Value)
719     .addReg(SrcReg);
720 
721   return Reg;
722 }
723 
724 unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
725 
726   if (RI.getRegSizeInBits(*DstRC) == 32) {
727     return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
728   } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) {
729     return AMDGPU::S_MOV_B64;
730   } else if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC)) {
    return AMDGPU::V_MOV_B64_PSEUDO;
732   }
733   return AMDGPU::COPY;
734 }
735 
736 static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
737   switch (Size) {
738   case 4:
739     return AMDGPU::SI_SPILL_S32_SAVE;
740   case 8:
741     return AMDGPU::SI_SPILL_S64_SAVE;
742   case 16:
743     return AMDGPU::SI_SPILL_S128_SAVE;
744   case 32:
745     return AMDGPU::SI_SPILL_S256_SAVE;
746   case 64:
747     return AMDGPU::SI_SPILL_S512_SAVE;
748   default:
749     llvm_unreachable("unknown register size");
750   }
751 }
752 
753 static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
754   switch (Size) {
755   case 4:
756     return AMDGPU::SI_SPILL_V32_SAVE;
757   case 8:
758     return AMDGPU::SI_SPILL_V64_SAVE;
759   case 12:
760     return AMDGPU::SI_SPILL_V96_SAVE;
761   case 16:
762     return AMDGPU::SI_SPILL_V128_SAVE;
763   case 32:
764     return AMDGPU::SI_SPILL_V256_SAVE;
765   case 64:
766     return AMDGPU::SI_SPILL_V512_SAVE;
767   default:
768     llvm_unreachable("unknown register size");
769   }
770 }
771 
772 void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
773                                       MachineBasicBlock::iterator MI,
774                                       unsigned SrcReg, bool isKill,
775                                       int FrameIndex,
776                                       const TargetRegisterClass *RC,
777                                       const TargetRegisterInfo *TRI) const {
778   MachineFunction *MF = MBB.getParent();
779   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
780   MachineFrameInfo &FrameInfo = MF->getFrameInfo();
781   DebugLoc DL = MBB.findDebugLoc(MI);
782 
783   unsigned Size = FrameInfo.getObjectSize(FrameIndex);
784   unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
785   MachinePointerInfo PtrInfo
786     = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
787   MachineMemOperand *MMO
788     = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
789                                Size, Align);
790   unsigned SpillSize = TRI->getSpillSize(*RC);
791 
792   if (RI.isSGPRClass(RC)) {
793     MFI->setHasSpilledSGPRs();
794 
795     // We are only allowed to create one new instruction when spilling
796     // registers, so we need to use pseudo instruction for spilling SGPRs.
797     const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
798 
    // The SGPR spill/restore instructions only work on numbered SGPRs, so we
    // need to make sure we are using the correct register class.
801     if (TargetRegisterInfo::isVirtualRegister(SrcReg) && SpillSize == 4) {
802       MachineRegisterInfo &MRI = MF->getRegInfo();
803       MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass);
804     }
805 
    // Add the scratch resource registers as implicit uses because we may end
    // up needing them, and need to ensure that the reserved registers are
    // correctly handled.
    MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc)
      .addReg(SrcReg, getKillRegState(isKill)) // data
      .addFrameIndex(FrameIndex)               // addr
      .addMemOperand(MMO)
      .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
      .addReg(MFI->getFrameOffsetReg(), RegState::Implicit);
815 
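    // Mark the frame index as an SGPR spill slot so frame lowering handles it
    // separately from normal stack objects.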
816     FrameInfo.setStackID(FrameIndex, 1);
817     if (ST.hasScalarStores()) {
818       // m0 is used for offset to scalar stores if used to spill.
819       Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead);
820     }
821 
822     return;
823   }
824 
825   if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) {
826     LLVMContext &Ctx = MF->getFunction()->getContext();
827     Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to"
828                   " spill register");
829     BuildMI(MBB, MI, DL, get(AMDGPU::KILL))
830       .addReg(SrcReg);
831 
832     return;
833   }
834 
835   assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
836 
837   unsigned Opcode = getVGPRSpillSaveOpcode(SpillSize);
838   MFI->setHasSpilledVGPRs();
839   BuildMI(MBB, MI, DL, get(Opcode))
840     .addReg(SrcReg, getKillRegState(isKill)) // data
841     .addFrameIndex(FrameIndex)               // addr
842     .addReg(MFI->getScratchRSrcReg())        // scratch_rsrc
843     .addReg(MFI->getFrameOffsetReg())        // scratch_offset
844     .addImm(0)                               // offset
845     .addMemOperand(MMO);
846 }
847 
848 static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
849   switch (Size) {
850   case 4:
851     return AMDGPU::SI_SPILL_S32_RESTORE;
852   case 8:
853     return AMDGPU::SI_SPILL_S64_RESTORE;
854   case 16:
855     return AMDGPU::SI_SPILL_S128_RESTORE;
856   case 32:
857     return AMDGPU::SI_SPILL_S256_RESTORE;
858   case 64:
859     return AMDGPU::SI_SPILL_S512_RESTORE;
860   default:
861     llvm_unreachable("unknown register size");
862   }
863 }
864 
865 static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
866   switch (Size) {
867   case 4:
868     return AMDGPU::SI_SPILL_V32_RESTORE;
869   case 8:
870     return AMDGPU::SI_SPILL_V64_RESTORE;
871   case 12:
872     return AMDGPU::SI_SPILL_V96_RESTORE;
873   case 16:
874     return AMDGPU::SI_SPILL_V128_RESTORE;
875   case 32:
876     return AMDGPU::SI_SPILL_V256_RESTORE;
877   case 64:
878     return AMDGPU::SI_SPILL_V512_RESTORE;
879   default:
880     llvm_unreachable("unknown register size");
881   }
882 }
883 
884 void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
885                                        MachineBasicBlock::iterator MI,
886                                        unsigned DestReg, int FrameIndex,
887                                        const TargetRegisterClass *RC,
888                                        const TargetRegisterInfo *TRI) const {
889   MachineFunction *MF = MBB.getParent();
890   const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
891   MachineFrameInfo &FrameInfo = MF->getFrameInfo();
892   DebugLoc DL = MBB.findDebugLoc(MI);
893   unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
894   unsigned Size = FrameInfo.getObjectSize(FrameIndex);
895   unsigned SpillSize = TRI->getSpillSize(*RC);
896 
897   MachinePointerInfo PtrInfo
898     = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
899 
900   MachineMemOperand *MMO = MF->getMachineMemOperand(
901     PtrInfo, MachineMemOperand::MOLoad, Size, Align);
902 
903   if (RI.isSGPRClass(RC)) {
904     // FIXME: Maybe this should not include a memoperand because it will be
905     // lowered to non-memory instructions.
906     const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
907     if (TargetRegisterInfo::isVirtualRegister(DestReg) && SpillSize == 4) {
908       MachineRegisterInfo &MRI = MF->getRegInfo();
909       MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass);
910     }
911 
912     FrameInfo.setStackID(FrameIndex, 1);
913     MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc, DestReg)
914       .addFrameIndex(FrameIndex) // addr
915       .addMemOperand(MMO)
916       .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
917       .addReg(MFI->getFrameOffsetReg(), RegState::Implicit);
918 
919     if (ST.hasScalarStores()) {
920       // m0 is used for offset to scalar stores if used to spill.
921       Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead);
922     }
923 
924     return;
925   }
926 
927   if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) {
928     LLVMContext &Ctx = MF->getFunction()->getContext();
929     Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to"
930                   " restore register");
931     BuildMI(MBB, MI, DL, get(AMDGPU::IMPLICIT_DEF), DestReg);
932 
933     return;
934   }
935 
936   assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
937 
938   unsigned Opcode = getVGPRSpillRestoreOpcode(SpillSize);
939   BuildMI(MBB, MI, DL, get(Opcode), DestReg)
940     .addFrameIndex(FrameIndex)        // vaddr
941     .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
942     .addReg(MFI->getFrameOffsetReg()) // scratch_offset
943     .addImm(0)                        // offset
944     .addMemOperand(MMO);
945 }
946 
/// \param FrameOffset Offset in bytes of the FrameIndex being spilled
948 unsigned SIInstrInfo::calculateLDSSpillAddress(
949     MachineBasicBlock &MBB, MachineInstr &MI, RegScavenger *RS, unsigned TmpReg,
950     unsigned FrameOffset, unsigned Size) const {
951   MachineFunction *MF = MBB.getParent();
952   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
953   const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
954   DebugLoc DL = MBB.findDebugLoc(MI);
955   unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize();
956   unsigned WavefrontSize = ST.getWavefrontSize();
957 
958   unsigned TIDReg = MFI->getTIDReg();
959   if (!MFI->hasCalculatedTID()) {
960     MachineBasicBlock &Entry = MBB.getParent()->front();
961     MachineBasicBlock::iterator Insert = Entry.front();
962     DebugLoc DL = Insert->getDebugLoc();
963 
964     TIDReg = RI.findUnusedRegister(MF->getRegInfo(), &AMDGPU::VGPR_32RegClass,
965                                    *MF);
966     if (TIDReg == AMDGPU::NoRegister)
967       return TIDReg;
968 
969     if (!AMDGPU::isShader(MF->getFunction()->getCallingConv()) &&
970         WorkGroupSize > WavefrontSize) {
971       unsigned TIDIGXReg
972         = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
973       unsigned TIDIGYReg
974         = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
975       unsigned TIDIGZReg
976         = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
977       unsigned InputPtrReg =
978           MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
979       for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) {
980         if (!Entry.isLiveIn(Reg))
981           Entry.addLiveIn(Reg);
982       }
983 
984       RS->enterBasicBlock(Entry);
985       // FIXME: Can we scavenge an SReg_64 and access the subregs?
986       unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
987       unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
988       BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0)
989               .addReg(InputPtrReg)
990               .addImm(SI::KernelInputOffsets::NGROUPS_Z);
991       BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1)
992               .addReg(InputPtrReg)
993               .addImm(SI::KernelInputOffsets::NGROUPS_Y);
994 
995       // NGROUPS.X * NGROUPS.Y
996       BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1)
997               .addReg(STmp1)
998               .addReg(STmp0);
999       // (NGROUPS.X * NGROUPS.Y) * TIDIG.X
1000       BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg)
1001               .addReg(STmp1)
1002               .addReg(TIDIGXReg);
      // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)
1004       BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg)
1005               .addReg(STmp0)
1006               .addReg(TIDIGYReg)
1007               .addReg(TIDReg);
      // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)) + TIDIG.Z
1009       BuildMI(Entry, Insert, DL, get(AMDGPU::V_ADD_I32_e32), TIDReg)
1010               .addReg(TIDReg)
1011               .addReg(TIDIGZReg);
1012     } else {
1013       // Get the wave id
1014       BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
1015               TIDReg)
1016               .addImm(-1)
1017               .addImm(0);
1018 
1019       BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
1020               TIDReg)
1021               .addImm(-1)
1022               .addReg(TIDReg);
1023     }
1024 
1025     BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32),
1026             TIDReg)
1027             .addImm(2)
1028             .addReg(TIDReg);
1029     MFI->setTIDReg(TIDReg);
1030   }
1031 
1032   // Add FrameIndex to LDS offset
1033   unsigned LDSOffset = MFI->getLDSSize() + (FrameOffset * WorkGroupSize);
1034   BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), TmpReg)
1035           .addImm(LDSOffset)
1036           .addReg(TIDReg);
1037 
1038   return TmpReg;
1039 }
1040 
1041 void SIInstrInfo::insertWaitStates(MachineBasicBlock &MBB,
1042                                    MachineBasicBlock::iterator MI,
1043                                    int Count) const {
1044   DebugLoc DL = MBB.findDebugLoc(MI);
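  // S_NOP's immediate operand encodes (wait states - 1), so a single S_NOP
  // covers at most 8 wait states; emit as many as needed.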
1045   while (Count > 0) {
1046     int Arg;
1047     if (Count >= 8)
1048       Arg = 7;
1049     else
1050       Arg = Count - 1;
1051     Count -= 8;
1052     BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP))
1053             .addImm(Arg);
1054   }
1055 }
1056 
1057 void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
1058                              MachineBasicBlock::iterator MI) const {
1059   insertWaitStates(MBB, MI, 1);
1060 }
1061 
1062 void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
1063   auto MF = MBB.getParent();
1064   SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
1065 
1066   assert(Info->isEntryFunction());
1067 
1068   if (MBB.succ_empty()) {
1069     bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
1070     if (HasNoTerminator)
1071       BuildMI(MBB, MBB.end(), DebugLoc(),
1072               get(Info->returnsVoid() ? AMDGPU::S_ENDPGM : AMDGPU::SI_RETURN_TO_EPILOG));
1073   }
1074 }
1075 
1076 unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) const {
1077   switch (MI.getOpcode()) {
1078   default: return 1; // FIXME: Do wait states equal cycles?
1079 
1080   case AMDGPU::S_NOP:
1081     return MI.getOperand(0).getImm() + 1;
1082   }
1083 }
1084 
1085 bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
1086   MachineBasicBlock &MBB = *MI.getParent();
1087   DebugLoc DL = MBB.findDebugLoc(MI);
1088   switch (MI.getOpcode()) {
1089   default: return AMDGPUInstrInfo::expandPostRAPseudo(MI);
1090   case AMDGPU::S_MOV_B64_term:
1091     // This is only a terminator to get the correct spill code placement during
1092     // register allocation.
1093     MI.setDesc(get(AMDGPU::S_MOV_B64));
1094     break;
1095 
1096   case AMDGPU::S_XOR_B64_term:
1097     // This is only a terminator to get the correct spill code placement during
1098     // register allocation.
1099     MI.setDesc(get(AMDGPU::S_XOR_B64));
1100     break;
1101 
1102   case AMDGPU::S_ANDN2_B64_term:
1103     // This is only a terminator to get the correct spill code placement during
1104     // register allocation.
1105     MI.setDesc(get(AMDGPU::S_ANDN2_B64));
1106     break;
1107 
1108   case AMDGPU::V_MOV_B64_PSEUDO: {
1109     unsigned Dst = MI.getOperand(0).getReg();
1110     unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
1111     unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
1112 
1113     const MachineOperand &SrcOp = MI.getOperand(1);
1114     // FIXME: Will this work for 64-bit floating point immediates?
1115     assert(!SrcOp.isFPImm());
1116     if (SrcOp.isImm()) {
1117       APInt Imm(64, SrcOp.getImm());
1118       BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
1119         .addImm(Imm.getLoBits(32).getZExtValue())
1120         .addReg(Dst, RegState::Implicit | RegState::Define);
1121       BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
1122         .addImm(Imm.getHiBits(32).getZExtValue())
1123         .addReg(Dst, RegState::Implicit | RegState::Define);
1124     } else {
1125       assert(SrcOp.isReg());
1126       BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
1127         .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
1128         .addReg(Dst, RegState::Implicit | RegState::Define);
1129       BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
1130         .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
1131         .addReg(Dst, RegState::Implicit | RegState::Define);
1132     }
1133     MI.eraseFromParent();
1134     break;
1135   }
1136   case AMDGPU::V_SET_INACTIVE_B32: {
1137     BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
1138       .addReg(AMDGPU::EXEC);
1139     BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
1140       .add(MI.getOperand(2));
1141     BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
1142       .addReg(AMDGPU::EXEC);
1143     MI.eraseFromParent();
1144     break;
1145   }
1146   case AMDGPU::V_SET_INACTIVE_B64: {
1147     BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
1148       .addReg(AMDGPU::EXEC);
1149     MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
1150                                  MI.getOperand(0).getReg())
1151       .add(MI.getOperand(2));
1152     expandPostRAPseudo(*Copy);
1153     BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
1154       .addReg(AMDGPU::EXEC);
1155     MI.eraseFromParent();
1156     break;
1157   }
1158   case AMDGPU::V_MOVRELD_B32_V1:
1159   case AMDGPU::V_MOVRELD_B32_V2:
1160   case AMDGPU::V_MOVRELD_B32_V4:
1161   case AMDGPU::V_MOVRELD_B32_V8:
1162   case AMDGPU::V_MOVRELD_B32_V16: {
1163     const MCInstrDesc &MovRelDesc = get(AMDGPU::V_MOVRELD_B32_e32);
1164     unsigned VecReg = MI.getOperand(0).getReg();
1165     bool IsUndef = MI.getOperand(1).isUndef();
1166     unsigned SubReg = AMDGPU::sub0 + MI.getOperand(3).getImm();
1167     assert(VecReg == MI.getOperand(1).getReg());
1168 
1169     MachineInstr *MovRel =
1170         BuildMI(MBB, MI, DL, MovRelDesc)
1171             .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
1172             .add(MI.getOperand(2))
1173             .addReg(VecReg, RegState::ImplicitDefine)
1174             .addReg(VecReg,
1175                     RegState::Implicit | (IsUndef ? RegState::Undef : 0));
1176 
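    // Tie the implicit def of the vector register to its implicit use:
    // V_MOVRELD only modifies one lane, so the rest of the register is
    // effectively a read-modify-write input.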
1177     const int ImpDefIdx =
1178         MovRelDesc.getNumOperands() + MovRelDesc.getNumImplicitUses();
1179     const int ImpUseIdx = ImpDefIdx + 1;
1180     MovRel->tieOperands(ImpDefIdx, ImpUseIdx);
1181 
1182     MI.eraseFromParent();
1183     break;
1184   }
1185   case AMDGPU::SI_PC_ADD_REL_OFFSET: {
1186     MachineFunction &MF = *MBB.getParent();
1187     unsigned Reg = MI.getOperand(0).getReg();
1188     unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
1189     unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
1190 
1191     // Create a bundle so these instructions won't be re-ordered by the
1192     // post-RA scheduler.
1193     MIBundleBuilder Bundler(MBB, MI);
1194     Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
1195 
1196     // Add 32-bit offset from this instruction to the start of the
1197     // constant data.
1198     Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo)
1199                        .addReg(RegLo)
1200                        .add(MI.getOperand(1)));
1201 
1202     MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
1203                                   .addReg(RegHi);
1204     if (MI.getOperand(2).getTargetFlags() == SIInstrInfo::MO_NONE)
1205       MIB.addImm(0);
1206     else
1207       MIB.add(MI.getOperand(2));
1208 
1209     Bundler.append(MIB);
1210     finalizeBundle(MBB, Bundler.begin());
1211 
1212     MI.eraseFromParent();
1213     break;
1214   }
1215   case AMDGPU::EXIT_WWM: {
1216     // This only gets its own opcode so that SIFixWWMLiveness can tell when WWM
1217     // is exited.
1218     MI.setDesc(get(AMDGPU::S_MOV_B64));
1219     break;
1220   }
1221   }
1222   return true;
1223 }
1224 
1225 bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,
1226                                       MachineOperand &Src0,
1227                                       unsigned Src0OpName,
1228                                       MachineOperand &Src1,
1229                                       unsigned Src1OpName) const {
1230   MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
1231   if (!Src0Mods)
1232     return false;
1233 
1234   MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
1235   assert(Src1Mods &&
1236          "All commutable instructions have both src0 and src1 modifiers");
1237 
1238   int Src0ModsVal = Src0Mods->getImm();
1239   int Src1ModsVal = Src1Mods->getImm();
1240 
1241   Src1Mods->setImm(Src0ModsVal);
1242   Src0Mods->setImm(Src1ModsVal);
1243   return true;
1244 }
1245 
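// Swap the register operand \p RegOp with the immediate or frame index in
// \p NonRegOp, preserving the register flags. Returns nullptr if \p NonRegOp
// is an operand kind that cannot be swapped.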
1246 static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
1247                                              MachineOperand &RegOp,
1248                                              MachineOperand &NonRegOp) {
1249   unsigned Reg = RegOp.getReg();
1250   unsigned SubReg = RegOp.getSubReg();
1251   bool IsKill = RegOp.isKill();
1252   bool IsDead = RegOp.isDead();
1253   bool IsUndef = RegOp.isUndef();
1254   bool IsDebug = RegOp.isDebug();
1255 
1256   if (NonRegOp.isImm())
1257     RegOp.ChangeToImmediate(NonRegOp.getImm());
1258   else if (NonRegOp.isFI())
1259     RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
1260   else
1261     return nullptr;
1262 
1263   NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
1264   NonRegOp.setSubReg(SubReg);
1265 
1266   return &MI;
1267 }
1268 
1269 MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
1270                                                   unsigned Src0Idx,
1271                                                   unsigned Src1Idx) const {
1272   assert(!NewMI && "this should never be used");
1273 
1274   unsigned Opc = MI.getOpcode();
1275   int CommutedOpcode = commuteOpcode(Opc);
1276   if (CommutedOpcode == -1)
1277     return nullptr;
1278 
1279   assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
1280            static_cast<int>(Src0Idx) &&
1281          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
1282            static_cast<int>(Src1Idx) &&
1283          "inconsistency with findCommutedOpIndices");
1284 
1285   MachineOperand &Src0 = MI.getOperand(Src0Idx);
1286   MachineOperand &Src1 = MI.getOperand(Src1Idx);
1287 
1288   MachineInstr *CommutedMI = nullptr;
1289   if (Src0.isReg() && Src1.isReg()) {
1290     if (isOperandLegal(MI, Src1Idx, &Src0)) {
1291       // Be sure to copy the source modifiers to the right place.
1292       CommutedMI
1293         = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
1294     }
1295 
1296   } else if (Src0.isReg() && !Src1.isReg()) {
1297     // src0 should always be able to support any operand type, so no need to
1298     // check operand legality.
1299     CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
1300   } else if (!Src0.isReg() && Src1.isReg()) {
1301     if (isOperandLegal(MI, Src1Idx, &Src0))
1302       CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
1303   } else {
    // FIXME: Found two non-register operands to commute. This does happen.
1305     return nullptr;
1306   }
1307 
1308   if (CommutedMI) {
1309     swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
1310                         Src1, AMDGPU::OpName::src1_modifiers);
1311 
1312     CommutedMI->setDesc(get(CommutedOpcode));
1313   }
1314 
1315   return CommutedMI;
1316 }
1317 
1318 // This needs to be implemented because the source modifiers may be inserted
1319 // between the true commutable operands, and the base
1320 // TargetInstrInfo::commuteInstruction uses it.
1321 bool SIInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx0,
1322                                         unsigned &SrcOpIdx1) const {
1323   if (!MI.isCommutable())
1324     return false;
1325 
1326   unsigned Opc = MI.getOpcode();
1327   int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
1328   if (Src0Idx == -1)
1329     return false;
1330 
1331   int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
1332   if (Src1Idx == -1)
1333     return false;
1334 
1335   return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
1336 }
1337 
1338 bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
1339                                         int64_t BrOffset) const {
1340   // BranchRelaxation should never have to check s_setpc_b64 because its dest
1341   // block is unanalyzable.
1342   assert(BranchOp != AMDGPU::S_SETPC_B64);
1343 
1344   // Convert to dwords.
1345   BrOffset /= 4;
1346 
1347   // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
1348   // from the next instruction.
1349   BrOffset -= 1;
1350 
1351   return isIntN(BranchOffsetBits, BrOffset);
1352 }
1353 
1354 MachineBasicBlock *SIInstrInfo::getBranchDestBlock(
1355   const MachineInstr &MI) const {
1356   if (MI.getOpcode() == AMDGPU::S_SETPC_B64) {
1357     // This would be a difficult analysis to perform, but can always be legal so
1358     // there's no need to analyze it.
1359     return nullptr;
1360   }
1361 
1362   return MI.getOperand(0).getMBB();
1363 }
1364 
1365 unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
1366                                            MachineBasicBlock &DestBB,
1367                                            const DebugLoc &DL,
1368                                            int64_t BrOffset,
1369                                            RegScavenger *RS) const {
1370   assert(RS && "RegScavenger required for long branching");
1371   assert(MBB.empty() &&
1372          "new block should be inserted for expanding unconditional branch");
1373   assert(MBB.pred_size() == 1);
1374 
1375   MachineFunction *MF = MBB.getParent();
1376   MachineRegisterInfo &MRI = MF->getRegInfo();
1377 
1378   // FIXME: Virtual register workaround for RegScavenger not working with empty
1379   // blocks.
1380   unsigned PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
1381 
1382   auto I = MBB.end();
1383 
1384   // We need to compute the offset relative to the instruction immediately after
1385   // s_getpc_b64. Insert pc arithmetic code before last terminator.
1386   MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
1387 
1388   // TODO: Handle > 32-bit block address.
1389   if (BrOffset >= 0) {
1390     BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
1391       .addReg(PCReg, RegState::Define, AMDGPU::sub0)
1392       .addReg(PCReg, 0, AMDGPU::sub0)
1393       .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_FORWARD);
1394     BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
1395       .addReg(PCReg, RegState::Define, AMDGPU::sub1)
1396       .addReg(PCReg, 0, AMDGPU::sub1)
1397       .addImm(0);
1398   } else {
1399     // Backwards branch.
1400     BuildMI(MBB, I, DL, get(AMDGPU::S_SUB_U32))
1401       .addReg(PCReg, RegState::Define, AMDGPU::sub0)
1402       .addReg(PCReg, 0, AMDGPU::sub0)
1403       .addMBB(&DestBB, AMDGPU::TF_LONG_BRANCH_BACKWARD);
1404     BuildMI(MBB, I, DL, get(AMDGPU::S_SUBB_U32))
1405       .addReg(PCReg, RegState::Define, AMDGPU::sub1)
1406       .addReg(PCReg, 0, AMDGPU::sub1)
1407       .addImm(0);
1408   }
1409 
1410   // Insert the indirect branch after the other terminator.
1411   BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
1412     .addReg(PCReg);
1413 
1414   // FIXME: If spilling is necessary, this will fail because this scavenger has
1415   // no emergency stack slots. It is non-trivial to spill in this situation,
1416   // because the restore code needs to be specially placed after the
1417   // jump. BranchRelaxation then needs to be made aware of the newly inserted
1418   // block.
1419   //
1420   // If a spill is needed for the pc register pair, we need to insert a spill
1421   // restore block right before the destination block, and insert a short branch
1422   // into the old destination block's fallthrough predecessor.
1423   // e.g.:
1424   //
1425   // s_cbranch_scc0 skip_long_branch:
1426   //
1427   // long_branch_bb:
1428   //   spill s[8:9]
1429   //   s_getpc_b64 s[8:9]
1430   //   s_add_u32 s8, s8, restore_bb
1431   //   s_addc_u32 s9, s9, 0
1432   //   s_setpc_b64 s[8:9]
1433   //
1434   // skip_long_branch:
1435   //   foo;
1436   //
1437   // .....
1438   //
1439   // dest_bb_fallthrough_predecessor:
1440   // bar;
1441   // s_branch dest_bb
1442   //
1443   // restore_bb:
1444   //  restore s[8:9]
1445   //  fallthrough dest_bb
1446   ///
1447   // dest_bb:
1448   //   buzz;
1449 
1450   RS->enterBasicBlockEnd(MBB);
1451   unsigned Scav = RS->scavengeRegister(&AMDGPU::SReg_64RegClass,
1452                                        MachineBasicBlock::iterator(GetPC), 0);
1453   MRI.replaceRegWith(PCReg, Scav);
1454   MRI.clearVirtRegs();
1455   RS->setRegUsed(Scav);
1456 
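  // Size of the emitted sequence in bytes: s_getpc_b64 (4) + s_add_u32 or
  // s_sub_u32 with a 32-bit literal (8) + s_addc_u32 or s_subb_u32 (4) +
  // s_setpc_b64 (4).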
1457   return 4 + 8 + 4 + 4;
1458 }
1459 
1460 unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
1461   switch (Cond) {
1462   case SIInstrInfo::SCC_TRUE:
1463     return AMDGPU::S_CBRANCH_SCC1;
1464   case SIInstrInfo::SCC_FALSE:
1465     return AMDGPU::S_CBRANCH_SCC0;
1466   case SIInstrInfo::VCCNZ:
1467     return AMDGPU::S_CBRANCH_VCCNZ;
1468   case SIInstrInfo::VCCZ:
1469     return AMDGPU::S_CBRANCH_VCCZ;
1470   case SIInstrInfo::EXECNZ:
1471     return AMDGPU::S_CBRANCH_EXECNZ;
1472   case SIInstrInfo::EXECZ:
1473     return AMDGPU::S_CBRANCH_EXECZ;
1474   default:
1475     llvm_unreachable("invalid branch predicate");
1476   }
1477 }
1478 
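// Map an s_cbranch_* opcode back to its BranchPredicate, or INVALID_BR if the
// opcode is not a conditional branch this analysis understands.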
1479 SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
1480   switch (Opcode) {
1481   case AMDGPU::S_CBRANCH_SCC0:
1482     return SCC_FALSE;
1483   case AMDGPU::S_CBRANCH_SCC1:
1484     return SCC_TRUE;
1485   case AMDGPU::S_CBRANCH_VCCNZ:
1486     return VCCNZ;
1487   case AMDGPU::S_CBRANCH_VCCZ:
1488     return VCCZ;
1489   case AMDGPU::S_CBRANCH_EXECNZ:
1490     return EXECNZ;
1491   case AMDGPU::S_CBRANCH_EXECZ:
1492     return EXECZ;
1493   default:
1494     return INVALID_BR;
1495   }
1496 }
1497 
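// Analyze the terminator sequence starting at I. Recognizes an unconditional
// s_branch, a lone conditional branch (fall-through on the false side), and a
// conditional branch followed by an unconditional one; anything else is
// reported as unanalyzable by returning true.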
1498 bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
1499                                     MachineBasicBlock::iterator I,
1500                                     MachineBasicBlock *&TBB,
1501                                     MachineBasicBlock *&FBB,
1502                                     SmallVectorImpl<MachineOperand> &Cond,
1503                                     bool AllowModify) const {
1504   if (I->getOpcode() == AMDGPU::S_BRANCH) {
1505     // Unconditional Branch
1506     TBB = I->getOperand(0).getMBB();
1507     return false;
1508   }
1509 
1510   MachineBasicBlock *CondBB = nullptr;
1511 
1512   if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
1513     CondBB = I->getOperand(1).getMBB();
1514     Cond.push_back(I->getOperand(0));
1515   } else {
1516     BranchPredicate Pred = getBranchPredicate(I->getOpcode());
1517     if (Pred == INVALID_BR)
1518       return true;
1519 
1520     CondBB = I->getOperand(0).getMBB();
1521     Cond.push_back(MachineOperand::CreateImm(Pred));
1522     Cond.push_back(I->getOperand(1)); // Save the branch register.
1523   }
1524   ++I;
1525 
1526   if (I == MBB.end()) {
1527     // Conditional branch followed by fall-through.
1528     TBB = CondBB;
1529     return false;
1530   }
1531 
1532   if (I->getOpcode() == AMDGPU::S_BRANCH) {
1533     TBB = CondBB;
1534     FBB = I->getOperand(0).getMBB();
1535     return false;
1536   }
1537 
1538   return true;
1539 }
1540 
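// Wrapper around analyzeBranchImpl that first skips over a leading
// SI_MASK_BRANCH. A following conditional branch is only accepted when it
// targets the same block as the mask branch and is predicated on exec, which
// is what divergent loop exits look like (see the comment below).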
1541 bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
1542                                 MachineBasicBlock *&FBB,
1543                                 SmallVectorImpl<MachineOperand> &Cond,
1544                                 bool AllowModify) const {
1545   MachineBasicBlock::iterator I = MBB.getFirstTerminator();
1546   if (I == MBB.end())
1547     return false;
1548 
1549   if (I->getOpcode() != AMDGPU::SI_MASK_BRANCH)
1550     return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
1551 
1552   ++I;
1553 
1554   // TODO: Should be able to treat as fallthrough?
1555   if (I == MBB.end())
1556     return true;
1557 
1558   if (analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify))
1559     return true;
1560 
1561   MachineBasicBlock *MaskBrDest = I->getOperand(0).getMBB();
1562 
1563   // Specifically handle the case where the conditional branch is to the same
1564   // destination as the mask branch. e.g.
1565   //
1566   // si_mask_branch BB8
1567   // s_cbranch_execz BB8
1568   // s_cbranch BB9
1569   //
1570   // This is required to understand divergent loops which may need the branches
1571   // to be relaxed.
1572   if (TBB != MaskBrDest || Cond.empty())
1573     return true;
1574 
1575   auto Pred = Cond[0].getImm();
1576   return (Pred != EXECZ && Pred != EXECNZ);
1577 }
1578 
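// Erase the terminators of MBB, leaving any SI_MASK_BRANCH in place, and
// return the number of instructions removed, optionally reporting their total
// size through BytesRemoved.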
1579 unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
1580                                    int *BytesRemoved) const {
1581   MachineBasicBlock::iterator I = MBB.getFirstTerminator();
1582 
1583   unsigned Count = 0;
1584   unsigned RemovedSize = 0;
1585   while (I != MBB.end()) {
1586     MachineBasicBlock::iterator Next = std::next(I);
1587     if (I->getOpcode() == AMDGPU::SI_MASK_BRANCH) {
1588       I = Next;
1589       continue;
1590     }
1591 
1592     RemovedSize += getInstSizeInBytes(*I);
1593     I->eraseFromParent();
1594     ++Count;
1595     I = Next;
1596   }
1597 
1598   if (BytesRemoved)
1599     *BytesRemoved = RemovedSize;
1600 
1601   return Count;
1602 }
1603 
1604 // Copy the flags onto the implicit condition register operand.
1605 static void preserveCondRegFlags(MachineOperand &CondReg,
1606                                  const MachineOperand &OrigCond) {
1607   CondReg.setIsUndef(OrigCond.isUndef());
1608   CondReg.setIsKill(OrigCond.isKill());
1609 }
1610 
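// Insert the branch sequence described by TBB/FBB/Cond at the end of MBB and
// return the number of instructions added. Cond is either empty
// (unconditional branch), a single register (non-uniform branch pseudo), or a
// BranchPredicate immediate followed by the condition register.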
1611 unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
1612                                    MachineBasicBlock *TBB,
1613                                    MachineBasicBlock *FBB,
1614                                    ArrayRef<MachineOperand> Cond,
1615                                    const DebugLoc &DL,
1616                                    int *BytesAdded) const {
1617   if (!FBB && Cond.empty()) {
1618     BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
1619       .addMBB(TBB);
1620     if (BytesAdded)
1621       *BytesAdded = 4;
1622     return 1;
1623   }
1624 
  if (Cond.size() == 1 && Cond[0].isReg()) {
    BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO))
      .add(Cond[0])
      .addMBB(TBB);
    return 1;
  }
1631 
1632   assert(TBB && Cond[0].isImm());
1633 
1634   unsigned Opcode
1635     = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
1636 
1637   if (!FBB) {
1639     MachineInstr *CondBr =
1640       BuildMI(&MBB, DL, get(Opcode))
1641       .addMBB(TBB);
1642 
1643     // Copy the flags onto the implicit condition register operand.
1644     preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
1645 
1646     if (BytesAdded)
1647       *BytesAdded = 4;
1648     return 1;
1649   }
1650 
1651   assert(TBB && FBB);
1652 
1653   MachineInstr *CondBr =
1654     BuildMI(&MBB, DL, get(Opcode))
1655     .addMBB(TBB);
1656   BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
1657     .addMBB(FBB);
1658 
  preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);

  if (BytesAdded)
    *BytesAdded = 8;
1665 
1666   return 2;
1667 }
1668 
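// Reverse the branch condition by negating the BranchPredicate immediate;
// opposite predicates are encoded as negated enum values (insertSelect relies
// on the same property).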
1669 bool SIInstrInfo::reverseBranchCondition(
1670   SmallVectorImpl<MachineOperand> &Cond) const {
1671   if (Cond.size() != 2) {
1672     return true;
1673   }
1674 
1675   if (Cond[0].isImm()) {
1676     Cond[0].setImm(-Cond[0].getImm());
1677     return false;
1678   }
1679 
1680   return true;
1681 }
1682 
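// Report whether a select on this condition can be emitted without a branch:
// VCC conditions allow VGPR selects built from v_cndmask_b32, SCC conditions
// allow SGPR selects built from s_cselect, with rough cycle estimates
// returned through the out parameters.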
1683 bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
1684                                   ArrayRef<MachineOperand> Cond,
1685                                   unsigned TrueReg, unsigned FalseReg,
1686                                   int &CondCycles,
1687                                   int &TrueCycles, int &FalseCycles) const {
1688   switch (Cond[0].getImm()) {
1689   case VCCNZ:
1690   case VCCZ: {
1691     const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1692     const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
1693     assert(MRI.getRegClass(FalseReg) == RC);
1694 
1695     int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;
1696     CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
1697 
1698     // Limit to equal cost for branch vs. N v_cndmask_b32s.
1699     return !RI.isSGPRClass(RC) && NumInsts <= 6;
1700   }
1701   case SCC_TRUE:
1702   case SCC_FALSE: {
1703     // FIXME: We could insert for VGPRs if we could replace the original compare
1704     // with a vector one.
1705     const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1706     const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
1707     assert(MRI.getRegClass(FalseReg) == RC);
1708 
1709     int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;
1710 
    // Widths that are a multiple of 64 bits can use s_cselect_b64.
1712     if (NumInsts % 2 == 0)
1713       NumInsts /= 2;
1714 
1715     CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
1716     return RI.isSGPRClass(RC);
1717   }
1718   default:
1719     return false;
1720   }
1721 }
1722 
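// Materialize a select of TrueReg/FalseReg under Cond into DstReg, using
// v_cndmask_b32 for VCC conditions and s_cselect_b32/_b64 for SCC conditions.
// Wider registers are split into 32-bit (or 64-bit for SALU) pieces that are
// recombined with REG_SEQUENCE.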
1723 void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
1724                                MachineBasicBlock::iterator I, const DebugLoc &DL,
1725                                unsigned DstReg, ArrayRef<MachineOperand> Cond,
1726                                unsigned TrueReg, unsigned FalseReg) const {
1727   BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
1728   if (Pred == VCCZ || Pred == SCC_FALSE) {
1729     Pred = static_cast<BranchPredicate>(-Pred);
1730     std::swap(TrueReg, FalseReg);
1731   }
1732 
1733   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1734   const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
1735   unsigned DstSize = RI.getRegSizeInBits(*DstRC);
1736 
1737   if (DstSize == 32) {
1738     unsigned SelOp = Pred == SCC_TRUE ?
1739       AMDGPU::S_CSELECT_B32 : AMDGPU::V_CNDMASK_B32_e32;
1740 
1741     // Instruction's operands are backwards from what is expected.
1742     MachineInstr *Select =
1743       BuildMI(MBB, I, DL, get(SelOp), DstReg)
1744       .addReg(FalseReg)
1745       .addReg(TrueReg);
1746 
1747     preserveCondRegFlags(Select->getOperand(3), Cond[1]);
1748     return;
1749   }
1750 
1751   if (DstSize == 64 && Pred == SCC_TRUE) {
1752     MachineInstr *Select =
1753       BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
1754       .addReg(FalseReg)
1755       .addReg(TrueReg);
1756 
1757     preserveCondRegFlags(Select->getOperand(3), Cond[1]);
1758     return;
1759   }
1760 
1761   static const int16_t Sub0_15[] = {
1762     AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
1763     AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
1764     AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
1765     AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
1766   };
1767 
1768   static const int16_t Sub0_15_64[] = {
1769     AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
1770     AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
1771     AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
1772     AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
1773   };
1774 
1775   unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
1776   const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
1777   const int16_t *SubIndices = Sub0_15;
1778   int NElts = DstSize / 32;
1779 
  // 64-bit select is only available for SALU.
1781   if (Pred == SCC_TRUE) {
1782     SelOp = AMDGPU::S_CSELECT_B64;
1783     EltRC = &AMDGPU::SGPR_64RegClass;
1784     SubIndices = Sub0_15_64;
1785 
1786     assert(NElts % 2 == 0);
1787     NElts /= 2;
1788   }
1789 
1790   MachineInstrBuilder MIB = BuildMI(
1791     MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
1792 
1793   I = MIB->getIterator();
1794 
1795   SmallVector<unsigned, 8> Regs;
1796   for (int Idx = 0; Idx != NElts; ++Idx) {
1797     unsigned DstElt = MRI.createVirtualRegister(EltRC);
1798     Regs.push_back(DstElt);
1799 
1800     unsigned SubIdx = SubIndices[Idx];
1801 
1802     MachineInstr *Select =
1803       BuildMI(MBB, I, DL, get(SelOp), DstElt)
1804       .addReg(FalseReg, 0, SubIdx)
1805       .addReg(TrueReg, 0, SubIdx);
1806     preserveCondRegFlags(Select->getOperand(3), Cond[1]);
1807 
1808     MIB.addReg(DstElt)
1809        .addImm(SubIdx);
1810   }
1811 }
1812 
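// Return true if MI is a plain move whose source value can be forwarded to
// its uses. Extra implicit operands on a v_mov indicate register indexing, in
// which case the source is not simply copied.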
1813 bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) const {
1814   switch (MI.getOpcode()) {
1815   case AMDGPU::V_MOV_B32_e32:
1816   case AMDGPU::V_MOV_B32_e64:
1817   case AMDGPU::V_MOV_B64_PSEUDO: {
1818     // If there are additional implicit register operands, this may be used for
1819     // register indexing so the source register operand isn't simply copied.
1820     unsigned NumOps = MI.getDesc().getNumOperands() +
1821       MI.getDesc().getNumImplicitUses();
1822 
1823     return MI.getNumOperands() == NumOps;
1824   }
1825   case AMDGPU::S_MOV_B32:
1826   case AMDGPU::S_MOV_B64:
1827   case AMDGPU::COPY:
1828     return true;
1829   default:
1830     return false;
1831   }
1832 }
1833 
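// Strip the src0/src1/src2 modifier operands from MI; used when rewriting a
// mad/mac into the madmk/madak forms, which do not encode modifiers.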
1834 static void removeModOperands(MachineInstr &MI) {
1835   unsigned Opc = MI.getOpcode();
1836   int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc,
1837                                               AMDGPU::OpName::src0_modifiers);
1838   int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc,
1839                                               AMDGPU::OpName::src1_modifiers);
1840   int Src2ModIdx = AMDGPU::getNamedOperandIdx(Opc,
1841                                               AMDGPU::OpName::src2_modifiers);
1842 
1843   MI.RemoveOperand(Src2ModIdx);
1844   MI.RemoveOperand(Src1ModIdx);
1845   MI.RemoveOperand(Src0ModIdx);
1846 }
1847 
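// Try to fold the immediate defined by DefMI into its single non-debug use
// UseMI: a COPY becomes a mov of the immediate, and a mad/mac becomes
// v_madmk/v_madak when the constant sits in a foldable source operand.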
1848 bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
1849                                 unsigned Reg, MachineRegisterInfo *MRI) const {
1850   if (!MRI->hasOneNonDBGUse(Reg))
1851     return false;
1852 
1853   unsigned Opc = UseMI.getOpcode();
1854   if (Opc == AMDGPU::COPY) {
1855     bool isVGPRCopy = RI.isVGPR(*MRI, UseMI.getOperand(0).getReg());
1856     switch (DefMI.getOpcode()) {
1857     default:
1858       return false;
1859     case AMDGPU::S_MOV_B64:
      // TODO: We could fold 64-bit immediates, but this gets complicated
      // when there are sub-registers.
1862       return false;
1863 
1864     case AMDGPU::V_MOV_B32_e32:
1865     case AMDGPU::S_MOV_B32:
1866       break;
1867     }
1868     unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
1869     const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0);
1870     assert(ImmOp);
1871     // FIXME: We could handle FrameIndex values here.
1872     if (!ImmOp->isImm()) {
1873       return false;
1874     }
1875     UseMI.setDesc(get(NewOpc));
1876     UseMI.getOperand(1).ChangeToImmediate(ImmOp->getImm());
1877     UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent());
1878     return true;
1879   }
1880 
1881   if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 ||
1882       Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64) {
1883     // Don't fold if we are using source or output modifiers. The new VOP2
1884     // instructions don't have them.
1885     if (hasAnyModifiersSet(UseMI))
1886       return false;
1887 
1888     const MachineOperand &ImmOp = DefMI.getOperand(1);
1889 
1890     // If this is a free constant, there's no reason to do this.
1891     // TODO: We could fold this here instead of letting SIFoldOperands do it
1892     // later.
1893     MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0);
1894 
1895     // Any src operand can be used for the legality check.
1896     if (isInlineConstant(UseMI, *Src0, ImmOp))
1897       return false;
1898 
1899     bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64;
1900     MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
1901     MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
1902 
1903     // Multiplied part is the constant: Use v_madmk_{f16, f32}.
1904     // We should only expect these to be on src0 due to canonicalizations.
1905     if (Src0->isReg() && Src0->getReg() == Reg) {
1906       if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
1907         return false;
1908 
1909       if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
1910         return false;
1911 
1912       // We need to swap operands 0 and 1 since madmk constant is at operand 1.
1913 
1914       const int64_t Imm = DefMI.getOperand(1).getImm();
1915 
1916       // FIXME: This would be a lot easier if we could return a new instruction
1917       // instead of having to modify in place.
1918 
1919       // Remove these first since they are at the end.
1920       UseMI.RemoveOperand(
1921           AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
1922       UseMI.RemoveOperand(
1923           AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
1924 
1925       unsigned Src1Reg = Src1->getReg();
1926       unsigned Src1SubReg = Src1->getSubReg();
1927       Src0->setReg(Src1Reg);
1928       Src0->setSubReg(Src1SubReg);
1929       Src0->setIsKill(Src1->isKill());
1930 
1931       if (Opc == AMDGPU::V_MAC_F32_e64 ||
1932           Opc == AMDGPU::V_MAC_F16_e64)
1933         UseMI.untieRegOperand(
1934             AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
1935 
1936       Src1->ChangeToImmediate(Imm);
1937 
1938       removeModOperands(UseMI);
1939       UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16));
1940 
1941       bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
1942       if (DeleteDef)
1943         DefMI.eraseFromParent();
1944 
1945       return true;
1946     }
1947 
1948     // Added part is the constant: Use v_madak_{f16, f32}.
1949     if (Src2->isReg() && Src2->getReg() == Reg) {
1950       // Not allowed to use constant bus for another operand.
1951       // We can however allow an inline immediate as src0.
1952       if (!Src0->isImm() &&
1953           (Src0->isReg() && RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))
1954         return false;
1955 
1956       if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
1957         return false;
1958 
1959       const int64_t Imm = DefMI.getOperand(1).getImm();
1960 
1961       // FIXME: This would be a lot easier if we could return a new instruction
1962       // instead of having to modify in place.
1963 
1964       // Remove these first since they are at the end.
1965       UseMI.RemoveOperand(
1966           AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
1967       UseMI.RemoveOperand(
1968           AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
1969 
1970       if (Opc == AMDGPU::V_MAC_F32_e64 ||
1971           Opc == AMDGPU::V_MAC_F16_e64)
1972         UseMI.untieRegOperand(
1973             AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
1974 
1975       // ChangingToImmediate adds Src2 back to the instruction.
1976       Src2->ChangeToImmediate(Imm);
1977 
1978       // These come before src2.
1979       removeModOperands(UseMI);
1980       UseMI.setDesc(get(IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16));
1981 
1982       bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
1983       if (DeleteDef)
1984         DefMI.eraseFromParent();
1985 
1986       return true;
1987     }
1988   }
1989 
1990   return false;
1991 }
1992 
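// Return true if the access [OffsetA, OffsetA + WidthA) is disjoint from
// [OffsetB, OffsetB + WidthB).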
1993 static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
1994                                 int WidthB, int OffsetB) {
1995   int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
1996   int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
1997   int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
1998   return LowOffset + LowWidth <= HighOffset;
1999 }
2000 
2001 bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr &MIa,
2002                                                MachineInstr &MIb) const {
2003   unsigned BaseReg0, BaseReg1;
2004   int64_t Offset0, Offset1;
2005 
2006   if (getMemOpBaseRegImmOfs(MIa, BaseReg0, Offset0, &RI) &&
2007       getMemOpBaseRegImmOfs(MIb, BaseReg1, Offset1, &RI)) {
2008 
2009     if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
2010       // FIXME: Handle ds_read2 / ds_write2.
2011       return false;
2012     }
2013     unsigned Width0 = (*MIa.memoperands_begin())->getSize();
2014     unsigned Width1 = (*MIb.memoperands_begin())->getSize();
2015     if (BaseReg0 == BaseReg1 &&
2016         offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) {
2017       return true;
2018     }
2019   }
2020 
2021   return false;
2022 }
2023 
2024 bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr &MIa,
2025                                                   MachineInstr &MIb,
2026                                                   AliasAnalysis *AA) const {
2027   assert((MIa.mayLoad() || MIa.mayStore()) &&
2028          "MIa must load from or modify a memory location");
2029   assert((MIb.mayLoad() || MIb.mayStore()) &&
2030          "MIb must load from or modify a memory location");
2031 
2032   if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
2033     return false;
2034 
2035   // XXX - Can we relax this between address spaces?
2036   if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
2037     return false;
2038 
2039   if (AA && MIa.hasOneMemOperand() && MIb.hasOneMemOperand()) {
2040     const MachineMemOperand *MMOa = *MIa.memoperands_begin();
2041     const MachineMemOperand *MMOb = *MIb.memoperands_begin();
2042     if (MMOa->getValue() && MMOb->getValue()) {
2043       MemoryLocation LocA(MMOa->getValue(), MMOa->getSize(), MMOa->getAAInfo());
2044       MemoryLocation LocB(MMOb->getValue(), MMOb->getSize(), MMOb->getAAInfo());
2045       if (!AA->alias(LocA, LocB))
2046         return true;
2047     }
2048   }
2049 
2050   // TODO: Should we check the address space from the MachineMemOperand? That
2051   // would allow us to distinguish objects we know don't alias based on the
2052   // underlying address space, even if it was lowered to a different one,
2053   // e.g. private accesses lowered to use MUBUF instructions on a scratch
2054   // buffer.
2055   if (isDS(MIa)) {
2056     if (isDS(MIb))
2057       return checkInstOffsetsDoNotOverlap(MIa, MIb);
2058 
2059     return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
2060   }
2061 
2062   if (isMUBUF(MIa) || isMTBUF(MIa)) {
2063     if (isMUBUF(MIb) || isMTBUF(MIb))
2064       return checkInstOffsetsDoNotOverlap(MIa, MIb);
2065 
2066     return !isFLAT(MIb) && !isSMRD(MIb);
2067   }
2068 
2069   if (isSMRD(MIa)) {
2070     if (isSMRD(MIb))
2071       return checkInstOffsetsDoNotOverlap(MIa, MIb);
2072 
    return !isFLAT(MIb) && !isMUBUF(MIb) && !isMTBUF(MIb);
2074   }
2075 
2076   if (isFLAT(MIa)) {
2077     if (isFLAT(MIb))
2078       return checkInstOffsetsDoNotOverlap(MIa, MIb);
2079 
2080     return false;
2081   }
2082 
2083   return false;
2084 }
2085 
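// Convert v_mac_f16/f32, whose dst is tied to src2, into the untied
// three-address v_mad_f16/f32 form. The e32 variants are rejected when src0
// is a literal that cannot be inlined.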
2086 MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
2087                                                  MachineInstr &MI,
2088                                                  LiveVariables *LV) const {
2089   bool IsF16 = false;
2090 
2091   switch (MI.getOpcode()) {
2092   default:
2093     return nullptr;
2094   case AMDGPU::V_MAC_F16_e64:
2095     IsF16 = true;
2096     LLVM_FALLTHROUGH;
2097   case AMDGPU::V_MAC_F32_e64:
2098     break;
2099   case AMDGPU::V_MAC_F16_e32:
2100     IsF16 = true;
2101     LLVM_FALLTHROUGH;
2102   case AMDGPU::V_MAC_F32_e32: {
2103     int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
2104                                              AMDGPU::OpName::src0);
2105     const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
2106     if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
2107       return nullptr;
2108     break;
2109   }
2110   }
2111 
2112   const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
2113   const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
2114   const MachineOperand *Src0Mods =
2115     getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
2116   const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
2117   const MachineOperand *Src1Mods =
2118     getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
2119   const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
2120   const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
2121   const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
2122 
2123   return BuildMI(*MBB, MI, MI.getDebugLoc(),
2124                  get(IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32))
2125       .add(*Dst)
2126       .addImm(Src0Mods ? Src0Mods->getImm() : 0)
2127       .add(*Src0)
2128       .addImm(Src1Mods ? Src1Mods->getImm() : 0)
2129       .add(*Src1)
2130       .addImm(0) // Src mods
2131       .add(*Src2)
2132       .addImm(Clamp ? Clamp->getImm() : 0)
2133       .addImm(Omod ? Omod->getImm() : 0);
2134 }
2135 
2136 // It's not generally safe to move VALU instructions across these since it will
2137 // start using the register as a base index rather than directly.
2138 // XXX - Why isn't hasSideEffects sufficient for these?
2139 static bool changesVGPRIndexingMode(const MachineInstr &MI) {
2140   switch (MI.getOpcode()) {
2141   case AMDGPU::S_SET_GPR_IDX_ON:
2142   case AMDGPU::S_SET_GPR_IDX_MODE:
2143   case AMDGPU::S_SET_GPR_IDX_OFF:
2144     return true;
2145   default:
2146     return false;
2147   }
2148 }
2149 
2150 bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
2151                                        const MachineBasicBlock *MBB,
2152                                        const MachineFunction &MF) const {
2153   // XXX - Do we want the SP check in the base implementation?
2154 
2155   // Target-independent instructions do not have an implicit-use of EXEC, even
2156   // when they operate on VGPRs. Treating EXEC modifications as scheduling
2157   // boundaries prevents incorrect movements of such instructions.
2158   return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF) ||
2159          MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
2160          MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
2161          MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
2162          changesVGPRIndexingMode(MI);
2163 }
2164 
2165 bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
2166   switch (Imm.getBitWidth()) {
2167   case 32:
2168     return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
2169                                         ST.hasInv2PiInlineImm());
2170   case 64:
2171     return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
2172                                         ST.hasInv2PiInlineImm());
2173   case 16:
2174     return ST.has16BitInsts() &&
2175            AMDGPU::isInlinableLiteral16(Imm.getSExtValue(),
2176                                         ST.hasInv2PiInlineImm());
2177   default:
2178     llvm_unreachable("invalid bitwidth");
2179   }
2180 }
2181 
2182 bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
2183                                    uint8_t OperandType) const {
2184   if (!MO.isImm() ||
2185       OperandType < AMDGPU::OPERAND_SRC_FIRST ||
2186       OperandType > AMDGPU::OPERAND_SRC_LAST)
2187     return false;
2188 
2189   // MachineOperand provides no way to tell the true operand size, since it only
2190   // records a 64-bit value. We need to know the size to determine if a 32-bit
2191   // floating point immediate bit pattern is legal for an integer immediate. It
2192   // would be for any 32-bit integer operand, but would not be for a 64-bit one.
2193 
2194   int64_t Imm = MO.getImm();
2195   switch (OperandType) {
2196   case AMDGPU::OPERAND_REG_IMM_INT32:
2197   case AMDGPU::OPERAND_REG_IMM_FP32:
2198   case AMDGPU::OPERAND_REG_INLINE_C_INT32:
2199   case AMDGPU::OPERAND_REG_INLINE_C_FP32: {
2200     int32_t Trunc = static_cast<int32_t>(Imm);
2201     return Trunc == Imm &&
2202            AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
2203   }
2204   case AMDGPU::OPERAND_REG_IMM_INT64:
2205   case AMDGPU::OPERAND_REG_IMM_FP64:
2206   case AMDGPU::OPERAND_REG_INLINE_C_INT64:
2207   case AMDGPU::OPERAND_REG_INLINE_C_FP64:
2208     return AMDGPU::isInlinableLiteral64(MO.getImm(),
2209                                         ST.hasInv2PiInlineImm());
2210   case AMDGPU::OPERAND_REG_IMM_INT16:
2211   case AMDGPU::OPERAND_REG_IMM_FP16:
2212   case AMDGPU::OPERAND_REG_INLINE_C_INT16:
2213   case AMDGPU::OPERAND_REG_INLINE_C_FP16: {
2214     if (isInt<16>(Imm) || isUInt<16>(Imm)) {
2215       // A few special case instructions have 16-bit operands on subtargets
2216       // where 16-bit instructions are not legal.
2217       // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
2218       // constants in these cases
2219       int16_t Trunc = static_cast<int16_t>(Imm);
2220       return ST.has16BitInsts() &&
2221              AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm());
2222     }
2223 
2224     return false;
2225   }
2226   case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
2227   case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: {
2228     uint32_t Trunc = static_cast<uint32_t>(Imm);
    return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm());
2230   }
2231   default:
2232     llvm_unreachable("invalid bitwidth");
2233   }
2234 }
2235 
2236 bool SIInstrInfo::isLiteralConstantLike(const MachineOperand &MO,
2237                                         const MCOperandInfo &OpInfo) const {
2238   switch (MO.getType()) {
2239   case MachineOperand::MO_Register:
2240     return false;
2241   case MachineOperand::MO_Immediate:
2242     return !isInlineConstant(MO, OpInfo);
2243   case MachineOperand::MO_FrameIndex:
2244   case MachineOperand::MO_MachineBasicBlock:
2245   case MachineOperand::MO_ExternalSymbol:
2246   case MachineOperand::MO_GlobalAddress:
2247   case MachineOperand::MO_MCSymbol:
2248     return true;
2249   default:
2250     llvm_unreachable("unexpected operand type");
2251   }
2252 }
2253 
2254 static bool compareMachineOp(const MachineOperand &Op0,
2255                              const MachineOperand &Op1) {
2256   if (Op0.getType() != Op1.getType())
2257     return false;
2258 
2259   switch (Op0.getType()) {
2260   case MachineOperand::MO_Register:
2261     return Op0.getReg() == Op1.getReg();
2262   case MachineOperand::MO_Immediate:
2263     return Op0.getImm() == Op1.getImm();
2264   default:
2265     llvm_unreachable("Didn't expect to be comparing these operand types");
2266   }
2267 }
2268 
2269 bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
2270                                     const MachineOperand &MO) const {
2271   const MCOperandInfo &OpInfo = get(MI.getOpcode()).OpInfo[OpNo];
2272 
2273   assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
2274 
2275   if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
2276     return true;
2277 
2278   if (OpInfo.RegClass < 0)
2279     return false;
2280 
2281   if (MO.isImm() && isInlineConstant(MO, OpInfo))
2282     return RI.opCanUseInlineConstant(OpInfo.OperandType);
2283 
2284   return RI.opCanUseLiteralConstant(OpInfo.OperandType);
2285 }
2286 
2287 bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
2288   int Op32 = AMDGPU::getVOPe32(Opcode);
2289   if (Op32 == -1)
2290     return false;
2291 
2292   return pseudoToMCOpcode(Op32) != -1;
2293 }
2294 
2295 bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
  // The src0_modifiers operand is present on all instructions
  // that have modifiers.
2298 
2299   return AMDGPU::getNamedOperandIdx(Opcode,
2300                                     AMDGPU::OpName::src0_modifiers) != -1;
2301 }
2302 
2303 bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
2304                                   unsigned OpName) const {
2305   const MachineOperand *Mods = getNamedOperand(MI, OpName);
2306   return Mods && Mods->getImm();
2307 }
2308 
2309 bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
2310   return hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
2311          hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
2312          hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers) ||
2313          hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
2314          hasModifiersSet(MI, AMDGPU::OpName::omod);
2315 }
2316 
2317 bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
2318                                   const MachineOperand &MO,
2319                                   const MCOperandInfo &OpInfo) const {
2320   // Literal constants use the constant bus.
2321   //if (isLiteralConstantLike(MO, OpInfo))
2322   // return true;
2323   if (MO.isImm())
2324     return !isInlineConstant(MO, OpInfo);
2325 
2326   if (!MO.isReg())
2327     return true; // Misc other operands like FrameIndex
2328 
2329   if (!MO.isUse())
2330     return false;
2331 
2332   if (TargetRegisterInfo::isVirtualRegister(MO.getReg()))
2333     return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));
2334 
2335   // FLAT_SCR is just an SGPR pair.
2336   if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCR))
2337     return true;
2338 
2339   // EXEC register uses the constant bus.
2340   if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC)
2341     return true;
2342 
2343   // SGPRs use the constant bus
2344   return (MO.getReg() == AMDGPU::VCC || MO.getReg() == AMDGPU::M0 ||
2345           (!MO.isImplicit() &&
2346            (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) ||
2347             AMDGPU::SGPR_64RegClass.contains(MO.getReg()))));
2348 }
2349 
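// Return the special SGPR (VCC, M0 or FLAT_SCR) implicitly read by MI, or
// AMDGPU::NoRegister if there is none. Such a read counts toward the constant
// bus limit.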
2350 static unsigned findImplicitSGPRRead(const MachineInstr &MI) {
2351   for (const MachineOperand &MO : MI.implicit_operands()) {
2352     // We only care about reads.
2353     if (MO.isDef())
2354       continue;
2355 
2356     switch (MO.getReg()) {
2357     case AMDGPU::VCC:
2358     case AMDGPU::M0:
2359     case AMDGPU::FLAT_SCR:
2360       return MO.getReg();
2361 
2362     default:
2363       break;
2364     }
2365   }
2366 
2367   return AMDGPU::NoRegister;
2368 }
2369 
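// Return true if MI is expected to carry an implicit use of the exec mask.
// VALU instructions other than v_readlane/v_writelane do; generic opcodes,
// SALU and SMRD instructions do not.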
2370 static bool shouldReadExec(const MachineInstr &MI) {
2371   if (SIInstrInfo::isVALU(MI)) {
2372     switch (MI.getOpcode()) {
2373     case AMDGPU::V_READLANE_B32:
2374     case AMDGPU::V_READLANE_B32_si:
2375     case AMDGPU::V_READLANE_B32_vi:
2376     case AMDGPU::V_WRITELANE_B32:
2377     case AMDGPU::V_WRITELANE_B32_si:
2378     case AMDGPU::V_WRITELANE_B32_vi:
2379       return false;
2380     }
2381 
2382     return true;
2383   }
2384 
2385   if (SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
2386       SIInstrInfo::isSALU(MI) ||
2387       SIInstrInfo::isSMRD(MI))
2388     return false;
2389 
2390   return true;
2391 }
2392 
2393 static bool isSubRegOf(const SIRegisterInfo &TRI,
2394                        const MachineOperand &SuperVec,
2395                        const MachineOperand &SubReg) {
2396   if (TargetRegisterInfo::isPhysicalRegister(SubReg.getReg()))
2397     return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
2398 
2399   return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
2400          SubReg.getReg() == SuperVec.getReg();
2401 }
2402 
2403 bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
2404                                     StringRef &ErrInfo) const {
2405   uint16_t Opcode = MI.getOpcode();
2406   if (SIInstrInfo::isGenericOpcode(MI.getOpcode()))
2407     return true;
2408 
2409   const MachineFunction *MF = MI.getParent()->getParent();
2410   const MachineRegisterInfo &MRI = MF->getRegInfo();
2411 
2412   int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
2413   int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
2414   int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
2415 
2416   // Make sure the number of operands is correct.
2417   const MCInstrDesc &Desc = get(Opcode);
2418   if (!Desc.isVariadic() &&
2419       Desc.getNumOperands() != MI.getNumExplicitOperands()) {
2420     ErrInfo = "Instruction has wrong number of operands.";
2421     return false;
2422   }
2423 
2424   if (MI.isInlineAsm()) {
2425     // Verify register classes for inlineasm constraints.
2426     for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
2427          I != E; ++I) {
2428       const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
2429       if (!RC)
2430         continue;
2431 
2432       const MachineOperand &Op = MI.getOperand(I);
2433       if (!Op.isReg())
2434         continue;
2435 
2436       unsigned Reg = Op.getReg();
2437       if (!TargetRegisterInfo::isVirtualRegister(Reg) && !RC->contains(Reg)) {
2438         ErrInfo = "inlineasm operand has incorrect register class.";
2439         return false;
2440       }
2441     }
2442 
2443     return true;
2444   }
2445 
2446   // Make sure the register classes are correct.
2447   for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
2448     if (MI.getOperand(i).isFPImm()) {
2449       ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
2450                 "all fp values to integers.";
2451       return false;
2452     }
2453 
2454     int RegClass = Desc.OpInfo[i].RegClass;
2455 
2456     switch (Desc.OpInfo[i].OperandType) {
2457     case MCOI::OPERAND_REGISTER:
2458       if (MI.getOperand(i).isImm()) {
2459         ErrInfo = "Illegal immediate value for operand.";
2460         return false;
2461       }
2462       break;
2463     case AMDGPU::OPERAND_REG_IMM_INT32:
2464     case AMDGPU::OPERAND_REG_IMM_FP32:
2465       break;
2466     case AMDGPU::OPERAND_REG_INLINE_C_INT32:
2467     case AMDGPU::OPERAND_REG_INLINE_C_FP32:
2468     case AMDGPU::OPERAND_REG_INLINE_C_INT64:
2469     case AMDGPU::OPERAND_REG_INLINE_C_FP64:
2470     case AMDGPU::OPERAND_REG_INLINE_C_INT16:
2471     case AMDGPU::OPERAND_REG_INLINE_C_FP16: {
2472       const MachineOperand &MO = MI.getOperand(i);
2473       if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
2474         ErrInfo = "Illegal immediate value for operand.";
2475         return false;
2476       }
2477       break;
2478     }
2479     case MCOI::OPERAND_IMMEDIATE:
2480     case AMDGPU::OPERAND_KIMM32:
2481       // Check if this operand is an immediate.
2482       // FrameIndex operands will be replaced by immediates, so they are
2483       // allowed.
2484       if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
2485         ErrInfo = "Expected immediate, but got non-immediate";
2486         return false;
2487       }
2488       LLVM_FALLTHROUGH;
2489     default:
2490       continue;
2491     }
2492 
2493     if (!MI.getOperand(i).isReg())
2494       continue;
2495 
2496     if (RegClass != -1) {
2497       unsigned Reg = MI.getOperand(i).getReg();
2498       if (Reg == AMDGPU::NoRegister ||
2499           TargetRegisterInfo::isVirtualRegister(Reg))
2500         continue;
2501 
2502       const TargetRegisterClass *RC = RI.getRegClass(RegClass);
2503       if (!RC->contains(Reg)) {
2504         ErrInfo = "Operand has incorrect register class.";
2505         return false;
2506       }
2507     }
2508   }
2509 
2510   // Verify SDWA
2511   if (isSDWA(MI)) {
2512     if (!ST.hasSDWA()) {
2513       ErrInfo = "SDWA is not supported on this target";
2514       return false;
2515     }
2516 
2517     int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
2518 
    const int OpIndices[] = { DstIdx, Src0Idx, Src1Idx, Src2Idx };

    for (int OpIdx : OpIndices) {
2522       if (OpIdx == -1)
2523         continue;
2524       const MachineOperand &MO = MI.getOperand(OpIdx);
2525 
2526       if (!ST.hasSDWAScalar()) {
        // Only VGPRs on VI
2528         if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
2529           ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
2530           return false;
2531         }
2532       } else {
2533         // No immediates on GFX9
2534         if (!MO.isReg()) {
2535           ErrInfo = "Only reg allowed as operands in SDWA instructions on GFX9";
2536           return false;
2537         }
2538       }
2539     }
2540 
2541     if (!ST.hasSDWAOmod()) {
2542       // No omod allowed on VI
2543       const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
2544       if (OMod != nullptr &&
2545         (!OMod->isImm() || OMod->getImm() != 0)) {
2546         ErrInfo = "OMod not allowed in SDWA instructions on VI";
2547         return false;
2548       }
2549     }
2550 
2551     uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
2552     if (isVOPC(BasicOpcode)) {
2553       if (!ST.hasSDWASdst() && DstIdx != -1) {
2554         // Only vcc allowed as dst on VI for VOPC
2555         const MachineOperand &Dst = MI.getOperand(DstIdx);
2556         if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
2557           ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
2558           return false;
2559         }
2560       } else if (!ST.hasSDWAOutModsVOPC()) {
2561         // No clamp allowed on GFX9 for VOPC
2562         const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
2563         if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
          ErrInfo = "Clamp not allowed in VOPC SDWA instructions on GFX9";
2565           return false;
2566         }
2567 
2568         // No omod allowed on GFX9 for VOPC
2569         const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
2570         if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
          ErrInfo = "OMod not allowed in VOPC SDWA instructions on GFX9";
2572           return false;
2573         }
2574       }
2575     }
2576   }
2577 
2578   // Verify VOP*
2579   if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI)) {
2580     // Only look at the true operands. Only a real operand can use the constant
2581     // bus, and we don't want to check pseudo-operands like the source modifier
2582     // flags.
2583     const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };
2584 
2585     unsigned ConstantBusCount = 0;
2586 
2587     if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1)
2588       ++ConstantBusCount;
2589 
2590     unsigned SGPRUsed = findImplicitSGPRRead(MI);
2591     if (SGPRUsed != AMDGPU::NoRegister)
2592       ++ConstantBusCount;
2593 
2594     for (int OpIdx : OpIndices) {
2595       if (OpIdx == -1)
2596         break;
2597       const MachineOperand &MO = MI.getOperand(OpIdx);
2598       if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) {
2599         if (MO.isReg()) {
2600           if (MO.getReg() != SGPRUsed)
2601             ++ConstantBusCount;
2602           SGPRUsed = MO.getReg();
2603         } else {
2604           ++ConstantBusCount;
2605         }
2606       }
2607     }
2608     if (ConstantBusCount > 1) {
2609       ErrInfo = "VOP* instruction uses the constant bus more than once";
2610       return false;
2611     }
2612   }
2613 
2614   // Verify misc. restrictions on specific instructions.
2615   if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 ||
2616       Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) {
2617     const MachineOperand &Src0 = MI.getOperand(Src0Idx);
2618     const MachineOperand &Src1 = MI.getOperand(Src1Idx);
2619     const MachineOperand &Src2 = MI.getOperand(Src2Idx);
2620     if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
2621       if (!compareMachineOp(Src0, Src1) &&
2622           !compareMachineOp(Src0, Src2)) {
2623         ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
2624         return false;
2625       }
2626     }
2627   }
2628 
2629   if (isSOPK(MI)) {
2630     int64_t Imm = getNamedOperand(MI, AMDGPU::OpName::simm16)->getImm();
2631     if (sopkIsZext(MI)) {
2632       if (!isUInt<16>(Imm)) {
2633         ErrInfo = "invalid immediate for SOPK instruction";
2634         return false;
2635       }
2636     } else {
2637       if (!isInt<16>(Imm)) {
2638         ErrInfo = "invalid immediate for SOPK instruction";
2639         return false;
2640       }
2641     }
2642   }
2643 
2644   if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
2645       Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
2646       Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
2647       Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
2648     const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
2649                        Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
2650 
2651     const unsigned StaticNumOps = Desc.getNumOperands() +
2652       Desc.getNumImplicitUses();
2653     const unsigned NumImplicitOps = IsDst ? 2 : 1;
2654 
2655     // Allow additional implicit operands. This allows a fixup done by the post
2656     // RA scheduler where the main implicit operand is killed and implicit-defs
2657     // are added for sub-registers that remain live after this instruction.
2658     if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
2659       ErrInfo = "missing implicit register operands";
2660       return false;
2661     }
2662 
2663     const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
2664     if (IsDst) {
2665       if (!Dst->isUse()) {
2666         ErrInfo = "v_movreld_b32 vdst should be a use operand";
2667         return false;
2668       }
2669 
2670       unsigned UseOpIdx;
2671       if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
2672           UseOpIdx != StaticNumOps + 1) {
2673         ErrInfo = "movrel implicit operands should be tied";
2674         return false;
2675       }
2676     }
2677 
2678     const MachineOperand &Src0 = MI.getOperand(Src0Idx);
2679     const MachineOperand &ImpUse
2680       = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
2681     if (!ImpUse.isReg() || !ImpUse.isUse() ||
2682         !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
2683       ErrInfo = "src0 should be subreg of implicit vector use";
2684       return false;
2685     }
2686   }
2687 
2688   // Make sure we aren't losing exec uses in the td files. This mostly requires
2689   // being careful when using let Uses to try to add other use registers.
2690   if (shouldReadExec(MI)) {
2691     if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
2692       ErrInfo = "VALU instruction does not implicitly read exec mask";
2693       return false;
2694     }
2695   }
2696 
2697   if (isSMRD(MI)) {
2698     if (MI.mayStore()) {
2699       // The register offset form of scalar stores may only use m0 as the
2700       // soffset register.
2701       const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soff);
2702       if (Soff && Soff->getReg() != AMDGPU::M0) {
2703         ErrInfo = "scalar stores must use m0 as offset register";
2704         return false;
2705       }
2706     }
2707   }
2708 
2709   if (isFLAT(MI) && !MF->getSubtarget<SISubtarget>().hasFlatInstOffsets()) {
2710     const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
2711     if (Offset->getImm() != 0) {
2712       ErrInfo = "subtarget does not support offsets in flat instructions";
2713       return false;
2714     }
2715   }
2716 
2717   return true;
2718 }
2719 
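// Return the opcode to use when MI has to be moved to the VALU: usually the
// equivalent VALU instruction (SCC branches map to their VCC forms), or
// INSTRUCTION_LIST_END if there is no replacement.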
2720 unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
2721   switch (MI.getOpcode()) {
2722   default: return AMDGPU::INSTRUCTION_LIST_END;
2723   case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
2724   case AMDGPU::COPY: return AMDGPU::COPY;
2725   case AMDGPU::PHI: return AMDGPU::PHI;
2726   case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
2727   case AMDGPU::WQM: return AMDGPU::WQM;
2728   case AMDGPU::WWM: return AMDGPU::WWM;
2729   case AMDGPU::S_MOV_B32:
2730     return MI.getOperand(1).isReg() ?
2731            AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
2732   case AMDGPU::S_ADD_I32:
2733   case AMDGPU::S_ADD_U32: return AMDGPU::V_ADD_I32_e32;
2734   case AMDGPU::S_ADDC_U32: return AMDGPU::V_ADDC_U32_e32;
2735   case AMDGPU::S_SUB_I32:
2736   case AMDGPU::S_SUB_U32: return AMDGPU::V_SUB_I32_e32;
2737   case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
2738   case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_I32;
2739   case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
2740   case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
2741   case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
2742   case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
2743   case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
2744   case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
2745   case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
2746   case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
2747   case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64;
2748   case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
2749   case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64;
2750   case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
2751   case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64;
2752   case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32;
2753   case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32;
2754   case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32;
2755   case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32;
2756   case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
2757   case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
2758   case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
2759   case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
2760   case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32;
2761   case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32;
2762   case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32;
2763   case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32;
2764   case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32;
2765   case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32;
2766   case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32;
2767   case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32;
2768   case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32;
2769   case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32;
2770   case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32;
2771   case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32;
2772   case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e32;
2773   case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e32;
2774   case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
2775   case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
2776   case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
2777   case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
2778   case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
2779   case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
2780   }
2781 }
2782 
2783 bool SIInstrInfo::isSALUOpSupportedOnVALU(const MachineInstr &MI) const {
2784   return getVALUOp(MI) != AMDGPU::INSTRUCTION_LIST_END;
2785 }
2786 
2787 const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
2788                                                       unsigned OpNo) const {
2789   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
2790   const MCInstrDesc &Desc = get(MI.getOpcode());
2791   if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
2792       Desc.OpInfo[OpNo].RegClass == -1) {
2793     unsigned Reg = MI.getOperand(OpNo).getReg();
2794 
2795     if (TargetRegisterInfo::isVirtualRegister(Reg))
2796       return MRI.getRegClass(Reg);
2797     return RI.getPhysRegClass(Reg);
2798   }
2799 
2800   unsigned RCID = Desc.OpInfo[OpNo].RegClass;
2801   return RI.getRegClass(RCID);
2802 }
2803 
2804 bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const {
2805   switch (MI.getOpcode()) {
2806   case AMDGPU::COPY:
2807   case AMDGPU::REG_SEQUENCE:
2808   case AMDGPU::PHI:
2809   case AMDGPU::INSERT_SUBREG:
2810     return RI.hasVGPRs(getOpRegClass(MI, 0));
2811   default:
2812     return RI.hasVGPRs(getOpRegClass(MI, OpNo));
2813   }
2814 }
2815 
2816 void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
2817   MachineBasicBlock::iterator I = MI;
2818   MachineBasicBlock *MBB = MI.getParent();
2819   MachineOperand &MO = MI.getOperand(OpIdx);
2820   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
2821   unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass;
2822   const TargetRegisterClass *RC = RI.getRegClass(RCID);
2823   unsigned Opcode = AMDGPU::V_MOV_B32_e32;
2824   if (MO.isReg())
2825     Opcode = AMDGPU::COPY;
2826   else if (RI.isSGPRClass(RC))
2827     Opcode = AMDGPU::S_MOV_B32;
2828 
2829   const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
2830   if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC))
2831     VRC = &AMDGPU::VReg_64RegClass;
2832   else
2833     VRC = &AMDGPU::VGPR_32RegClass;
2834 
2835   unsigned Reg = MRI.createVirtualRegister(VRC);
2836   DebugLoc DL = MBB->findDebugLoc(I);
2837   BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
2838   MO.ChangeToRegister(Reg, false);
2839 }
2840 
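// Copy subregister SubIdx of SuperReg into a fresh virtual register of class
// SubRC and return it. If SuperReg itself carries a subregister index, it is
// first copied into a new SuperRC register so the indices do not need to be
// composed.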
2841 unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI,
2842                                          MachineRegisterInfo &MRI,
2843                                          MachineOperand &SuperReg,
2844                                          const TargetRegisterClass *SuperRC,
2845                                          unsigned SubIdx,
2846                                          const TargetRegisterClass *SubRC)
2847                                          const {
2848   MachineBasicBlock *MBB = MI->getParent();
2849   DebugLoc DL = MI->getDebugLoc();
2850   unsigned SubReg = MRI.createVirtualRegister(SubRC);
2851 
2852   if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) {
2853     BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
2854       .addReg(SuperReg.getReg(), 0, SubIdx);
2855     return SubReg;
2856   }
2857 
2858   // Just in case the super register is itself a sub-register, copy it to a new
2859   // value so we don't need to worry about merging its subreg index with the
2860   // SubIdx passed to this function. The register coalescer should be able to
2861   // eliminate this extra copy.
2862   unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC);
2863 
2864   BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg)
2865     .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg());
2866 
2867   BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
2868     .addReg(NewSuperReg, 0, SubIdx);
2869 
2870   return SubReg;
2871 }
2872 
2873 MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
2874   MachineBasicBlock::iterator MII,
2875   MachineRegisterInfo &MRI,
2876   MachineOperand &Op,
2877   const TargetRegisterClass *SuperRC,
2878   unsigned SubIdx,
2879   const TargetRegisterClass *SubRC) const {
2880   if (Op.isImm()) {
2881     if (SubIdx == AMDGPU::sub0)
2882       return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
2883     if (SubIdx == AMDGPU::sub1)
2884       return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
2885 
2886     llvm_unreachable("Unhandled register index for immediate");
2887   }
2888 
2889   unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
2890                                        SubIdx, SubRC);
2891   return MachineOperand::CreateReg(SubReg, false);
2892 }
2893 
2894 // Change the order of operands from (0, 1, 2) to (0, 2, 1)
2895 void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
2896   assert(Inst.getNumExplicitOperands() == 3);
2897   MachineOperand Op1 = Inst.getOperand(1);
2898   Inst.RemoveOperand(1);
2899   Inst.addOperand(Op1);
2900 }
2901 
2902 bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
2903                                     const MCOperandInfo &OpInfo,
2904                                     const MachineOperand &MO) const {
2905   if (!MO.isReg())
2906     return false;
2907 
2908   unsigned Reg = MO.getReg();
2909   const TargetRegisterClass *RC =
2910     TargetRegisterInfo::isVirtualRegister(Reg) ?
2911     MRI.getRegClass(Reg) :
2912     RI.getPhysRegClass(Reg);
2913 
2914   const SIRegisterInfo *TRI =
2915       static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
2916   RC = TRI->getSubRegClass(RC, MO.getSubReg());
2917 
2918   // In order to be legal, the common sub-class must be equal to the
2919   // class of the current operand.  For example:
2920   //
2921   // v_mov_b32 s0 ; Operand defined as vsrc_b32
2922   //              ; RI.getCommonSubClass(s0,vsrc_b32) = sgpr ; LEGAL
2923   //
2924   // s_sendmsg 0, s0 ; Operand defined as m0reg
2925   //                 ; RI.getCommonSubClass(s0,m0reg) = m0reg ; NOT LEGAL
2926 
2927   return RI.getCommonSubClass(RC, RI.getRegClass(OpInfo.RegClass)) == RC;
2928 }
2929 
2930 bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
2931                                      const MCOperandInfo &OpInfo,
2932                                      const MachineOperand &MO) const {
2933   if (MO.isReg())
2934     return isLegalRegOperand(MRI, OpInfo, MO);
2935 
2936   // Handle non-register types that are treated like immediates.
2937   assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
2938   return true;
2939 }
2940 
2941 bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
2942                                  const MachineOperand *MO) const {
2943   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
2944   const MCInstrDesc &InstDesc = MI.getDesc();
2945   const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx];
2946   const TargetRegisterClass *DefinedRC =
2947       OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
2948   if (!MO)
2949     MO = &MI.getOperand(OpIdx);
2950 
2951   if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) {
2952 
2953     RegSubRegPair SGPRUsed;
2954     if (MO->isReg())
2955       SGPRUsed = RegSubRegPair(MO->getReg(), MO->getSubReg());
2956 
2957     for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
2958       if (i == OpIdx)
2959         continue;
2960       const MachineOperand &Op = MI.getOperand(i);
2961       if (Op.isReg()) {
2962         if ((Op.getReg() != SGPRUsed.Reg || Op.getSubReg() != SGPRUsed.SubReg) &&
2963             usesConstantBus(MRI, Op, InstDesc.OpInfo[i])) {
2964           return false;
2965         }
2966       } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) {
2967         return false;
2968       }
2969     }
2970   }
2971 
2972   if (MO->isReg()) {
2973     assert(DefinedRC);
2974     return isLegalRegOperand(MRI, OpInfo, *MO);
2975   }
2976 
2977   // Handle non-register types that are treated like immediates.
2978   assert(MO->isImm() || MO->isTargetIndex() || MO->isFI());
2979 
2980   if (!DefinedRC) {
2981     // This operand expects an immediate.
2982     return true;
2983   }
2984 
2985   return isImmOperandLegal(MI, OpIdx, *MO);
2986 }
2987 
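// Legalize the source operands of a VOP2 (or VOPC) instruction: commute the
// operands when that is enough to make them legal, otherwise move the
// offending operand into a VGPR.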
2988 void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
2989                                        MachineInstr &MI) const {
2990   unsigned Opc = MI.getOpcode();
2991   const MCInstrDesc &InstrDesc = get(Opc);
2992 
2993   int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
2994   MachineOperand &Src1 = MI.getOperand(Src1Idx);
2995 
2996   // If there is an implicit SGPR use, such as the VCC use for
2997   // v_addc_u32/v_subb_u32, it already consumes the one allowed constant bus use.
2998   //
2999   // Note we do not need to worry about literal constants here. They are
3000   // disabled for these operand types because they would always violate the
3001   // one constant bus use rule.
3002   bool HasImplicitSGPR = findImplicitSGPRRead(MI) != AMDGPU::NoRegister;
3003   if (HasImplicitSGPR) {
3004     int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
3005     MachineOperand &Src0 = MI.getOperand(Src0Idx);
3006 
3007     if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg()))
3008       legalizeOpWithMove(MI, Src0Idx);
3009   }
3010 
3011   // VOP2 instructions support all operand types for src0, so we don't need to
3012   // check its legality. If src1 is already legal, we don't need to do anything.
3013   if (isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src1))
3014     return;
3015 
3016   // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
3017   // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
3018   // select is uniform.
3019   if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
3020       RI.isVGPR(MRI, Src1.getReg())) {
3021     unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3022     const DebugLoc &DL = MI.getDebugLoc();
3023     BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
3024         .add(Src1);
3025     Src1.ChangeToRegister(Reg, false);
3026     return;
3027   }
3028 
3029   // We do not use commuteInstruction here because it is too aggressive and will
3030   // commute whenever possible. We only want to commute here if it improves
3031   // legality. This can be called a fairly large number of times, so don't waste
3032   // compile time pointlessly swapping and checking legality again.
3033   if (HasImplicitSGPR || !MI.isCommutable()) {
3034     legalizeOpWithMove(MI, Src1Idx);
3035     return;
3036   }
3037 
3038   int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
3039   MachineOperand &Src0 = MI.getOperand(Src0Idx);
3040 
3041   // If src0 can be used as src1, commuting will make the operands legal.
3042   // Otherwise we have to give up and insert a move.
3043   //
3044   // TODO: Other immediate-like operand kinds could be commuted if there was a
3045   // MachineOperand::ChangeTo* for them.
3046   if ((!Src1.isImm() && !Src1.isReg()) ||
3047       !isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0)) {
3048     legalizeOpWithMove(MI, Src1Idx);
3049     return;
3050   }
3051 
3052   int CommutedOpc = commuteOpcode(MI);
3053   if (CommutedOpc == -1) {
3054     legalizeOpWithMove(MI, Src1Idx);
3055     return;
3056   }
3057 
3058   MI.setDesc(get(CommutedOpc));
3059 
3060   unsigned Src0Reg = Src0.getReg();
3061   unsigned Src0SubReg = Src0.getSubReg();
3062   bool Src0Kill = Src0.isKill();
3063 
3064   if (Src1.isImm())
3065     Src0.ChangeToImmediate(Src1.getImm());
3066   else if (Src1.isReg()) {
3067     Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
3068     Src0.setSubReg(Src1.getSubReg());
3069   } else
3070     llvm_unreachable("Should only have register or immediate operands");
3071 
3072   Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
3073   Src1.setSubReg(Src0SubReg);
3074 }
3075 
3076 // Legalize VOP3 operands. All operand types are supported for any operand,
3077 // and literal constants are not allowed and should never be seen, so we only
3078 // need to worry about inserting copies when the instruction uses multiple
3079 // SGPR operands.
3080 void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
3081                                        MachineInstr &MI) const {
3082   unsigned Opc = MI.getOpcode();
3083 
3084   int VOP3Idx[3] = {
3085     AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
3086     AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
3087     AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
3088   };
3089 
3090   // Find the one SGPR operand we are allowed to use.
3091   unsigned SGPRReg = findUsedSGPR(MI, VOP3Idx);
3092 
3093   for (unsigned i = 0; i < 3; ++i) {
3094     int Idx = VOP3Idx[i];
3095     if (Idx == -1)
3096       break;
3097     MachineOperand &MO = MI.getOperand(Idx);
3098 
3099     // We should never see a VOP3 instruction with an illegal immediate operand.
3100     if (!MO.isReg())
3101       continue;
3102 
3103     if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
3104       continue; // VGPRs are legal
3105 
3106     if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) {
3107       SGPRReg = MO.getReg();
3108       // We can use one SGPR in each VOP3 instruction.
3109       continue;
3110     }
3111 
3112     // If we make it this far, then the operand is not legal and we must
3113     // legalize it.
3114     legalizeOpWithMove(MI, Idx);
3115   }
3116 }
3117 
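// Copy the value of VGPR SrcReg into a fresh SGPR of the same width by
// emitting one V_READFIRSTLANE_B32 per 32-bit sub-register and combining the
// results with a REG_SEQUENCE. Returns the new SGPR.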
3118 unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI,
3119                                          MachineRegisterInfo &MRI) const {
3120   const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
3121   const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
3122   unsigned DstReg = MRI.createVirtualRegister(SRC);
3123   unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
3124 
3125   SmallVector<unsigned, 8> SRegs;
3126   for (unsigned i = 0; i < SubRegs; ++i) {
3127     unsigned SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3128     BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
3129             get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
3130         .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
3131     SRegs.push_back(SGPR);
3132   }
3133 
3134   MachineInstrBuilder MIB =
3135       BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
3136               get(AMDGPU::REG_SEQUENCE), DstReg);
3137   for (unsigned i = 0; i < SubRegs; ++i) {
3138     MIB.addReg(SRegs[i]);
3139     MIB.addImm(RI.getSubRegFromChannel(i));
3140   }
3141   return DstReg;
3142 }
3143 
3144 void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
3145                                        MachineInstr &MI) const {
3146 
3147   // If the pointer is stored in VGPRs, then we need to move it to
3148   // SGPRs using v_readfirstlane.  This is safe because we only select
3149   // loads with uniform pointers to SMRD instructions, so we know the
3150   // pointer value is uniform.
3151   MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
3152   if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
3153       unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
3154       SBase->setReg(SGPR);
3155   }
3156 }
3157 
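// If Op does not already have register class DstRC, insert a COPY into DstRC
// at I and rewrite Op to use it. When the source is defined by a
// move-immediate, try to fold the immediate into the new copy.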
3158 void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
3159                                          MachineBasicBlock::iterator I,
3160                                          const TargetRegisterClass *DstRC,
3161                                          MachineOperand &Op,
3162                                          MachineRegisterInfo &MRI,
3163                                          const DebugLoc &DL) const {
3164   unsigned OpReg = Op.getReg();
3165   unsigned OpSubReg = Op.getSubReg();
3166 
3167   const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
3168       RI.getRegClassForReg(MRI, OpReg), OpSubReg);
3169 
3170   // Check if operand is already the correct register class.
3171   if (DstRC == OpRC)
3172     return;
3173 
3174   unsigned DstReg = MRI.createVirtualRegister(DstRC);
3175   MachineInstr *Copy =
3176       BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op);
3177 
3178   Op.setReg(DstReg);
3179   Op.setSubReg(0);
3180 
3181   MachineInstr *Def = MRI.getVRegDef(OpReg);
3182   if (!Def)
3183     return;
3184 
3185   // Try to eliminate the copy if it is copying an immediate value.
3186   if (Def->isMoveImmediate())
3187     FoldImmediate(*Copy, *Def, OpReg, &MRI);
3188 }
3189 
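// Legalize all operands of MI so that each register operand satisfies its
// required operand class, dispatching to the VOP2/VOP3/SMRD helpers and to the
// special handling for PHI, REG_SEQUENCE, INSERT_SUBREG and buffer/image
// instructions below.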
3190 void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
3191   MachineFunction &MF = *MI.getParent()->getParent();
3192   MachineRegisterInfo &MRI = MF.getRegInfo();
3193 
3194   // Legalize VOP2
3195   if (isVOP2(MI) || isVOPC(MI)) {
3196     legalizeOperandsVOP2(MRI, MI);
3197     return;
3198   }
3199 
3200   // Legalize VOP3
3201   if (isVOP3(MI)) {
3202     legalizeOperandsVOP3(MRI, MI);
3203     return;
3204   }
3205 
3206   // Legalize SMRD
3207   if (isSMRD(MI)) {
3208     legalizeOperandsSMRD(MRI, MI);
3209     return;
3210   }
3211 
3212   // Legalize REG_SEQUENCE and PHI
3213   // The register class of the operands must be the same type as the register
3214   // class of the output.
3215   if (MI.getOpcode() == AMDGPU::PHI) {
3216     const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
3217     for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
3218       if (!MI.getOperand(i).isReg() ||
3219           !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg()))
3220         continue;
3221       const TargetRegisterClass *OpRC =
3222           MRI.getRegClass(MI.getOperand(i).getReg());
3223       if (RI.hasVGPRs(OpRC)) {
3224         VRC = OpRC;
3225       } else {
3226         SRC = OpRC;
3227       }
3228     }
3229 
3230     // If any of the operands are VGPR registers, then they all must be;
3231     // otherwise we will create illegal VGPR->SGPR copies when legalizing
3232     // them.
3233     if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
3234       if (!VRC) {
3235         assert(SRC);
3236         VRC = RI.getEquivalentVGPRClass(SRC);
3237       }
3238       RC = VRC;
3239     } else {
3240       RC = SRC;
3241     }
3242 
3243     // Update all the operands so they have the same type.
3244     for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3245       MachineOperand &Op = MI.getOperand(I);
3246       if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
3247         continue;
3248 
3249       // MI is a PHI instruction.
3250       MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
3251       MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
3252 
3253       // Avoid creating no-op copies with the same src and dst reg class.  These
3254       // confuse some of the machine passes.
3255       legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
3256     }
3257   }
3258 
3259   // REG_SEQUENCE doesn't really require operand legalization, but if one has a
3260   // VGPR dest type and SGPR sources, insert copies so all operands are
3261   // VGPRs. This seems to help operand folding / the register coalescer.
3262   if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
3263     MachineBasicBlock *MBB = MI.getParent();
3264     const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
3265     if (RI.hasVGPRs(DstRC)) {
3266       // Update all the operands so they are VGPR register classes. These may
3267       // not be the same register class because REG_SEQUENCE supports mixing
3268       // subregister index types, e.g. sub0_sub1 + sub2 + sub3.
3269       for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
3270         MachineOperand &Op = MI.getOperand(I);
3271         if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
3272           continue;
3273 
3274         const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
3275         const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
3276         if (VRC == OpRC)
3277           continue;
3278 
3279         legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
3280         Op.setIsKill();
3281       }
3282     }
3283 
3284     return;
3285   }
3286 
3287   // Legalize INSERT_SUBREG
3288   // src0 must have the same register class as dst
3289   if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
3290     unsigned Dst = MI.getOperand(0).getReg();
3291     unsigned Src0 = MI.getOperand(1).getReg();
3292     const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
3293     const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
3294     if (DstRC != Src0RC) {
3295       MachineBasicBlock *MBB = MI.getParent();
3296       MachineOperand &Op = MI.getOperand(1);
3297       legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
3298     }
3299     return;
3300   }
3301 
3302   // Legalize MIMG and MUBUF/MTBUF for shaders.
3303   //
3304   // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
3305   // scratch memory access. In both cases, the legalization never involves
3306   // conversion to the addr64 form.
3307   if (isMIMG(MI) ||
3308       (AMDGPU::isShader(MF.getFunction()->getCallingConv()) &&
3309        (isMUBUF(MI) || isMTBUF(MI)))) {
3310     MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc);
3311     if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) {
3312       unsigned SGPR = readlaneVGPRToSGPR(SRsrc->getReg(), MI, MRI);
3313       SRsrc->setReg(SGPR);
3314     }
3315 
3316     MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp);
3317     if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) {
3318       unsigned SGPR = readlaneVGPRToSGPR(SSamp->getReg(), MI, MRI);
3319       SSamp->setReg(SGPR);
3320     }
3321     return;
3322   }
3323 
3324   // Legalize MUBUF* instructions by converting to addr64 form.
3325   // FIXME: If we start using the non-addr64 instructions for compute, we
3326   // may need to legalize them as above. This especially applies to the
3327   // buffer_load_format_* variants and variants with idxen (or bothen).
3328   int SRsrcIdx =
3329       AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
3330   if (SRsrcIdx != -1) {
3331     // We have an MUBUF instruction
3332     MachineOperand *SRsrc = &MI.getOperand(SRsrcIdx);
3333     unsigned SRsrcRC = get(MI.getOpcode()).OpInfo[SRsrcIdx].RegClass;
3334     if (RI.getCommonSubClass(MRI.getRegClass(SRsrc->getReg()),
3335                                              RI.getRegClass(SRsrcRC))) {
3336       // The operands are legal.
3337       // FIXME: We may need to legalize operands besides srsrc.
3338       return;
3339     }
3340 
3341     MachineBasicBlock &MBB = *MI.getParent();
3342 
3343     // Extract the ptr from the resource descriptor.
3344     unsigned SRsrcPtr = buildExtractSubReg(MI, MRI, *SRsrc,
3345       &AMDGPU::VReg_128RegClass, AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
3346 
3347     // Create an empty resource descriptor
3348     unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3349     unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3350     unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3351     unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
3352     uint64_t RsrcDataFormat = getDefaultRsrcDataFormat();
3353 
3354     // Zero64 = 0
3355     BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B64), Zero64)
3356         .addImm(0);
3357 
3358     // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
3359     BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
3360         .addImm(RsrcDataFormat & 0xFFFFFFFF);
3361 
3362     // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
3363     BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
3364         .addImm(RsrcDataFormat >> 32);
3365 
3366     // NewSRsrc = {Zero64, SRsrcFormat}
3367     BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewSRsrc)
3368         .addReg(Zero64)
3369         .addImm(AMDGPU::sub0_sub1)
3370         .addReg(SRsrcFormatLo)
3371         .addImm(AMDGPU::sub2)
3372         .addReg(SRsrcFormatHi)
3373         .addImm(AMDGPU::sub3);
3374 
3375     MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
3376     unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
3377     if (VAddr) {
3378       // This is already an ADDR64 instruction so we need to add the pointer
3379       // extracted from the resource descriptor to the current value of VAddr.
3380       unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3381       unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3382 
3383       // NewVaddrLo = SRsrcPtr:sub0 + VAddr:sub0
3384       DebugLoc DL = MI.getDebugLoc();
3385       BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo)
3386         .addReg(SRsrcPtr, 0, AMDGPU::sub0)
3387         .addReg(VAddr->getReg(), 0, AMDGPU::sub0);
3388 
3389       // NewVaddrHi = SRsrcPtr:sub1 + VAddr:sub1
3390       BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e32), NewVAddrHi)
3391         .addReg(SRsrcPtr, 0, AMDGPU::sub1)
3392         .addReg(VAddr->getReg(), 0, AMDGPU::sub1);
3393 
3394       // NewVaddr = {NewVaddrHi, NewVaddrLo}
3395       BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
3396           .addReg(NewVAddrLo)
3397           .addImm(AMDGPU::sub0)
3398           .addReg(NewVAddrHi)
3399           .addImm(AMDGPU::sub1);
3400     } else {
3401       // This instruction is the _OFFSET variant, so we need to convert it to
3402       // ADDR64.
3403       assert(MBB.getParent()->getSubtarget<SISubtarget>().getGeneration()
3404              < SISubtarget::VOLCANIC_ISLANDS &&
3405              "FIXME: Need to emit flat atomics here");
3406 
3407       MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
3408       MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
3409       MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
3410       unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
3411 
3412       // Atomics with return have an additional tied operand and are
3413       // missing some of the special bits.
3414       MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
3415       MachineInstr *Addr64;
3416 
3417       if (!VDataIn) {
3418         // Regular buffer load / store.
3419         MachineInstrBuilder MIB =
3420             BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
3421                 .add(*VData)
3422                 .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
3423                 // This will be replaced later
3424                 // with the new value of vaddr.
3425                 .add(*SRsrc)
3426                 .add(*SOffset)
3427                 .add(*Offset);
3428 
3429         // Atomics do not have this operand.
3430         if (const MachineOperand *GLC =
3431                 getNamedOperand(MI, AMDGPU::OpName::glc)) {
3432           MIB.addImm(GLC->getImm());
3433         }
3434 
3435         MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc));
3436 
3437         if (const MachineOperand *TFE =
3438                 getNamedOperand(MI, AMDGPU::OpName::tfe)) {
3439           MIB.addImm(TFE->getImm());
3440         }
3441 
3442         MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
3443         Addr64 = MIB;
3444       } else {
3445         // Atomics with return.
3446         Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
3447                      .add(*VData)
3448                      .add(*VDataIn)
3449                      .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
3450                      // This will be replaced later
3451                      // with the new value of vaddr.
3452                      .add(*SRsrc)
3453                      .add(*SOffset)
3454                      .add(*Offset)
3455                      .addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc))
3456                      .setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
3457       }
3458 
3459       MI.removeFromParent();
3460 
3461       // NewVaddr is built directly from the pointer extracted from the resource descriptor.
3462       BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
3463               NewVAddr)
3464           .addReg(SRsrcPtr, 0, AMDGPU::sub0)
3465           .addImm(AMDGPU::sub0)
3466           .addReg(SRsrcPtr, 0, AMDGPU::sub1)
3467           .addImm(AMDGPU::sub1);
3468 
3469       VAddr = getNamedOperand(*Addr64, AMDGPU::OpName::vaddr);
3470       SRsrc = getNamedOperand(*Addr64, AMDGPU::OpName::srsrc);
3471     }
3472 
3473     // Update the instruction to use NewVaddr
3474     VAddr->setReg(NewVAddr);
3475     // Update the instruction to use NewSRsrc
3476     SRsrc->setReg(NewSRsrc);
3477   }
3478 }
3479 
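// Rewrite TopInst as an equivalent VALU instruction and keep propagating the
// change through a worklist of users that cannot read the newly created VGPR
// results.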
3480 void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
3481   SetVectorType Worklist;
3482   Worklist.insert(&TopInst);
3483 
3484   while (!Worklist.empty()) {
3485     MachineInstr &Inst = *Worklist.pop_back_val();
3486     MachineBasicBlock *MBB = Inst.getParent();
3487     MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
3488 
3489     unsigned Opcode = Inst.getOpcode();
3490     unsigned NewOpcode = getVALUOp(Inst);
3491 
3492     // Handle some special cases
3493     switch (Opcode) {
3494     default:
3495       break;
3496     case AMDGPU::S_AND_B64:
3497       splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64);
3498       Inst.eraseFromParent();
3499       continue;
3500 
3501     case AMDGPU::S_OR_B64:
3502       splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_OR_B32_e64);
3503       Inst.eraseFromParent();
3504       continue;
3505 
3506     case AMDGPU::S_XOR_B64:
3507       splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64);
3508       Inst.eraseFromParent();
3509       continue;
3510 
3511     case AMDGPU::S_NOT_B64:
3512       splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::V_NOT_B32_e32);
3513       Inst.eraseFromParent();
3514       continue;
3515 
3516     case AMDGPU::S_BCNT1_I32_B64:
3517       splitScalar64BitBCNT(Worklist, Inst);
3518       Inst.eraseFromParent();
3519       continue;
3520 
3521     case AMDGPU::S_BFE_I64:
3522       splitScalar64BitBFE(Worklist, Inst);
3523       Inst.eraseFromParent();
3524       continue;
3525 
3526     case AMDGPU::S_LSHL_B32:
3527       if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
3528         NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
3529         swapOperands(Inst);
3530       }
3531       break;
3532     case AMDGPU::S_ASHR_I32:
3533       if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
3534         NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
3535         swapOperands(Inst);
3536       }
3537       break;
3538     case AMDGPU::S_LSHR_B32:
3539       if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
3540         NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
3541         swapOperands(Inst);
3542       }
3543       break;
3544     case AMDGPU::S_LSHL_B64:
3545       if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
3546         NewOpcode = AMDGPU::V_LSHLREV_B64;
3547         swapOperands(Inst);
3548       }
3549       break;
3550     case AMDGPU::S_ASHR_I64:
3551       if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
3552         NewOpcode = AMDGPU::V_ASHRREV_I64;
3553         swapOperands(Inst);
3554       }
3555       break;
3556     case AMDGPU::S_LSHR_B64:
3557       if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
3558         NewOpcode = AMDGPU::V_LSHRREV_B64;
3559         swapOperands(Inst);
3560       }
3561       break;
3562 
3563     case AMDGPU::S_ABS_I32:
3564       lowerScalarAbs(Worklist, Inst);
3565       Inst.eraseFromParent();
3566       continue;
3567 
3568     case AMDGPU::S_CBRANCH_SCC0:
3569     case AMDGPU::S_CBRANCH_SCC1:
3570       // Clear unused bits of vcc
3571       BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64),
3572               AMDGPU::VCC)
3573           .addReg(AMDGPU::EXEC)
3574           .addReg(AMDGPU::VCC);
3575       break;
3576 
3577     case AMDGPU::S_BFE_U64:
3578     case AMDGPU::S_BFM_B64:
3579       llvm_unreachable("Moving this op to VALU not implemented");
3580 
3581     case AMDGPU::S_PACK_LL_B32_B16:
3582     case AMDGPU::S_PACK_LH_B32_B16:
3583     case AMDGPU::S_PACK_HH_B32_B16:
3584       movePackToVALU(Worklist, MRI, Inst);
3585       Inst.eraseFromParent();
3586       continue;
3587     }
3588 
3589     if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
3590       // We cannot move this instruction to the VALU, so we should try to
3591       // legalize its operands instead.
3592       legalizeOperands(Inst);
3593       continue;
3594     }
3595 
3596     // Use the new VALU Opcode.
3597     const MCInstrDesc &NewDesc = get(NewOpcode);
3598     Inst.setDesc(NewDesc);
3599 
3600     // Remove any references to SCC. Vector instructions can't read from it, and
3601     // we're just about to add the implicit use / defs of VCC, and we don't want
3602     // both.
3603     for (unsigned i = Inst.getNumOperands() - 1; i > 0; --i) {
3604       MachineOperand &Op = Inst.getOperand(i);
3605       if (Op.isReg() && Op.getReg() == AMDGPU::SCC) {
3606         Inst.RemoveOperand(i);
3607         addSCCDefUsersToVALUWorklist(Inst, Worklist);
3608       }
3609     }
3610 
3611     if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
3612       // We are converting these to a BFE, so we need to add the missing
3613       // operands for the size and offset.
3614       unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
3615       Inst.addOperand(MachineOperand::CreateImm(0));
3616       Inst.addOperand(MachineOperand::CreateImm(Size));
3617 
3618     } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
3619       // The VALU version adds the second operand to the result, so insert an
3620       // extra 0 operand.
3621       Inst.addOperand(MachineOperand::CreateImm(0));
3622     }
3623 
3624     Inst.addImplicitDefUseOperands(*Inst.getParent()->getParent());
3625 
3626     if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
3627       const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
3628       // If we need to move this to VGPRs, we need to unpack the second operand
3629       // back into the 2 separate ones for bit offset and width.
3630       assert(OffsetWidthOp.isImm() &&
3631              "Scalar BFE is only implemented for constant width and offset");
3632       uint32_t Imm = OffsetWidthOp.getImm();
3633 
3634       uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
3635       uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
3636       Inst.RemoveOperand(2);                     // Remove old immediate.
3637       Inst.addOperand(MachineOperand::CreateImm(Offset));
3638       Inst.addOperand(MachineOperand::CreateImm(BitWidth));
3639     }
3640 
3641     bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef();
3642     unsigned NewDstReg = AMDGPU::NoRegister;
3643     if (HasDst) {
3644       unsigned DstReg = Inst.getOperand(0).getReg();
3645       if (TargetRegisterInfo::isPhysicalRegister(DstReg))
3646         continue;
3647 
3648       // Update the destination register class.
3649       const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
3650       if (!NewDstRC)
3651         continue;
3652 
3653       if (Inst.isCopy() &&
3654           TargetRegisterInfo::isVirtualRegister(Inst.getOperand(1).getReg()) &&
3655           NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
3656         // Instead of creating a copy where src and dst are the same register
3657         // class, we just replace all uses of dst with src.  These kinds of
3658         // copies interfere with the heuristics MachineSink uses to decide
3659         // whether or not to split a critical edge, since the pass assumes
3660         // that copies will end up as machine instructions and not be
3661         // eliminated.
3662         addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
3663         MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
3664         MRI.clearKillFlags(Inst.getOperand(1).getReg());
3665         Inst.getOperand(0).setReg(DstReg);
3666         continue;
3667       }
3668 
3669       NewDstReg = MRI.createVirtualRegister(NewDstRC);
3670       MRI.replaceRegWith(DstReg, NewDstReg);
3671     }
3672 
3673     // Legalize the operands
3674     legalizeOperands(Inst);
3675 
3676     if (HasDst)
3677      addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
3678   }
3679 }
3680 
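// Expand S_ABS_I32 on the VALU as abs(x) = max(x, 0 - x), roughly:
//   v_sub_i32_e32 tmp, 0, src
//   v_max_i32_e64 dst, src, tmp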
3681 void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist,
3682                                  MachineInstr &Inst) const {
3683   MachineBasicBlock &MBB = *Inst.getParent();
3684   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3685   MachineBasicBlock::iterator MII = Inst;
3686   DebugLoc DL = Inst.getDebugLoc();
3687 
3688   MachineOperand &Dest = Inst.getOperand(0);
3689   MachineOperand &Src = Inst.getOperand(1);
3690   unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3691   unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3692 
3693   BuildMI(MBB, MII, DL, get(AMDGPU::V_SUB_I32_e32), TmpReg)
3694     .addImm(0)
3695     .addReg(Src.getReg());
3696 
3697   BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
3698     .addReg(Src.getReg())
3699     .addReg(TmpReg);
3700 
3701   MRI.replaceRegWith(Dest.getReg(), ResultReg);
3702   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
3703 }
3704 
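// Split a 64-bit scalar unary operation (e.g. S_NOT_B64) into two 32-bit VALU
// operations of the given Opcode on the sub0 and sub1 halves, recombining the
// results with a REG_SEQUENCE.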
3705 void SIInstrInfo::splitScalar64BitUnaryOp(
3706     SetVectorType &Worklist, MachineInstr &Inst,
3707     unsigned Opcode) const {
3708   MachineBasicBlock &MBB = *Inst.getParent();
3709   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3710 
3711   MachineOperand &Dest = Inst.getOperand(0);
3712   MachineOperand &Src0 = Inst.getOperand(1);
3713   DebugLoc DL = Inst.getDebugLoc();
3714 
3715   MachineBasicBlock::iterator MII = Inst;
3716 
3717   const MCInstrDesc &InstDesc = get(Opcode);
3718   const TargetRegisterClass *Src0RC = Src0.isReg() ?
3719     MRI.getRegClass(Src0.getReg()) :
3720     &AMDGPU::SGPR_32RegClass;
3721 
3722   const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
3723 
3724   MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
3725                                                        AMDGPU::sub0, Src0SubRC);
3726 
3727   const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
3728   const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
3729   const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
3730 
3731   unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
3732   BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
3733 
3734   MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
3735                                                        AMDGPU::sub1, Src0SubRC);
3736 
3737   unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
3738   BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
3739 
3740   unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
3741   BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
3742     .addReg(DestSub0)
3743     .addImm(AMDGPU::sub0)
3744     .addReg(DestSub1)
3745     .addImm(AMDGPU::sub1);
3746 
3747   MRI.replaceRegWith(Dest.getReg(), FullDestReg);
3748 
3749   // We don't need to legalizeOperands here because for a single operand, src0
3750   // will support any kind of input.
3751 
3752   // Move all users of this moved value.
3753   addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
3754 }
3755 
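// Split a 64-bit scalar binary operation into two 32-bit VALU operations of
// the given Opcode on the sub0 and sub1 halves of the sources, recombining the
// results with a REG_SEQUENCE.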
3756 void SIInstrInfo::splitScalar64BitBinaryOp(
3757     SetVectorType &Worklist, MachineInstr &Inst,
3758     unsigned Opcode) const {
3759   MachineBasicBlock &MBB = *Inst.getParent();
3760   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3761 
3762   MachineOperand &Dest = Inst.getOperand(0);
3763   MachineOperand &Src0 = Inst.getOperand(1);
3764   MachineOperand &Src1 = Inst.getOperand(2);
3765   DebugLoc DL = Inst.getDebugLoc();
3766 
3767   MachineBasicBlock::iterator MII = Inst;
3768 
3769   const MCInstrDesc &InstDesc = get(Opcode);
3770   const TargetRegisterClass *Src0RC = Src0.isReg() ?
3771     MRI.getRegClass(Src0.getReg()) :
3772     &AMDGPU::SGPR_32RegClass;
3773 
3774   const TargetRegisterClass *Src0SubRC = RI.getSubRegClass(Src0RC, AMDGPU::sub0);
3775   const TargetRegisterClass *Src1RC = Src1.isReg() ?
3776     MRI.getRegClass(Src1.getReg()) :
3777     &AMDGPU::SGPR_32RegClass;
3778 
3779   const TargetRegisterClass *Src1SubRC = RI.getSubRegClass(Src1RC, AMDGPU::sub0);
3780 
3781   MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
3782                                                        AMDGPU::sub0, Src0SubRC);
3783   MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
3784                                                        AMDGPU::sub0, Src1SubRC);
3785 
3786   const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
3787   const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
3788   const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
3789 
3790   unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
3791   MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
3792                               .add(SrcReg0Sub0)
3793                               .add(SrcReg1Sub0);
3794 
3795   MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
3796                                                        AMDGPU::sub1, Src0SubRC);
3797   MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
3798                                                        AMDGPU::sub1, Src1SubRC);
3799 
3800   unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
3801   MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
3802                               .add(SrcReg0Sub1)
3803                               .add(SrcReg1Sub1);
3804 
3805   unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
3806   BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
3807     .addReg(DestSub0)
3808     .addImm(AMDGPU::sub0)
3809     .addReg(DestSub1)
3810     .addImm(AMDGPU::sub1);
3811 
3812   MRI.replaceRegWith(Dest.getReg(), FullDestReg);
3813 
3814   // Try to legalize the operands in case we need to swap the order to keep it
3815   // valid.
3816   legalizeOperands(LoHalf);
3817   legalizeOperands(HiHalf);
3818 
3819   // Move all users of this moved value.
3820   addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
3821 }
3822 
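// Expand S_BCNT1_I32_B64 into two V_BCNT_U32_B32 instructions: count the low
// half into a temporary, then count the high half while adding the temporary
// through the second (accumulator) operand.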
3823 void SIInstrInfo::splitScalar64BitBCNT(
3824     SetVectorType &Worklist, MachineInstr &Inst) const {
3825   MachineBasicBlock &MBB = *Inst.getParent();
3826   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3827 
3828   MachineBasicBlock::iterator MII = Inst;
3829   DebugLoc DL = Inst.getDebugLoc();
3830 
3831   MachineOperand &Dest = Inst.getOperand(0);
3832   MachineOperand &Src = Inst.getOperand(1);
3833 
3834   const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
3835   const TargetRegisterClass *SrcRC = Src.isReg() ?
3836     MRI.getRegClass(Src.getReg()) :
3837     &AMDGPU::SGPR_32RegClass;
3838 
3839   unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3840   unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3841 
3842   const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0);
3843 
3844   MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
3845                                                       AMDGPU::sub0, SrcSubRC);
3846   MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
3847                                                       AMDGPU::sub1, SrcSubRC);
3848 
3849   BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
3850 
3851   BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
3852 
3853   MRI.replaceRegWith(Dest.getReg(), ResultReg);
3854 
3855   // We don't need to legalize operands here. src0 for either instruction can be
3856   // an SGPR, and the second input is unused or determined here.
3857   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
3858 }
3859 
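// Expand S_BFE_I64 used as a 64-bit sign_extend_inreg: sign-extend the low
// half with V_BFE_I32 (or reuse it unchanged when the width is 32) and fill
// the high half with the sign bit via V_ASHRREV_I32.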
3860 void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist,
3861                                       MachineInstr &Inst) const {
3862   MachineBasicBlock &MBB = *Inst.getParent();
3863   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3864   MachineBasicBlock::iterator MII = Inst;
3865   DebugLoc DL = Inst.getDebugLoc();
3866 
3867   MachineOperand &Dest = Inst.getOperand(0);
3868   uint32_t Imm = Inst.getOperand(2).getImm();
3869   uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
3870   uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
3871 
3872   (void) Offset;
3873 
3874   // Only sext_inreg cases handled.
3875   assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
3876          Offset == 0 && "Not implemented");
3877 
3878   if (BitWidth < 32) {
3879     unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3880     unsigned MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3881     unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
3882 
3883     BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo)
3884         .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
3885         .addImm(0)
3886         .addImm(BitWidth);
3887 
3888     BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
3889       .addImm(31)
3890       .addReg(MidRegLo);
3891 
3892     BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
3893       .addReg(MidRegLo)
3894       .addImm(AMDGPU::sub0)
3895       .addReg(MidRegHi)
3896       .addImm(AMDGPU::sub1);
3897 
3898     MRI.replaceRegWith(Dest.getReg(), ResultReg);
3899     addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
3900     return;
3901   }
3902 
3903   MachineOperand &Src = Inst.getOperand(1);
3904   unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3905   unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
3906 
3907   BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
3908     .addImm(31)
3909     .addReg(Src.getReg(), 0, AMDGPU::sub0);
3910 
3911   BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
3912     .addReg(Src.getReg(), 0, AMDGPU::sub0)
3913     .addImm(AMDGPU::sub0)
3914     .addReg(TmpReg)
3915     .addImm(AMDGPU::sub1);
3916 
3917   MRI.replaceRegWith(Dest.getReg(), ResultReg);
3918   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
3919 }
3920 
3921 void SIInstrInfo::addUsersToMoveToVALUWorklist(
3922   unsigned DstReg,
3923   MachineRegisterInfo &MRI,
3924   SetVectorType &Worklist) const {
3925   for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
3926          E = MRI.use_end(); I != E;) {
3927     MachineInstr &UseMI = *I->getParent();
3928     if (!canReadVGPR(UseMI, I.getOperandNo())) {
3929       Worklist.insert(&UseMI);
3930 
3931       do {
3932         ++I;
3933       } while (I != E && I->getParent() == &UseMI);
3934     } else {
3935       ++I;
3936     }
3937   }
3938 }
3939 
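// Expand an S_PACK_*_B32_B16 instruction into an equivalent sequence of VALU
// bit operations producing the packed 32-bit result in a VGPR.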
3940 void SIInstrInfo::movePackToVALU(SetVectorType &Worklist,
3941                                  MachineRegisterInfo &MRI,
3942                                  MachineInstr &Inst) const {
3943   unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3944   MachineBasicBlock *MBB = Inst.getParent();
3945   MachineOperand &Src0 = Inst.getOperand(1);
3946   MachineOperand &Src1 = Inst.getOperand(2);
3947   const DebugLoc &DL = Inst.getDebugLoc();
3948 
3949   switch (Inst.getOpcode()) {
3950   case AMDGPU::S_PACK_LL_B32_B16: {
3951     unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3952     unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3953 
3954     // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
3955     // 0.
3956     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
3957       .addImm(0xffff);
3958 
3959     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
3960       .addReg(ImmReg, RegState::Kill)
3961       .add(Src0);
3962 
3963     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32), ResultReg)
3964       .add(Src1)
3965       .addImm(16)
3966       .addReg(TmpReg, RegState::Kill);
3967     break;
3968   }
3969   case AMDGPU::S_PACK_LH_B32_B16: {
3970     unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3971     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
3972       .addImm(0xffff);
3973     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32), ResultReg)
3974       .addReg(ImmReg, RegState::Kill)
3975       .add(Src0)
3976       .add(Src1);
3977     break;
3978   }
3979   case AMDGPU::S_PACK_HH_B32_B16: {
3980     unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3981     unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3982     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
3983       .addImm(16)
3984       .add(Src0);
3985     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
3986       .addImm(0xffff0000);
3987     BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32), ResultReg)
3988       .add(Src1)
3989       .addReg(ImmReg, RegState::Kill)
3990       .addReg(TmpReg, RegState::Kill);
3991     break;
3992   }
3993   default:
3994     llvm_unreachable("unhandled s_pack_* instruction");
3995   }
3996 
3997   MachineOperand &Dest = Inst.getOperand(0);
3998   MRI.replaceRegWith(Dest.getReg(), ResultReg);
3999   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
4000 }
4001 
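// Add the instructions that read SCC between SCCDefInst and the next SCC
// definition in the same block to the move-to-VALU worklist.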
4002 void SIInstrInfo::addSCCDefUsersToVALUWorklist(
4003     MachineInstr &SCCDefInst, SetVectorType &Worklist) const {
4004   // This assumes that all the users of SCC are in the same block
4005   // as the SCC def.
4006   for (MachineInstr &MI :
4007        make_range(MachineBasicBlock::iterator(SCCDefInst),
4008                       SCCDefInst.getParent()->end())) {
4009     // Exit if we find another SCC def.
4010     if (MI.findRegisterDefOperandIdx(AMDGPU::SCC) != -1)
4011       return;
4012 
4013     if (MI.findRegisterUseOperandIdx(AMDGPU::SCC) != -1)
4014       Worklist.insert(&MI);
4015   }
4016 }
4017 
4018 const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
4019   const MachineInstr &Inst) const {
4020   const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
4021 
4022   switch (Inst.getOpcode()) {
4023   // For target instructions, getOpRegClass just returns the virtual register
4024   // class associated with the operand, so we need to find an equivalent VGPR
4025   // register class in order to move the instruction to the VALU.
4026   case AMDGPU::COPY:
4027   case AMDGPU::PHI:
4028   case AMDGPU::REG_SEQUENCE:
4029   case AMDGPU::INSERT_SUBREG:
4030   case AMDGPU::WQM:
4031   case AMDGPU::WWM:
4032     if (RI.hasVGPRs(NewDstRC))
4033       return nullptr;
4034 
4035     NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
4036     if (!NewDstRC)
4037       return nullptr;
4038     return NewDstRC;
4039   default:
4040     return NewDstRC;
4041   }
4042 }
4043 
4044 // Find the one SGPR operand we are allowed to use.
4045 unsigned SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
4046                                    int OpIndices[3]) const {
4047   const MCInstrDesc &Desc = MI.getDesc();
4048 
4049   // Find the one SGPR operand we are allowed to use.
4050   //
4051   // First we need to consider the instruction's operand requirements before
4052   // legalizing. Some operands are required to be SGPRs, such as implicit uses
4053   // of VCC, but we are still bound by the constant bus requirement to only use
4054   // one.
4055   //
4056   // If the operand's class is an SGPR, we can never move it.
4057 
4058   unsigned SGPRReg = findImplicitSGPRRead(MI);
4059   if (SGPRReg != AMDGPU::NoRegister)
4060     return SGPRReg;
4061 
4062   unsigned UsedSGPRs[3] = { AMDGPU::NoRegister };
4063   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
4064 
4065   for (unsigned i = 0; i < 3; ++i) {
4066     int Idx = OpIndices[i];
4067     if (Idx == -1)
4068       break;
4069 
4070     const MachineOperand &MO = MI.getOperand(Idx);
4071     if (!MO.isReg())
4072       continue;
4073 
4074     // Is this operand statically required to be an SGPR based on the operand
4075     // constraints?
4076     const TargetRegisterClass *OpRC = RI.getRegClass(Desc.OpInfo[Idx].RegClass);
4077     bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
4078     if (IsRequiredSGPR)
4079       return MO.getReg();
4080 
4081     // If this could be a VGPR or an SGPR, check the dynamic register class.
4082     unsigned Reg = MO.getReg();
4083     const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
4084     if (RI.isSGPRClass(RegRC))
4085       UsedSGPRs[i] = Reg;
4086   }
4087 
4088   // We don't have a required SGPR operand, so we have a bit more freedom in
4089   // selecting operands to move.
4090 
4091   // Try to select the most used SGPR. If an SGPR is equal to one of the
4092   // others, we choose that.
4093   //
4094   // e.g.
4095   // V_FMA_F32 v0, s0, s0, s0 -> No moves
4096   // V_FMA_F32 v0, s0, s1, s0 -> Move s1
4097 
4098   // TODO: If some of the operands are 64-bit SGPRs and some are 32-bit, we
4099   // should prefer those.
4100 
4101   if (UsedSGPRs[0] != AMDGPU::NoRegister) {
4102     if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
4103       SGPRReg = UsedSGPRs[0];
4104   }
4105 
4106   if (SGPRReg == AMDGPU::NoRegister && UsedSGPRs[1] != AMDGPU::NoRegister) {
4107     if (UsedSGPRs[1] == UsedSGPRs[2])
4108       SGPRReg = UsedSGPRs[1];
4109   }
4110 
4111   return SGPRReg;
4112 }
4113 
4114 MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
4115                                              unsigned OperandName) const {
4116   int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
4117   if (Idx == -1)
4118     return nullptr;
4119 
4120   return &MI.getOperand(Idx);
4121 }
4122 
4123 uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
4124   uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
4125   if (ST.isAmdHsaOS()) {
4126     // Set ATC = 1. GFX9 doesn't have this bit.
4127     if (ST.getGeneration() <= SISubtarget::VOLCANIC_ISLANDS)
4128       RsrcDataFormat |= (1ULL << 56);
4129 
4130     // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
4131     // BTW, it disables TC L2 and therefore decreases performance.
4132     if (ST.getGeneration() == SISubtarget::VOLCANIC_ISLANDS)
4133       RsrcDataFormat |= (2ULL << 59);
4134   }
4135 
4136   return RsrcDataFormat;
4137 }
4138 
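// Return the default value for words 2-3 of a scratch buffer resource
// descriptor: size = 0xffffffff and TID_ENABLE with an index stride of 64,
// plus the default data format / element size on targets that have those
// fields.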
4139 uint64_t SIInstrInfo::getScratchRsrcWords23() const {
4140   uint64_t Rsrc23 = getDefaultRsrcDataFormat() |
4141                     AMDGPU::RSRC_TID_ENABLE |
4142                     0xffffffff; // Size;
4143 
4144   // GFX9 doesn't have ELEMENT_SIZE.
4145   if (ST.getGeneration() <= SISubtarget::VOLCANIC_ISLANDS) {
4146     uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1;
4147     Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
4148   }
4149 
4150   // IndexStride = 64.
4151   Rsrc23 |= UINT64_C(3) << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
4152 
4153   // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
4154   // Clear them unless we want a huge stride.
4155   if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
4156     Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
4157 
4158   return Rsrc23;
4159 }
4160 
4161 bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
4162   unsigned Opc = MI.getOpcode();
4163 
4164   return isSMRD(Opc);
4165 }
4166 
4167 bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr &MI) const {
4168   unsigned Opc = MI.getOpcode();
4169 
4170   return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc);
4171 }
4172 
4173 unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI,
4174                                     int &FrameIndex) const {
4175   const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
4176   if (!Addr || !Addr->isFI())
4177     return AMDGPU::NoRegister;
4178 
4179   assert(!MI.memoperands_empty() &&
4180          (*MI.memoperands_begin())->getAddrSpace() == AMDGPUASI.PRIVATE_ADDRESS);
4181 
4182   FrameIndex = Addr->getIndex();
4183   return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
4184 }
4185 
4186 unsigned SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
4187                                         int &FrameIndex) const {
4188   const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
4189   assert(Addr && Addr->isFI());
4190   FrameIndex = Addr->getIndex();
4191   return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
4192 }
4193 
4194 unsigned SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
4195                                           int &FrameIndex) const {
4196   if (!MI.mayLoad())
4197     return AMDGPU::NoRegister;
4198 
4199   if (isMUBUF(MI) || isVGPRSpill(MI))
4200     return isStackAccess(MI, FrameIndex);
4201 
4202   if (isSGPRSpill(MI))
4203     return isSGPRStackAccess(MI, FrameIndex);
4204 
4205   return AMDGPU::NoRegister;
4206 }
4207 
4208 unsigned SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
4209                                          int &FrameIndex) const {
4210   if (!MI.mayStore())
4211     return AMDGPU::NoRegister;
4212 
4213   if (isMUBUF(MI) || isVGPRSpill(MI))
4214     return isStackAccess(MI, FrameIndex);
4215 
4216   if (isSGPRSpill(MI))
4217     return isSGPRStackAccess(MI, FrameIndex);
4218 
4219   return AMDGPU::NoRegister;
4220 }
4221 
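// Return the encoded size of MI in bytes, accounting for a possible trailing
// 32-bit literal on VALU/SALU instructions whose descriptor reports only the
// 4-byte base encoding.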
4222 unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
4223   unsigned Opc = MI.getOpcode();
4224   const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc);
4225   unsigned DescSize = Desc.getSize();
4226 
4227   // If we have a definitive size, we can use it. Otherwise we need to inspect
4228   // the operands to know the size.
4229   //
4230   // FIXME: Instructions that have a base 32-bit encoding report their size as
4231   // 4, even though they are really 8 bytes if they have a literal operand.
4232   if (DescSize != 0 && DescSize != 4)
4233     return DescSize;
4234 
4235   // 4-byte instructions may have a 32-bit literal encoded after them. Check
4236   // operands that could ever be literals.
4237   if (isVALU(MI) || isSALU(MI)) {
4238     if (isFixedSize(MI))
4239       return DescSize;
4240 
4241     int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
4242     if (Src0Idx == -1)
4243       return 4; // No operands.
4244 
4245     if (isLiteralConstantLike(MI.getOperand(Src0Idx), Desc.OpInfo[Src0Idx]))
4246       return 8;
4247 
4248     int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
4249     if (Src1Idx == -1)
4250       return 4;
4251 
4252     if (isLiteralConstantLike(MI.getOperand(Src1Idx), Desc.OpInfo[Src1Idx]))
4253       return 8;
4254 
4255     return 4;
4256   }
4257 
4258   if (DescSize == 4)
4259     return 4;
4260 
4261   switch (Opc) {
4262   case TargetOpcode::IMPLICIT_DEF:
4263   case TargetOpcode::KILL:
4264   case TargetOpcode::DBG_VALUE:
4265   case TargetOpcode::BUNDLE:
4266   case TargetOpcode::EH_LABEL:
4267     return 0;
4268   case TargetOpcode::INLINEASM: {
4269     const MachineFunction *MF = MI.getParent()->getParent();
4270     const char *AsmStr = MI.getOperand(0).getSymbolName();
4271     return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
4272   }
4273   default:
4274     llvm_unreachable("unable to find instruction size");
4275   }
4276 }
4277 
4278 bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {
4279   if (!isFLAT(MI))
4280     return false;
4281 
4282   if (MI.memoperands_empty())
4283     return true;
4284 
4285   for (const MachineMemOperand *MMO : MI.memoperands()) {
4286     if (MMO->getAddrSpace() == AMDGPUASI.FLAT_ADDRESS)
4287       return true;
4288   }
4289   return false;
4290 }
4291 
4292 bool SIInstrInfo::isNonUniformBranchInstr(MachineInstr &Branch) const {
4293   return Branch.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO;
4294 }
4295 
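// Rewrite the SI_NON_UNIFORM_BRCOND_PSEUDO terminating IfEntry into an SI_IF /
// SI_END_CF pair (the SI_END_CF is placed at the start of IfEnd) so the region
// uses the structured control-flow pseudos.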
4296 void SIInstrInfo::convertNonUniformIfRegion(MachineBasicBlock *IfEntry,
4297                                             MachineBasicBlock *IfEnd) const {
4298   MachineBasicBlock::iterator TI = IfEntry->getFirstTerminator();
4299   assert(TI != IfEntry->end());
4300 
4301   MachineInstr *Branch = &(*TI);
4302   MachineFunction *MF = IfEntry->getParent();
4303   MachineRegisterInfo &MRI = IfEntry->getParent()->getRegInfo();
4304 
4305   if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
4306     unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
4307     MachineInstr *SIIF =
4308         BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg)
4309             .add(Branch->getOperand(0))
4310             .add(Branch->getOperand(1));
4311     MachineInstr *SIEND =
4312         BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_END_CF))
4313             .addReg(DstReg);
4314 
4315     IfEntry->erase(TI);
4316     IfEntry->insert(IfEntry->end(), SIIF);
4317     IfEnd->insert(IfEnd->getFirstNonPHI(), SIEND);
4318   }
4319 }
4320 
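// Rewrite the SI_NON_UNIFORM_BRCOND_PSEUDO terminating LoopEnd into an
// SI_IF_BREAK / SI_LOOP pair, with a PHI in LoopEntry that accumulates the
// break mask across loop iterations.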
void SIInstrInfo::convertNonUniformLoopRegion(
    MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const {
  MachineBasicBlock::iterator TI = LoopEnd->getFirstTerminator();
  // We expect two terminators, one conditional and one unconditional; only
  // the conditional branch is rewritten here.
  assert(TI != LoopEnd->end());

  MachineInstr *Branch = &(*TI);
  MachineFunction *MF = LoopEnd->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
    unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    unsigned BackEdgeReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    MachineInstrBuilder HeaderPHIBuilder =
        BuildMI(*MF, Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg);
    for (MachineBasicBlock::pred_iterator PI = LoopEntry->pred_begin(),
                                          E = LoopEntry->pred_end();
         PI != E; ++PI) {
      if (*PI == LoopEnd) {
        HeaderPHIBuilder.addReg(BackEdgeReg);
      } else {
        MachineBasicBlock *PMBB = *PI;
        unsigned ZeroReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
        materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(),
                             ZeroReg, 0);
        HeaderPHIBuilder.addReg(ZeroReg);
      }
      HeaderPHIBuilder.addMBB(*PI);
    }
    MachineInstr *HeaderPhi = HeaderPHIBuilder;
    MachineInstr *SIIFBREAK = BuildMI(*MF, Branch->getDebugLoc(),
                                      get(AMDGPU::SI_IF_BREAK), BackEdgeReg)
                                  .addReg(DstReg)
                                  .add(Branch->getOperand(0));
    MachineInstr *SILOOP =
        BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_LOOP))
            .addReg(BackEdgeReg)
            .addMBB(LoopEntry);

    LoopEntry->insert(LoopEntry->begin(), HeaderPhi);
    LoopEnd->erase(TI);
    LoopEnd->insert(LoopEnd->end(), SIIFBREAK);
    LoopEnd->insert(LoopEnd->end(), SILOOP);
  }
}

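/// \brief Map each target-index operand kind to the name used to serialize it
/// in MIR.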
ArrayRef<std::pair<int, const char *>>
SIInstrInfo::getSerializableTargetIndices() const {
  static const std::pair<int, const char *> TargetIndices[] = {
      {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
      {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
      {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
      {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
      {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
  return makeArrayRef(TargetIndices);
}

/// This is used by the post-RA scheduler (PostRASchedulerList.cpp).  The
/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
ScheduleHazardRecognizer *
SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
                                                const ScheduleDAG *DAG) const {
  return new GCNHazardRecognizer(DAG->MF);
}

/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
/// pass.
ScheduleHazardRecognizer *
SIInstrInfo::CreateTargetPostRAHazardRecognizer(
    const MachineFunction &MF) const {
  return new GCNHazardRecognizer(MF);
}

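/// Split a target-flag word into its direct value (the MO_MASK portion, e.g.
/// MO_GOTPCREL or MO_REL32_LO) and any remaining bit flags, which is the
/// decomposition the MIR printer and parser expect from this hook.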
std::pair<unsigned, unsigned>
SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
  return std::make_pair(TF & MO_MASK, TF & ~MO_MASK);
}

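/// The names used to serialize the direct MO_* operand target flags in MIR.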
ArrayRef<std::pair<unsigned, const char *>>
SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
  static const std::pair<unsigned, const char *> TargetFlags[] = {
    { MO_GOTPCREL, "amdgpu-gotprel" },
    { MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" },
    { MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" },
    { MO_REL32_LO, "amdgpu-rel32-lo" },
    { MO_REL32_HI, "amdgpu-rel32-hi" }
  };

  return makeArrayRef(TargetFlags);
}

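/// \brief Return true if \p MI is part of the block prologue, i.e. a
/// non-terminator, non-COPY instruction that writes EXEC (such as the exec
/// mask setup produced by control-flow lowering). Code inserted at the "top"
/// of a block, e.g. spill reloads, must be placed after these instructions.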
bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const {
  return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
         MI.modifiesRegister(AMDGPU::EXEC, &RI);
}

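/// \brief Start building a 32-bit add whose carry-out is not needed: emits
/// V_ADD_I32_e64 defining \p DestReg plus a fresh SReg_64 carry register that
/// is immediately marked dead. Callers append the two source operands to the
/// returned builder, e.g. (hypothetical usage):
///
///   TII->getAddNoCarry(MBB, I, DL, DestReg)
///       .addReg(Src0Reg)
///       .addReg(Src1Reg);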
MachineInstrBuilder
SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator I,
                           const DebugLoc &DL,
                           unsigned DestReg) const {
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();

  unsigned UnusedCarry = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);

  return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg)
           .addReg(UnusedCarry, RegState::Define | RegState::Dead);
}
