1 //===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the InstructionSelector class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPUInstructionSelector.h"
15 #include "AMDGPUInstrInfo.h"
16 #include "AMDGPUGlobalISelUtils.h"
17 #include "AMDGPURegisterBankInfo.h"
18 #include "AMDGPUSubtarget.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
21 #include "SIMachineFunctionInfo.h"
22 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
23 #include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
24 #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
25 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
26 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
27 #include "llvm/CodeGen/GlobalISel/Utils.h"
28 #include "llvm/CodeGen/MachineBasicBlock.h"
29 #include "llvm/CodeGen/MachineFunction.h"
30 #include "llvm/CodeGen/MachineInstr.h"
31 #include "llvm/CodeGen/MachineInstrBuilder.h"
32 #include "llvm/CodeGen/MachineRegisterInfo.h"
33 #include "llvm/IR/Type.h"
34 #include "llvm/Support/Debug.h"
35 #include "llvm/Support/raw_ostream.h"
36 
37 #define DEBUG_TYPE "amdgpu-isel"
38 
39 using namespace llvm;
40 using namespace MIPatternMatch;
41 
42 static cl::opt<bool> AllowRiskySelect(
43   "amdgpu-global-isel-risky-select",
44   cl::desc("Allow GlobalISel to select cases that are likely to not work yet"),
45   cl::init(false),
46   cl::ReallyHidden);
47 
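// The TableGen-erated selector refers to the target as AMDGPUSubtarget; alias
// it to GCNSubtarget so the generated predicates query the GCN subtarget.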
48 #define GET_GLOBALISEL_IMPL
49 #define AMDGPUSubtarget GCNSubtarget
50 #include "AMDGPUGenGlobalISel.inc"
51 #undef GET_GLOBALISEL_IMPL
52 #undef AMDGPUSubtarget
53 
54 AMDGPUInstructionSelector::AMDGPUInstructionSelector(
55     const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
56     const AMDGPUTargetMachine &TM)
57     : InstructionSelector(), TII(*STI.getInstrInfo()),
58       TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
59       STI(STI),
60       EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG),
61 #define GET_GLOBALISEL_PREDICATES_INIT
62 #include "AMDGPUGenGlobalISel.inc"
63 #undef GET_GLOBALISEL_PREDICATES_INIT
64 #define GET_GLOBALISEL_TEMPORARIES_INIT
65 #include "AMDGPUGenGlobalISel.inc"
66 #undef GET_GLOBALISEL_TEMPORARIES_INIT
67 {
68 }
69 
70 const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
71 
72 void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits &KB,
73                                         CodeGenCoverage &CoverageInfo) {
74   MRI = &MF.getRegInfo();
75   InstructionSelector::setupMF(MF, KB, CoverageInfo);
76 }
77 
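// Return true if Reg holds a wave-wide condition mask: either it is assigned
// to the VCC register bank, or it is an s1 value already constrained to the
// wave mask (boolean) register class.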
78 bool AMDGPUInstructionSelector::isVCC(Register Reg,
79                                       const MachineRegisterInfo &MRI) const {
80   // The verifier is oblivious to s1 being a valid value for wavesize registers.
81   if (Reg.isPhysical())
82     return false;
83 
84   auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
85   const TargetRegisterClass *RC =
86       RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
87   if (RC) {
88     const LLT Ty = MRI.getType(Reg);
89     return RC->hasSuperClassEq(TRI.getBoolRC()) &&
90            Ty.isValid() && Ty.getSizeInBits() == 1;
91   }
92 
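  // No register class is assigned yet, so decide based on the register bank.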
93   const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
94   return RB->getID() == AMDGPU::VCCRegBankID;
95 }
96 
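// Rewrite a copy-like intrinsic (wqm, softwqm, wwm) in place to NewOpc: drop
// the intrinsic ID, add an implicit EXEC use, and constrain the source and
// destination to a common register class.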
97 bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
98                                                         unsigned NewOpc) const {
99   MI.setDesc(TII.get(NewOpc));
100   MI.RemoveOperand(1); // Remove intrinsic ID.
101   MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
102 
103   MachineOperand &Dst = MI.getOperand(0);
104   MachineOperand &Src = MI.getOperand(1);
105 
106   // TODO: This should be legalized to s32 if needed
107   if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
108     return false;
109 
110   const TargetRegisterClass *DstRC
111     = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
112   const TargetRegisterClass *SrcRC
113     = TRI.getConstrainedRegClassForOperand(Src, *MRI);
114   if (!DstRC || DstRC != SrcRC)
115     return false;
116 
117   return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
118          RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
119 }
120 
121 bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
122   const DebugLoc &DL = I.getDebugLoc();
123   MachineBasicBlock *BB = I.getParent();
124   I.setDesc(TII.get(TargetOpcode::COPY));
125 
126   const MachineOperand &Src = I.getOperand(1);
127   MachineOperand &Dst = I.getOperand(0);
128   Register DstReg = Dst.getReg();
129   Register SrcReg = Src.getReg();
130 
131   if (isVCC(DstReg, *MRI)) {
132     if (SrcReg == AMDGPU::SCC) {
133       const TargetRegisterClass *RC
134         = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
135       if (!RC)
136         return true;
137       return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
138     }
139 
140     if (!isVCC(SrcReg, *MRI)) {
141       // TODO: Should probably leave the copy and let copyPhysReg expand it.
142       if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
143         return false;
144 
145       const TargetRegisterClass *SrcRC
146         = TRI.getConstrainedRegClassForOperand(Src, *MRI);
147 
148       Register MaskedReg = MRI->createVirtualRegister(SrcRC);
149 
150       // We can't trust the high bits at this point, so clear them.
151 
152       // TODO: Skip masking high bits if def is known boolean.
153 
154       unsigned AndOpc = TRI.isSGPRClass(SrcRC) ?
155         AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
156       BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
157         .addImm(1)
158         .addReg(SrcReg);
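      // Compare the masked value against zero to produce the wave-wide
      // boolean in the VCC-bank destination.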
159       BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
160         .addImm(0)
161         .addReg(MaskedReg);
162 
163       if (!MRI->getRegClassOrNull(SrcReg))
164         MRI->setRegClass(SrcReg, SrcRC);
165       I.eraseFromParent();
166       return true;
167     }
168 
169     const TargetRegisterClass *RC =
170       TRI.getConstrainedRegClassForOperand(Dst, *MRI);
171     if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
172       return false;
173 
174     return true;
175   }
176 
177   for (const MachineOperand &MO : I.operands()) {
178     if (Register::isPhysicalRegister(MO.getReg()))
179       continue;
180 
181     const TargetRegisterClass *RC =
182             TRI.getConstrainedRegClassForOperand(MO, *MRI);
183     if (!RC)
184       continue;
185     RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
186   }
187   return true;
188 }
189 
190 bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
191   const Register DefReg = I.getOperand(0).getReg();
192   const LLT DefTy = MRI->getType(DefReg);
193   if (DefTy == LLT::scalar(1)) {
194     if (!AllowRiskySelect) {
195       LLVM_DEBUG(dbgs() << "Skipping risky boolean phi\n");
196       return false;
197     }
198 
199     LLVM_DEBUG(dbgs() << "Selecting risky boolean phi\n");
200   }
201 
202   // TODO: Verify this doesn't have invalid operands (e.g. a VGPR to SGPR copy)
203 
204   const RegClassOrRegBank &RegClassOrBank =
205     MRI->getRegClassOrRegBank(DefReg);
206 
207   const TargetRegisterClass *DefRC
208     = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
209   if (!DefRC) {
210     if (!DefTy.isValid()) {
211       LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
212       return false;
213     }
214 
215     const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
216     DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, *MRI);
217     if (!DefRC) {
218       LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
219       return false;
220     }
221   }
222 
223   // TODO: Verify that all registers have the same bank
224   I.setDesc(TII.get(TargetOpcode::PHI));
225   return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
226 }
227 
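// Return a 32-bit operand for the sub0/sub1 half (selected by SubIdx) of a
// 64-bit register or immediate operand. For the register case this emits a
// COPY of the subregister into a fresh virtual register of SubRC.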
228 MachineOperand
229 AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
230                                            const TargetRegisterClass &SubRC,
231                                            unsigned SubIdx) const {
232 
233   MachineInstr *MI = MO.getParent();
234   MachineBasicBlock *BB = MO.getParent()->getParent();
235   Register DstReg = MRI->createVirtualRegister(&SubRC);
236 
237   if (MO.isReg()) {
238     unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
239     Register Reg = MO.getReg();
240     BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
241             .addReg(Reg, 0, ComposedSubIdx);
242 
243     return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
244                                      MO.isKill(), MO.isDead(), MO.isUndef(),
245                                      MO.isEarlyClobber(), 0, MO.isDebug(),
246                                      MO.isInternalRead());
247   }
248 
249   assert(MO.isImm());
250 
251   APInt Imm(64, MO.getImm());
252 
253   switch (SubIdx) {
254   default:
255     llvm_unreachable("don't know how to split immediate with this sub index.");
256   case AMDGPU::sub0:
257     return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
258   case AMDGPU::sub1:
259     return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
260   }
261 }
262 
263 static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
264   switch (Opc) {
265   case AMDGPU::G_AND:
266     return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
267   case AMDGPU::G_OR:
268     return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
269   case AMDGPU::G_XOR:
270     return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
271   default:
272     llvm_unreachable("not a bit op");
273   }
274 }
275 
276 bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
277   Register DstReg = I.getOperand(0).getReg();
278   unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
279 
280   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
281   if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
282       DstRB->getID() != AMDGPU::VCCRegBankID)
283     return false;
284 
285   bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
286                             STI.isWave64());
287   I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));
288 
289   // Dead implicit-def of scc
290   I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
291                                          true, // isImp
292                                          false, // isKill
293                                          true)); // isDead
294   return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
295 }
296 
297 bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
298   MachineBasicBlock *BB = I.getParent();
299   MachineFunction *MF = BB->getParent();
300   Register DstReg = I.getOperand(0).getReg();
301   const DebugLoc &DL = I.getDebugLoc();
302   LLT Ty = MRI->getType(DstReg);
303   if (Ty.isVector())
304     return false;
305 
306   unsigned Size = Ty.getSizeInBits();
307   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
308   const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
309   const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;
310 
311   if (Size == 32) {
312     if (IsSALU) {
313       const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
314       MachineInstr *Add =
315         BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
316         .add(I.getOperand(1))
317         .add(I.getOperand(2));
318       I.eraseFromParent();
319       return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
320     }
321 
322     if (STI.hasAddNoCarry()) {
323       const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
324       I.setDesc(TII.get(Opc));
325       I.addOperand(*MF, MachineOperand::CreateImm(0));
326       I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
327       return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
328     }
329 
330     const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;
331 
332     Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
333     MachineInstr *Add
334       = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
335       .addDef(UnusedCarry, RegState::Dead)
336       .add(I.getOperand(1))
337       .add(I.getOperand(2))
338       .addImm(0);
339     I.eraseFromParent();
340     return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
341   }
342 
343   assert(!Sub && "illegal sub should not reach here");
344 
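  // 64-bit case: split both operands into 32-bit halves, add the low halves,
  // propagate the carry into the high halves, and recombine the result with a
  // REG_SEQUENCE.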
345   const TargetRegisterClass &RC
346     = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
347   const TargetRegisterClass &HalfRC
348     = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;
349 
350   MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
351   MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
352   MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
353   MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));
354 
355   Register DstLo = MRI->createVirtualRegister(&HalfRC);
356   Register DstHi = MRI->createVirtualRegister(&HalfRC);
357 
358   if (IsSALU) {
359     BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
360       .add(Lo1)
361       .add(Lo2);
362     BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
363       .add(Hi1)
364       .add(Hi2);
365   } else {
366     const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
367     Register CarryReg = MRI->createVirtualRegister(CarryRC);
368     BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
369       .addDef(CarryReg)
370       .add(Lo1)
371       .add(Lo2)
372       .addImm(0);
373     MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
374       .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
375       .add(Hi1)
376       .add(Hi2)
377       .addReg(CarryReg, RegState::Kill)
378       .addImm(0);
379 
380     if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
381       return false;
382   }
383 
384   BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
385     .addReg(DstLo)
386     .addImm(AMDGPU::sub0)
387     .addReg(DstHi)
388     .addImm(AMDGPU::sub1);
389 
390 
391   if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
392     return false;
393 
394   I.eraseFromParent();
395   return true;
396 }
397 
398 bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
399   MachineInstr &I) const {
400   MachineBasicBlock *BB = I.getParent();
401   MachineFunction *MF = BB->getParent();
402   const DebugLoc &DL = I.getDebugLoc();
403   Register Dst0Reg = I.getOperand(0).getReg();
404   Register Dst1Reg = I.getOperand(1).getReg();
405   const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
406                      I.getOpcode() == AMDGPU::G_UADDE;
407   const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
408                           I.getOpcode() == AMDGPU::G_USUBE;
409 
410   if (isVCC(Dst1Reg, *MRI)) {
411     unsigned NoCarryOpc =
412         IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
413     unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
414     I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
415     I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
416     I.addOperand(*MF, MachineOperand::CreateImm(0));
417     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
418   }
419 
420   Register Src0Reg = I.getOperand(2).getReg();
421   Register Src1Reg = I.getOperand(3).getReg();
422 
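  // Scalar case: the carry is modeled through SCC. Copy any carry-in into SCC
  // before the add/sub, and copy SCC out afterwards as the carry result.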
423   if (HasCarryIn) {
424     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
425       .addReg(I.getOperand(4).getReg());
426   }
427 
428   unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
429   unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
430 
431   BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
432     .add(I.getOperand(2))
433     .add(I.getOperand(3));
434   BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
435     .addReg(AMDGPU::SCC);
436 
437   if (!MRI->getRegClassOrNull(Dst1Reg))
438     MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
439 
440   if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
441       !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
442       !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
443     return false;
444 
445   if (HasCarryIn &&
446       !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
447                                     AMDGPU::SReg_32RegClass, *MRI))
448     return false;
449 
450   I.eraseFromParent();
451   return true;
452 }
453 
454 // TODO: We should probably legalize these to use only 32-bit results.
455 bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
456   MachineBasicBlock *BB = I.getParent();
457   Register DstReg = I.getOperand(0).getReg();
458   Register SrcReg = I.getOperand(1).getReg();
459   LLT DstTy = MRI->getType(DstReg);
460   LLT SrcTy = MRI->getType(SrcReg);
461   const unsigned SrcSize = SrcTy.getSizeInBits();
462   unsigned DstSize = DstTy.getSizeInBits();
463 
464   // TODO: Should handle any multiple of 32 offset.
465   unsigned Offset = I.getOperand(2).getImm();
466   if (Offset % 32 != 0 || DstSize > 128)
467     return false;
468 
469   // 16-bit operations really use 32-bit registers.
470   // FIXME: Probably should not allow 16-bit G_EXTRACT results.
471   if (DstSize == 16)
472     DstSize = 32;
473 
474   const TargetRegisterClass *DstRC =
475     TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
476   if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
477     return false;
478 
479   const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
480   const TargetRegisterClass *SrcRC =
481     TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
482   if (!SrcRC)
483     return false;
484   unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
485                                                          DstSize / 32);
486   SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
487   if (!SrcRC)
488     return false;
489 
490   SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
491                                     *SrcRC, I.getOperand(1));
492   const DebugLoc &DL = I.getDebugLoc();
493   BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
494     .addReg(SrcReg, 0, SubReg);
495 
496   I.eraseFromParent();
497   return true;
498 }
499 
500 bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
501   MachineBasicBlock *BB = MI.getParent();
502   Register DstReg = MI.getOperand(0).getReg();
503   LLT DstTy = MRI->getType(DstReg);
504   LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());
505 
506   const unsigned SrcSize = SrcTy.getSizeInBits();
507   if (SrcSize < 32)
508     return selectImpl(MI, *CoverageInfo);
509 
510   const DebugLoc &DL = MI.getDebugLoc();
511   const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
512   const unsigned DstSize = DstTy.getSizeInBits();
513   const TargetRegisterClass *DstRC =
514     TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
515   if (!DstRC)
516     return false;
517 
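  // Assemble the wide destination with a REG_SEQUENCE, using subregister
  // indices sized to match the source pieces.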
518   ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
519   MachineInstrBuilder MIB =
520     BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
521   for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
522     MachineOperand &Src = MI.getOperand(I + 1);
523     MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
524     MIB.addImm(SubRegs[I]);
525 
526     const TargetRegisterClass *SrcRC
527       = TRI.getConstrainedRegClassForOperand(Src, *MRI);
528     if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
529       return false;
530   }
531 
532   if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
533     return false;
534 
535   MI.eraseFromParent();
536   return true;
537 }
538 
539 bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
540   MachineBasicBlock *BB = MI.getParent();
541   const int NumDst = MI.getNumOperands() - 1;
542 
543   MachineOperand &Src = MI.getOperand(NumDst);
544 
545   Register SrcReg = Src.getReg();
546   Register DstReg0 = MI.getOperand(0).getReg();
547   LLT DstTy = MRI->getType(DstReg0);
548   LLT SrcTy = MRI->getType(SrcReg);
549 
550   const unsigned DstSize = DstTy.getSizeInBits();
551   const unsigned SrcSize = SrcTy.getSizeInBits();
552   const DebugLoc &DL = MI.getDebugLoc();
553   const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
554 
555   const TargetRegisterClass *SrcRC =
556     TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
557   if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
558     return false;
559 
560   const unsigned SrcFlags = getUndefRegState(Src.isUndef());
561 
562   // Note we could have mixed SGPR and VGPR destination banks for an SGPR
563   // source, and this relies on the fact that the same subregister indices are
564   // used for both.
565   ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
566   for (int I = 0, E = NumDst; I != E; ++I) {
567     MachineOperand &Dst = MI.getOperand(I);
568     BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
569       .addReg(SrcReg, SrcFlags, SubRegs[I]);
570 
571     // Make sure the subregister index is valid for the source register.
572     SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
573     if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
574       return false;
575 
576     const TargetRegisterClass *DstRC =
577       TRI.getConstrainedRegClassForOperand(Dst, *MRI);
578     if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
579       return false;
580   }
581 
582   MI.eraseFromParent();
583   return true;
584 }
585 
586 bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC(
587   MachineInstr &MI) const {
588   if (selectImpl(MI, *CoverageInfo))
589     return true;
590 
591   const LLT S32 = LLT::scalar(32);
592   const LLT V2S16 = LLT::vector(2, 16);
593 
594   Register Dst = MI.getOperand(0).getReg();
595   if (MRI->getType(Dst) != V2S16)
596     return false;
597 
598   const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
599   if (DstBank->getID() != AMDGPU::SGPRRegBankID)
600     return false;
601 
602   Register Src0 = MI.getOperand(1).getReg();
603   Register Src1 = MI.getOperand(2).getReg();
604   if (MRI->getType(Src0) != S32)
605     return false;
606 
607   const DebugLoc &DL = MI.getDebugLoc();
608   MachineBasicBlock *BB = MI.getParent();
609 
610   auto ConstSrc1 = getConstantVRegValWithLookThrough(Src1, *MRI, true, true);
611   if (ConstSrc1) {
612     auto ConstSrc0 = getConstantVRegValWithLookThrough(Src0, *MRI, true, true);
613     if (ConstSrc0) {
614       uint32_t Lo16 = static_cast<uint32_t>(ConstSrc0->Value) & 0xffff;
615       uint32_t Hi16 = static_cast<uint32_t>(ConstSrc1->Value) & 0xffff;
616 
617       BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst)
618         .addImm(Lo16 | (Hi16 << 16));
619       MI.eraseFromParent();
620       return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
621     }
622   }
623 
624   // TODO: This should probably be a combine somewhere
625   // (build_vector_trunc $src0, undef) -> (copy $src0)
626   MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
627   if (Src1Def && Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
628     MI.setDesc(TII.get(AMDGPU::COPY));
629     MI.RemoveOperand(2);
630     return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI) &&
631            RBI.constrainGenericRegister(Src0, AMDGPU::SReg_32RegClass, *MRI);
632   }
633 
634   Register ShiftSrc0;
635   Register ShiftSrc1;
636   int64_t ShiftAmt;
637 
638   // With multiple uses of the shift, this will duplicate the shift and
639   // increase register pressure.
640   //
641   // (build_vector_trunc (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16))
642   //  => (S_PACK_HH_B32_B16 $src0, $src1)
643   // (build_vector_trunc $src0, (lshr_oneuse SReg_32:$src1, 16))
644   //  => (S_PACK_LH_B32_B16 $src0, $src1)
645   // (build_vector_trunc $src0, $src1)
646   //  => (S_PACK_LL_B32_B16 $src0, $src1)
647 
648   // FIXME: This is an inconvenient way to check a specific value
649   bool Shift0 = mi_match(
650     Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_ICst(ShiftAmt)))) &&
651     ShiftAmt == 16;
652 
653   bool Shift1 = mi_match(
654     Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_ICst(ShiftAmt)))) &&
655     ShiftAmt == 16;
656 
657   unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
658   if (Shift0 && Shift1) {
659     Opc = AMDGPU::S_PACK_HH_B32_B16;
660     MI.getOperand(1).setReg(ShiftSrc0);
661     MI.getOperand(2).setReg(ShiftSrc1);
662   } else if (Shift1) {
663     Opc = AMDGPU::S_PACK_LH_B32_B16;
664     MI.getOperand(2).setReg(ShiftSrc1);
665   } else if (Shift0 && ConstSrc1 && ConstSrc1->Value == 0) {
666     // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
667     auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
668       .addReg(ShiftSrc0)
669       .addImm(16);
670 
671     MI.eraseFromParent();
672     return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
673   }
674 
675   MI.setDesc(TII.get(Opc));
676   return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
677 }
678 
679 bool AMDGPUInstructionSelector::selectG_PTR_ADD(MachineInstr &I) const {
680   return selectG_ADD_SUB(I);
681 }
682 
683 bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
684   const MachineOperand &MO = I.getOperand(0);
685 
686   // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
687   // regbank check here is to know why getConstrainedRegClassForOperand failed.
688   const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
689   if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
690       (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
691     I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
692     return true;
693   }
694 
695   return false;
696 }
697 
698 bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
699   MachineBasicBlock *BB = I.getParent();
700 
701   Register DstReg = I.getOperand(0).getReg();
702   Register Src0Reg = I.getOperand(1).getReg();
703   Register Src1Reg = I.getOperand(2).getReg();
704   LLT Src1Ty = MRI->getType(Src1Reg);
705 
706   unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
707   unsigned InsSize = Src1Ty.getSizeInBits();
708 
709   int64_t Offset = I.getOperand(3).getImm();
710 
711   // FIXME: These cases should have been illegal and unnecessary to check here.
712   if (Offset % 32 != 0 || InsSize % 32 != 0)
713     return false;
714 
715   // Currently not handled by getSubRegFromChannel.
716   if (InsSize > 128)
717     return false;
718 
719   unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
720   if (SubReg == AMDGPU::NoSubRegister)
721     return false;
722 
723   const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
724   const TargetRegisterClass *DstRC =
725     TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
726   if (!DstRC)
727     return false;
728 
729   const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
730   const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
731   const TargetRegisterClass *Src0RC =
732     TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank, *MRI);
733   const TargetRegisterClass *Src1RC =
734     TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank, *MRI);
735 
736   // Deal with weird cases where the class only partially supports the subreg
737   // index.
738   Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
739   if (!Src0RC || !Src1RC)
740     return false;
741 
742   if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
743       !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
744       !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
745     return false;
746 
747   const DebugLoc &DL = I.getDebugLoc();
748   BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
749     .addReg(Src0Reg)
750     .addReg(Src1Reg)
751     .addImm(SubReg);
752 
753   I.eraseFromParent();
754   return true;
755 }
756 
757 bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
758   if (STI.getLDSBankCount() != 16)
759     return selectImpl(MI, *CoverageInfo);
760 
761   Register Dst = MI.getOperand(0).getReg();
762   Register Src0 = MI.getOperand(2).getReg();
763   Register M0Val = MI.getOperand(6).getReg();
764   if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
765       !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
766       !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
767     return false;
768 
769   // This requires 2 instructions. It is possible to write a pattern to support
770   // this, but the generated isel emitter doesn't correctly deal with multiple
771   // output instructions using the same physical register input. The copy to m0
772   // is incorrectly placed before the second instruction.
773   //
774   // TODO: Match source modifiers.
775 
776   Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
777   const DebugLoc &DL = MI.getDebugLoc();
778   MachineBasicBlock *MBB = MI.getParent();
779 
780   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
781     .addReg(M0Val);
782   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
783     .addImm(2)
784     .addImm(MI.getOperand(4).getImm())  // $attr
785     .addImm(MI.getOperand(3).getImm()); // $attrchan
786 
787   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
788     .addImm(0)                          // $src0_modifiers
789     .addReg(Src0)                       // $src0
790     .addImm(MI.getOperand(4).getImm())  // $attr
791     .addImm(MI.getOperand(3).getImm())  // $attrchan
792     .addImm(0)                          // $src2_modifiers
793     .addReg(InterpMov)                  // $src2 - 2 f16 values selected by high
794     .addImm(MI.getOperand(5).getImm())  // $high
795     .addImm(0)                          // $clamp
796     .addImm(0);                         // $omod
797 
798   MI.eraseFromParent();
799   return true;
800 }
801 
802 // We need to handle this here because tablegen doesn't support matching
803 // instructions with multiple outputs.
804 bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
805   Register Dst0 = MI.getOperand(0).getReg();
806   Register Dst1 = MI.getOperand(1).getReg();
807 
808   LLT Ty = MRI->getType(Dst0);
809   unsigned Opc;
810   if (Ty == LLT::scalar(32))
811     Opc = AMDGPU::V_DIV_SCALE_F32;
812   else if (Ty == LLT::scalar(64))
813     Opc = AMDGPU::V_DIV_SCALE_F64;
814   else
815     return false;
816 
817   const DebugLoc &DL = MI.getDebugLoc();
818   MachineBasicBlock *MBB = MI.getParent();
819 
820   Register Numer = MI.getOperand(3).getReg();
821   Register Denom = MI.getOperand(4).getReg();
822   unsigned ChooseDenom = MI.getOperand(5).getImm();
823 
824   Register Src0 = ChooseDenom != 0 ? Numer : Denom;
825 
826   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
827     .addDef(Dst1)
828     .addUse(Src0)
829     .addUse(Denom)
830     .addUse(Numer);
831 
832   MI.eraseFromParent();
833   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
834 }
835 
836 bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
837   unsigned IntrinsicID = I.getIntrinsicID();
838   switch (IntrinsicID) {
839   case Intrinsic::amdgcn_if_break: {
840     MachineBasicBlock *BB = I.getParent();
841 
842     // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
843     // SelectionDAG uses for wave32 vs wave64.
844     BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
845       .add(I.getOperand(0))
846       .add(I.getOperand(2))
847       .add(I.getOperand(3));
848 
849     Register DstReg = I.getOperand(0).getReg();
850     Register Src0Reg = I.getOperand(2).getReg();
851     Register Src1Reg = I.getOperand(3).getReg();
852 
853     I.eraseFromParent();
854 
855     for (Register Reg : { DstReg, Src0Reg, Src1Reg })
856       MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
857 
858     return true;
859   }
860   case Intrinsic::amdgcn_interp_p1_f16:
861     return selectInterpP1F16(I);
862   case Intrinsic::amdgcn_wqm:
863     return constrainCopyLikeIntrin(I, AMDGPU::WQM);
864   case Intrinsic::amdgcn_softwqm:
865     return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
866   case Intrinsic::amdgcn_wwm:
867     return constrainCopyLikeIntrin(I, AMDGPU::WWM);
868   case Intrinsic::amdgcn_div_scale:
869     return selectDivScale(I);
870   case Intrinsic::amdgcn_icmp:
871     return selectIntrinsicIcmp(I);
872   case Intrinsic::amdgcn_ballot:
873     return selectBallot(I);
874   case Intrinsic::amdgcn_reloc_constant:
875     return selectRelocConstant(I);
876   case Intrinsic::returnaddress:
877     return selectReturnAddress(I);
878   default:
879     return selectImpl(I, *CoverageInfo);
880   }
881 }
882 
883 static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size) {
884   if (Size != 32 && Size != 64)
885     return -1;
886   switch (P) {
887   default:
888     llvm_unreachable("Unknown condition code!");
889   case CmpInst::ICMP_NE:
890     return Size == 32 ? AMDGPU::V_CMP_NE_U32_e64 : AMDGPU::V_CMP_NE_U64_e64;
891   case CmpInst::ICMP_EQ:
892     return Size == 32 ? AMDGPU::V_CMP_EQ_U32_e64 : AMDGPU::V_CMP_EQ_U64_e64;
893   case CmpInst::ICMP_SGT:
894     return Size == 32 ? AMDGPU::V_CMP_GT_I32_e64 : AMDGPU::V_CMP_GT_I64_e64;
895   case CmpInst::ICMP_SGE:
896     return Size == 32 ? AMDGPU::V_CMP_GE_I32_e64 : AMDGPU::V_CMP_GE_I64_e64;
897   case CmpInst::ICMP_SLT:
898     return Size == 32 ? AMDGPU::V_CMP_LT_I32_e64 : AMDGPU::V_CMP_LT_I64_e64;
899   case CmpInst::ICMP_SLE:
900     return Size == 32 ? AMDGPU::V_CMP_LE_I32_e64 : AMDGPU::V_CMP_LE_I64_e64;
901   case CmpInst::ICMP_UGT:
902     return Size == 32 ? AMDGPU::V_CMP_GT_U32_e64 : AMDGPU::V_CMP_GT_U64_e64;
903   case CmpInst::ICMP_UGE:
904     return Size == 32 ? AMDGPU::V_CMP_GE_U32_e64 : AMDGPU::V_CMP_GE_U64_e64;
905   case CmpInst::ICMP_ULT:
906     return Size == 32 ? AMDGPU::V_CMP_LT_U32_e64 : AMDGPU::V_CMP_LT_U64_e64;
907   case CmpInst::ICMP_ULE:
908     return Size == 32 ? AMDGPU::V_CMP_LE_U32_e64 : AMDGPU::V_CMP_LE_U64_e64;
909   }
910 }
911 
912 int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
913                                               unsigned Size) const {
914   if (Size == 64) {
915     if (!STI.hasScalarCompareEq64())
916       return -1;
917 
918     switch (P) {
919     case CmpInst::ICMP_NE:
920       return AMDGPU::S_CMP_LG_U64;
921     case CmpInst::ICMP_EQ:
922       return AMDGPU::S_CMP_EQ_U64;
923     default:
924       return -1;
925     }
926   }
927 
928   if (Size != 32)
929     return -1;
930 
931   switch (P) {
932   case CmpInst::ICMP_NE:
933     return AMDGPU::S_CMP_LG_U32;
934   case CmpInst::ICMP_EQ:
935     return AMDGPU::S_CMP_EQ_U32;
936   case CmpInst::ICMP_SGT:
937     return AMDGPU::S_CMP_GT_I32;
938   case CmpInst::ICMP_SGE:
939     return AMDGPU::S_CMP_GE_I32;
940   case CmpInst::ICMP_SLT:
941     return AMDGPU::S_CMP_LT_I32;
942   case CmpInst::ICMP_SLE:
943     return AMDGPU::S_CMP_LE_I32;
944   case CmpInst::ICMP_UGT:
945     return AMDGPU::S_CMP_GT_U32;
946   case CmpInst::ICMP_UGE:
947     return AMDGPU::S_CMP_GE_U32;
948   case CmpInst::ICMP_ULT:
949     return AMDGPU::S_CMP_LT_U32;
950   case CmpInst::ICMP_ULE:
951     return AMDGPU::S_CMP_LE_U32;
952   default:
953     llvm_unreachable("Unknown condition code!");
954   }
955 }
956 
957 bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const {
958   MachineBasicBlock *BB = I.getParent();
959   const DebugLoc &DL = I.getDebugLoc();
960 
961   Register SrcReg = I.getOperand(2).getReg();
962   unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
963 
964   auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
965 
966   Register CCReg = I.getOperand(0).getReg();
967   if (!isVCC(CCReg, *MRI)) {
968     int Opcode = getS_CMPOpcode(Pred, Size);
969     if (Opcode == -1)
970       return false;
971     MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
972             .add(I.getOperand(2))
973             .add(I.getOperand(3));
974     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
975       .addReg(AMDGPU::SCC);
976     bool Ret =
977         constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
978         RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
979     I.eraseFromParent();
980     return Ret;
981   }
982 
983   int Opcode = getV_CMPOpcode(Pred, Size);
984   if (Opcode == -1)
985     return false;
986 
987   MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode),
988             I.getOperand(0).getReg())
989             .add(I.getOperand(2))
990             .add(I.getOperand(3));
991   RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
992                                *TRI.getBoolRC(), *MRI);
993   bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
994   I.eraseFromParent();
995   return Ret;
996 }
997 
998 bool AMDGPUInstructionSelector::selectIntrinsicIcmp(MachineInstr &I) const {
999   Register Dst = I.getOperand(0).getReg();
1000   if (isVCC(Dst, *MRI))
1001     return false;
1002 
1003   if (MRI->getType(Dst).getSizeInBits() != STI.getWavefrontSize())
1004     return false;
1005 
1006   MachineBasicBlock *BB = I.getParent();
1007   const DebugLoc &DL = I.getDebugLoc();
1008   Register SrcReg = I.getOperand(2).getReg();
1009   unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1010   auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
1011 
1012   int Opcode = getV_CMPOpcode(Pred, Size);
1013   if (Opcode == -1)
1014     return false;
1015 
1016   MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst)
1017                            .add(I.getOperand(2))
1018                            .add(I.getOperand(3));
1019   RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(), *TRI.getBoolRC(),
1020                                *MRI);
1021   bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1022   I.eraseFromParent();
1023   return Ret;
1024 }
1025 
1026 bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1027   MachineBasicBlock *BB = I.getParent();
1028   const DebugLoc &DL = I.getDebugLoc();
1029   Register DstReg = I.getOperand(0).getReg();
1030   const unsigned Size = MRI->getType(DstReg).getSizeInBits();
1031   const bool Is64 = Size == 64;
1032 
1033   if (Size != STI.getWavefrontSize())
1034     return false;
1035 
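  // Fold a ballot of a constant condition: 0 becomes 0 and -1 (true) becomes
  // a copy of EXEC. A non-constant source is expected to already be a wave
  // mask here, so a plain copy suffices.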
1036   Optional<ValueAndVReg> Arg =
1037       getConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI, true);
1038 
1039   if (Arg.hasValue()) {
1040     const int64_t Value = Arg.getValue().Value;
1041     if (Value == 0) {
1042       unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1043       BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
1044     } else if (Value == -1) { // all ones
1045       Register SrcReg = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
1046       BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
1047     } else
1048       return false;
1049   } else {
1050     Register SrcReg = I.getOperand(2).getReg();
1051     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
1052   }
1053 
1054   I.eraseFromParent();
1055   return true;
1056 }
1057 
1058 bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1059   Register DstReg = I.getOperand(0).getReg();
1060   const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1061   const TargetRegisterClass *DstRC =
1062     TRI.getRegClassForSizeOnBank(32, *DstBank, *MRI);
1063   if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
1064     return false;
1065 
1066   const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1067 
1068   Module *M = MF->getFunction().getParent();
1069   const MDNode *Metadata = I.getOperand(2).getMetadata();
1070   auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
1071   auto RelocSymbol = cast<GlobalVariable>(
1072     M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
1073 
1074   MachineBasicBlock *BB = I.getParent();
1075   BuildMI(*BB, &I, I.getDebugLoc(),
1076           TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
1077     .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);
1078 
1079   I.eraseFromParent();
1080   return true;
1081 }
1082 
1083 bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
1084   MachineBasicBlock *MBB = I.getParent();
1085   MachineFunction &MF = *MBB->getParent();
1086   const DebugLoc &DL = I.getDebugLoc();
1087 
1088   MachineOperand &Dst = I.getOperand(0);
1089   Register DstReg = Dst.getReg();
1090   unsigned Depth = I.getOperand(2).getImm();
1091 
1092   const TargetRegisterClass *RC
1093     = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
1094   if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
1095       !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
1096     return false;
1097 
1098   // Check for kernel and shader functions
1099   if (Depth != 0 ||
1100       MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1101     BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
1102       .addImm(0);
1103     I.eraseFromParent();
1104     return true;
1105   }
1106 
1107   MachineFrameInfo &MFI = MF.getFrameInfo();
1108   // There is a call to @llvm.returnaddress in this function
1109   MFI.setReturnAddressIsTaken(true);
1110 
1111   // Get the return address reg and mark it as an implicit live-in
1112   Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
1113   Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
1114                                              AMDGPU::SReg_64RegClass);
1115   BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1116     .addReg(LiveIn);
1117   I.eraseFromParent();
1118   return true;
1119 }
1120 
1121 bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1122   // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1123   // SelectionDAG uses for wave32 vs wave64.
1124   MachineBasicBlock *BB = MI.getParent();
1125   BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1126       .add(MI.getOperand(1));
1127 
1128   Register Reg = MI.getOperand(1).getReg();
1129   MI.eraseFromParent();
1130 
1131   if (!MRI->getRegClassOrNull(Reg))
1132     MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1133   return true;
1134 }
1135 
1136 bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1137   MachineInstr &MI, Intrinsic::ID IntrID) const {
1138   MachineBasicBlock *MBB = MI.getParent();
1139   MachineFunction *MF = MBB->getParent();
1140   const DebugLoc &DL = MI.getDebugLoc();
1141 
1142   unsigned IndexOperand = MI.getOperand(7).getImm();
1143   bool WaveRelease = MI.getOperand(8).getImm() != 0;
1144   bool WaveDone = MI.getOperand(9).getImm() != 0;
1145 
1146   if (WaveDone && !WaveRelease)
1147     report_fatal_error("ds_ordered_count: wave_done requires wave_release");
1148 
1149   unsigned OrderedCountIndex = IndexOperand & 0x3f;
1150   IndexOperand &= ~0x3f;
1151   unsigned CountDw = 0;
1152 
1153   if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
1154     CountDw = (IndexOperand >> 24) & 0xf;
1155     IndexOperand &= ~(0xf << 24);
1156 
1157     if (CountDw < 1 || CountDw > 4) {
1158       report_fatal_error(
1159         "ds_ordered_count: dword count must be between 1 and 4");
1160     }
1161   }
1162 
1163   if (IndexOperand)
1164     report_fatal_error("ds_ordered_count: bad index operand");
1165 
1166   unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1167   unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);
1168 
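  // Pack the DS_ORDERED_COUNT offset field: offset0 holds the ordered-count
  // index; offset1 holds wave_release, wave_done, the shader type, and the
  // instruction kind (plus the dword count on gfx10+).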
1169   unsigned Offset0 = OrderedCountIndex << 2;
1170   unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) |
1171                      (Instruction << 4);
1172 
1173   if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
1174     Offset1 |= (CountDw - 1) << 6;
1175 
1176   unsigned Offset = Offset0 | (Offset1 << 8);
1177 
1178   Register M0Val = MI.getOperand(2).getReg();
1179   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1180     .addReg(M0Val);
1181 
1182   Register DstReg = MI.getOperand(0).getReg();
1183   Register ValReg = MI.getOperand(3).getReg();
1184   MachineInstrBuilder DS =
1185     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
1186       .addReg(ValReg)
1187       .addImm(Offset)
1188       .cloneMemRefs(MI);
1189 
1190   if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
1191     return false;
1192 
1193   bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
1194   MI.eraseFromParent();
1195   return Ret;
1196 }
1197 
1198 static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1199   switch (IntrID) {
1200   case Intrinsic::amdgcn_ds_gws_init:
1201     return AMDGPU::DS_GWS_INIT;
1202   case Intrinsic::amdgcn_ds_gws_barrier:
1203     return AMDGPU::DS_GWS_BARRIER;
1204   case Intrinsic::amdgcn_ds_gws_sema_v:
1205     return AMDGPU::DS_GWS_SEMA_V;
1206   case Intrinsic::amdgcn_ds_gws_sema_br:
1207     return AMDGPU::DS_GWS_SEMA_BR;
1208   case Intrinsic::amdgcn_ds_gws_sema_p:
1209     return AMDGPU::DS_GWS_SEMA_P;
1210   case Intrinsic::amdgcn_ds_gws_sema_release_all:
1211     return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1212   default:
1213     llvm_unreachable("not a gws intrinsic");
1214   }
1215 }
1216 
1217 bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
1218                                                      Intrinsic::ID IID) const {
1219   if (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1220       !STI.hasGWSSemaReleaseAll())
1221     return false;
1222 
1223   // intrinsic ID, vsrc, offset
1224   const bool HasVSrc = MI.getNumOperands() == 3;
1225   assert(HasVSrc || MI.getNumOperands() == 2);
1226 
1227   Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
1228   const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
1229   if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
1230     return false;
1231 
1232   MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1233   assert(OffsetDef);
1234 
1235   unsigned ImmOffset;
1236 
1237   MachineBasicBlock *MBB = MI.getParent();
1238   const DebugLoc &DL = MI.getDebugLoc();
1239 
1240   MachineInstr *Readfirstlane = nullptr;
1241 
1242   // If we legalized the VGPR input, strip out the readfirstlane to analyze the
1243   // incoming offset, in case there's an add of a constant. We'll have to put it
1244   // back later.
1245   if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
1246     Readfirstlane = OffsetDef;
1247     BaseOffset = OffsetDef->getOperand(1).getReg();
1248     OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1249   }
1250 
1251   if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
1252     // If we have a constant offset, try to use the 0 in m0 as the base.
1253     // TODO: Look into changing the default m0 initialization value. If the
1254     // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
1255     // the immediate offset.
1256 
1257     ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
1258     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1259       .addImm(0);
1260   } else {
1261     std::tie(BaseOffset, ImmOffset, OffsetDef)
1262       = AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset);
1263 
1264     if (Readfirstlane) {
1265       // We have the constant offset now, so put the readfirstlane back on the
1266       // variable component.
1267       if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
1268         return false;
1269 
1270       Readfirstlane->getOperand(1).setReg(BaseOffset);
1271       BaseOffset = Readfirstlane->getOperand(0).getReg();
1272     } else {
1273       if (!RBI.constrainGenericRegister(BaseOffset,
1274                                         AMDGPU::SReg_32RegClass, *MRI))
1275         return false;
1276     }
1277 
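    // The hardware reads the GWS resource base from M0 bits [21:16] (see the
    // note below), so shift the variable offset into that field.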
1278     Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1279     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
1280       .addReg(BaseOffset)
1281       .addImm(16);
1282 
1283     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1284       .addReg(M0Base);
1285   }
1286 
1287   // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
1288   // offset field) % 64. Some versions of the programming guide omit the m0
1289   // part, or claim it's from offset 0.
1290   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));
1291 
1292   if (HasVSrc) {
1293     Register VSrc = MI.getOperand(1).getReg();
1294     MIB.addReg(VSrc);
1295     if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
1296       return false;
1297   }
1298 
1299   MIB.addImm(ImmOffset)
1300      .addImm(-1) // $gds
1301      .cloneMemRefs(MI);
1302 
1303   MI.eraseFromParent();
1304   return true;
1305 }
1306 
1307 bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
1308                                                       bool IsAppend) const {
1309   Register PtrBase = MI.getOperand(2).getReg();
1310   LLT PtrTy = MRI->getType(PtrBase);
1311   bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
1312 
1313   unsigned Offset;
1314   std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
1315 
1316   // TODO: Should this try to look through readfirstlane like GWS?
1317   if (!isDSOffsetLegal(PtrBase, Offset, 16)) {
1318     PtrBase = MI.getOperand(2).getReg();
1319     Offset = 0;
1320   }
1321 
1322   MachineBasicBlock *MBB = MI.getParent();
1323   const DebugLoc &DL = MI.getDebugLoc();
1324   const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
1325 
1326   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1327     .addReg(PtrBase);
1328   if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
1329     return false;
1330 
1331   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
1332     .addImm(Offset)
1333     .addImm(IsGDS ? -1 : 0)
1334     .cloneMemRefs(MI);
1335   MI.eraseFromParent();
1336   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1337 }
1338 
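// Decode the texfailctrl immediate: bit 0 enables TFE and bit 1 enables LWE.
// IsTexFail is set if any bit was requested; returns false if unknown bits
// remain.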
1339 static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
1340                          bool &IsTexFail) {
1341   if (TexFailCtrl)
1342     IsTexFail = true;
1343 
1344   TFE = (TexFailCtrl & 0x1) ? 1 : 0;
1345   TexFailCtrl &= ~(uint64_t)0x1;
1346   LWE = (TexFailCtrl & 0x2) ? 1 : 0;
1347   TexFailCtrl &= ~(uint64_t)0x2;
1348 
1349   return TexFailCtrl == 0;
1350 }
1351 
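// Decode the cachepolicy immediate into GLC (bit 0), SLC (bit 1) and, when
// requested, DLC (bit 2). Returns false if unknown bits are set.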
1352 static bool parseCachePolicy(uint64_t Value,
1353                              bool *GLC, bool *SLC, bool *DLC) {
1354   if (GLC) {
1355     *GLC = (Value & 0x1) ? 1 : 0;
1356     Value &= ~(uint64_t)0x1;
1357   }
1358   if (SLC) {
1359     *SLC = (Value & 0x2) ? 1 : 0;
1360     Value &= ~(uint64_t)0x2;
1361   }
1362   if (DLC) {
1363     *DLC = (Value & 0x4) ? 1 : 0;
1364     Value &= ~(uint64_t)0x4;
1365   }
1366 
1367   return Value == 0;
1368 }
1369 
1370 bool AMDGPUInstructionSelector::selectImageIntrinsic(
1371   MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
1372   MachineBasicBlock *MBB = MI.getParent();
1373   const DebugLoc &DL = MI.getDebugLoc();
1374 
1375   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
1376     AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1377 
1378   const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
1379   const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
1380       AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
1381   const AMDGPU::MIMGMIPMappingInfo *MIPMappingInfo =
1382       AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode);
1383   unsigned IntrOpcode = Intr->BaseOpcode;
1384   const bool IsGFX10 = STI.getGeneration() >= AMDGPUSubtarget::GFX10;
1385 
1386   const int VAddrIdx = getImageVAddrIdxBegin(BaseOpcode,
1387                                              MI.getNumExplicitDefs());
1388   int NumVAddr, NumGradients;
1389   std::tie(NumVAddr, NumGradients) = getImageNumVAddr(Intr, BaseOpcode);
1390 
1391   Register VDataIn, VDataOut;
1392   LLT VDataTy;
1393   int NumVDataDwords = -1;
1394   bool IsD16 = false;
1395 
1396   // XXX - Can we just get the second to last argument for ctrl?
1397   unsigned CtrlIdx; // Index of texfailctrl argument
1398   bool Unorm;
1399   if (!BaseOpcode->Sampler) {
1400     Unorm = true;
1401     CtrlIdx = VAddrIdx + NumVAddr + 1;
1402   } else {
1403     Unorm = MI.getOperand(VAddrIdx + NumVAddr + 2).getImm() != 0;
1404     CtrlIdx = VAddrIdx + NumVAddr + 3;
1405   }
1406 
1407   bool TFE;
1408   bool LWE;
1409   bool IsTexFail = false;
1410   if (!parseTexFail(MI.getOperand(CtrlIdx).getImm(), TFE, LWE, IsTexFail))
1411     return false;
1412 
1413   const int Flags = MI.getOperand(CtrlIdx + 2).getImm();
1414   const bool IsA16 = (Flags & 1) != 0;
1415   const bool IsG16 = (Flags & 2) != 0;
1416 
1417   // A16 implies 16-bit gradients
1418   if (IsA16 && !IsG16)
1419     return false;
1420 
1421   unsigned DMask = 0;
1422   unsigned DMaskLanes = 0;
1423 
1424   if (BaseOpcode->Atomic) {
1425     VDataOut = MI.getOperand(0).getReg();
1426     VDataIn = MI.getOperand(2).getReg();
1427     LLT Ty = MRI->getType(VDataIn);
1428 
1429     // Be careful to allow atomic swap on 16-bit element vectors.
1430     const bool Is64Bit = BaseOpcode->AtomicX2 ?
1431       Ty.getSizeInBits() == 128 :
1432       Ty.getSizeInBits() == 64;
1433 
1434     if (BaseOpcode->AtomicX2) {
1435       assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
1436 
1437       DMask = Is64Bit ? 0xf : 0x3;
1438       NumVDataDwords = Is64Bit ? 4 : 2;
1439     } else {
1440       DMask = Is64Bit ? 0x3 : 0x1;
1441       NumVDataDwords = Is64Bit ? 2 : 1;
1442     }
1443   } else {
1444     const int DMaskIdx = 2; // Input/output + intrinsic ID.
1445 
1446     DMask = MI.getOperand(DMaskIdx).getImm();
1447     DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);
1448 
1449     if (BaseOpcode->Store) {
1450       VDataIn = MI.getOperand(1).getReg();
1451       VDataTy = MRI->getType(VDataIn);
1452       NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
1453     } else {
1454       VDataOut = MI.getOperand(0).getReg();
1455       VDataTy = MRI->getType(VDataOut);
1456       NumVDataDwords = DMaskLanes;
1457 
1458       // One memoperand is mandatory, except for getresinfo.
1459       // FIXME: Check this in verifier.
1460       if (!MI.memoperands_empty()) {
1461         const MachineMemOperand *MMO = *MI.memoperands_begin();
1462 
1463         // Infer d16 from the memory size, as the register type will be mangled by
1464         // unpacked subtargets, or by TFE.
1465         IsD16 = ((8 * MMO->getSize()) / DMaskLanes) < 32;
1466 
1467         if (IsD16 && !STI.hasUnpackedD16VMem())
1468           NumVDataDwords = (DMaskLanes + 1) / 2;
1469       }
1470     }
1471   }
1472 
1473   // Optimize _L to _LZ when _L is zero
1474   if (LZMappingInfo) {
1475     // The legalizer replaced the register with an immediate 0 if we need to
1476     // change the opcode.
1477     const MachineOperand &Lod = MI.getOperand(VAddrIdx + NumVAddr - 1);
1478     if (Lod.isImm()) {
1479       assert(Lod.getImm() == 0);
1480       IntrOpcode = LZMappingInfo->LZ;  // set new opcode to _lz variant of _l
1481     }
1482   }
1483 
  // Optimize _mip away when 'lod' is zero.
1485   if (MIPMappingInfo) {
1486     const MachineOperand &Lod = MI.getOperand(VAddrIdx + NumVAddr - 1);
1487     if (Lod.isImm()) {
1488       assert(Lod.getImm() == 0);
1489       IntrOpcode = MIPMappingInfo->NONMIP;  // set new opcode to variant without _mip
1490     }
1491   }
1492 
1493   // Set G16 opcode
1494   if (IsG16 && !IsA16) {
1495     const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
1496         AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
1497     assert(G16MappingInfo);
1498     IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
1499   }
1500 
1501   // TODO: Check this in verifier.
1502   assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
1503 
1504   bool GLC = false;
1505   bool SLC = false;
1506   bool DLC = false;
1507   if (BaseOpcode->Atomic) {
1508     GLC = true; // TODO no-return optimization
1509     if (!parseCachePolicy(MI.getOperand(CtrlIdx + 1).getImm(), nullptr, &SLC,
1510                           IsGFX10 ? &DLC : nullptr))
1511       return false;
1512   } else {
1513     if (!parseCachePolicy(MI.getOperand(CtrlIdx + 1).getImm(), &GLC, &SLC,
1514                           IsGFX10 ? &DLC : nullptr))
1515       return false;
1516   }
1517 
1518   int NumVAddrRegs = 0;
1519   int NumVAddrDwords = 0;
1520   for (int I = 0; I < NumVAddr; ++I) {
1521     // Skip the $noregs and 0s inserted during legalization.
1522     MachineOperand &AddrOp = MI.getOperand(VAddrIdx + I);
1523     if (!AddrOp.isReg())
1524       continue; // XXX - Break?
1525 
1526     Register Addr = AddrOp.getReg();
1527     if (!Addr)
1528       break;
1529 
1530     ++NumVAddrRegs;
1531     NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
1532   }
1533 
  // The legalizer preprocessed the intrinsic arguments. If we aren't using
  // NSA, these should have been packed into a single value in the first
  // address register.
1537   const bool UseNSA = NumVAddrRegs != 1 && NumVAddrDwords == NumVAddrRegs;
1538   if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
1539     LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
1540     return false;
1541   }
1542 
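  // TFE/LWE return an extra status dword, so reserve one more result dword.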
1543   if (IsTexFail)
1544     ++NumVDataDwords;
1545 
1546   int Opcode = -1;
1547   if (IsGFX10) {
1548     Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
1549                                    UseNSA ? AMDGPU::MIMGEncGfx10NSA
1550                                           : AMDGPU::MIMGEncGfx10Default,
1551                                    NumVDataDwords, NumVAddrDwords);
1552   } else {
1553     if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
1554       Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
1555                                      NumVDataDwords, NumVAddrDwords);
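    // Fall back to the GFX6 encoding if there is no GFX8 variant (or on
    // pre-GFX8 targets).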
1556     if (Opcode == -1)
1557       Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
1558                                      NumVDataDwords, NumVAddrDwords);
1559   }
1560   assert(Opcode != -1);
1561 
1562   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
1563     .cloneMemRefs(MI);
1564 
1565   if (VDataOut) {
1566     if (BaseOpcode->AtomicX2) {
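      // An X2 atomic returns a register twice the result width; the value we
      // want is in the low half, so copy that subregister out.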
1567       const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
1568 
1569       Register TmpReg = MRI->createVirtualRegister(
1570         Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
1571       unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
1572 
1573       MIB.addDef(TmpReg);
1574       BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
1575         .addReg(TmpReg, RegState::Kill, SubReg);
1576 
1577     } else {
1578       MIB.addDef(VDataOut); // vdata output
1579     }
1580   }
1581 
1582   if (VDataIn)
1583     MIB.addReg(VDataIn); // vdata input
1584 
1585   for (int i = 0; i != NumVAddrRegs; ++i) {
1586     MachineOperand &SrcOp = MI.getOperand(VAddrIdx + i);
1587     if (SrcOp.isReg()) {
1588       assert(SrcOp.getReg() != 0);
1589       MIB.addReg(SrcOp.getReg());
1590     }
1591   }
1592 
1593   MIB.addReg(MI.getOperand(VAddrIdx + NumVAddr).getReg()); // rsrc
1594   if (BaseOpcode->Sampler)
1595     MIB.addReg(MI.getOperand(VAddrIdx + NumVAddr + 1).getReg()); // sampler
1596 
1597   MIB.addImm(DMask); // dmask
1598 
1599   if (IsGFX10)
1600     MIB.addImm(DimInfo->Encoding);
1601   MIB.addImm(Unorm);
1602   if (IsGFX10)
1603     MIB.addImm(DLC);
1604 
1605   MIB.addImm(GLC);
1606   MIB.addImm(SLC);
1607   MIB.addImm(IsA16 &&  // a16 or r128
1608              STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
1609   if (IsGFX10)
1610     MIB.addImm(IsA16 ? -1 : 0);
1611 
1612   MIB.addImm(TFE); // tfe
1613   MIB.addImm(LWE); // lwe
1614   if (!IsGFX10)
1615     MIB.addImm(DimInfo->DA ? -1 : 0);
1616   if (BaseOpcode->HasD16)
1617     MIB.addImm(IsD16 ? -1 : 0);
1618 
1619   MI.eraseFromParent();
1620   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1621 }
1622 
1623 bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
1624     MachineInstr &I) const {
1625   unsigned IntrinsicID = I.getIntrinsicID();
1626   switch (IntrinsicID) {
1627   case Intrinsic::amdgcn_end_cf:
1628     return selectEndCfIntrinsic(I);
1629   case Intrinsic::amdgcn_ds_ordered_add:
1630   case Intrinsic::amdgcn_ds_ordered_swap:
1631     return selectDSOrderedIntrinsic(I, IntrinsicID);
1632   case Intrinsic::amdgcn_ds_gws_init:
1633   case Intrinsic::amdgcn_ds_gws_barrier:
1634   case Intrinsic::amdgcn_ds_gws_sema_v:
1635   case Intrinsic::amdgcn_ds_gws_sema_br:
1636   case Intrinsic::amdgcn_ds_gws_sema_p:
1637   case Intrinsic::amdgcn_ds_gws_sema_release_all:
1638     return selectDSGWSIntrinsic(I, IntrinsicID);
1639   case Intrinsic::amdgcn_ds_append:
1640     return selectDSAppendConsume(I, true);
1641   case Intrinsic::amdgcn_ds_consume:
1642     return selectDSAppendConsume(I, false);
1643   default: {
1644     return selectImpl(I, *CoverageInfo);
1645   }
1646   }
1647 }
1648 
1649 bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
1650   if (selectImpl(I, *CoverageInfo))
1651     return true;
1652 
1653   MachineBasicBlock *BB = I.getParent();
1654   const DebugLoc &DL = I.getDebugLoc();
1655 
1656   Register DstReg = I.getOperand(0).getReg();
1657   unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
1658   assert(Size <= 32 || Size == 64);
1659   const MachineOperand &CCOp = I.getOperand(1);
1660   Register CCReg = CCOp.getReg();
1661   if (!isVCC(CCReg, *MRI)) {
1662     unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
1663                                          AMDGPU::S_CSELECT_B32;
1664     MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
1665             .addReg(CCReg);
1666 
    // The generic constrainSelectedInstRegOperands doesn't work for the scc
    // register bank, because it doesn't cover the register class we use to
    // represent it. So we need to manually set the register class here.
1670     if (!MRI->getRegClassOrNull(CCReg))
1671         MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
1672     MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
1673             .add(I.getOperand(2))
1674             .add(I.getOperand(3));
1675 
1676     bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI) |
1677                constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
1678     I.eraseFromParent();
1679     return Ret;
1680   }
1681 
1682   // Wide VGPR select should have been split in RegBankSelect.
1683   if (Size > 32)
1684     return false;
1685 
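  // V_CNDMASK_B32 returns src1 when the condition is set, so the false value
  // (operand 3) goes in src0 and the true value (operand 2) in src1. The zero
  // immediates are the source modifiers.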
1686   MachineInstr *Select =
1687       BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1688               .addImm(0)
1689               .add(I.getOperand(3))
1690               .addImm(0)
1691               .add(I.getOperand(2))
1692               .add(I.getOperand(1));
1693 
1694   bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
1695   I.eraseFromParent();
1696   return Ret;
1697 }
1698 
1699 bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const {
1700   initM0(I);
1701   return selectImpl(I, *CoverageInfo);
1702 }
1703 
1704 static int sizeToSubRegIndex(unsigned Size) {
1705   switch (Size) {
1706   case 32:
1707     return AMDGPU::sub0;
1708   case 64:
1709     return AMDGPU::sub0_sub1;
1710   case 96:
1711     return AMDGPU::sub0_sub1_sub2;
1712   case 128:
1713     return AMDGPU::sub0_sub1_sub2_sub3;
1714   case 256:
1715     return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
1716   default:
1717     if (Size < 32)
1718       return AMDGPU::sub0;
1719     if (Size > 256)
1720       return -1;
1721     return sizeToSubRegIndex(PowerOf2Ceil(Size));
1722   }
1723 }
1724 
1725 bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
1726   Register DstReg = I.getOperand(0).getReg();
1727   Register SrcReg = I.getOperand(1).getReg();
1728   const LLT DstTy = MRI->getType(DstReg);
1729   const LLT SrcTy = MRI->getType(SrcReg);
1730   const LLT S1 = LLT::scalar(1);
1731 
1732   const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
1733   const RegisterBank *DstRB;
1734   if (DstTy == S1) {
1735     // This is a special case. We don't treat s1 for legalization artifacts as
1736     // vcc booleans.
1737     DstRB = SrcRB;
1738   } else {
1739     DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1740     if (SrcRB != DstRB)
1741       return false;
1742   }
1743 
1744   const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
1745 
1746   unsigned DstSize = DstTy.getSizeInBits();
1747   unsigned SrcSize = SrcTy.getSizeInBits();
1748 
1749   const TargetRegisterClass *SrcRC
1750     = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, *MRI);
1751   const TargetRegisterClass *DstRC
1752     = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, *MRI);
1753   if (!SrcRC || !DstRC)
1754     return false;
1755 
1756   if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
1757       !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
1758     LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
1759     return false;
1760   }
1761 
1762   if (DstTy == LLT::vector(2, 16) && SrcTy == LLT::vector(2, 32)) {
1763     MachineBasicBlock *MBB = I.getParent();
1764     const DebugLoc &DL = I.getDebugLoc();
1765 
1766     Register LoReg = MRI->createVirtualRegister(DstRC);
1767     Register HiReg = MRI->createVirtualRegister(DstRC);
1768     BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
1769       .addReg(SrcReg, 0, AMDGPU::sub0);
1770     BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
1771       .addReg(SrcReg, 0, AMDGPU::sub1);
1772 
1773     if (IsVALU && STI.hasSDWA()) {
1774       // Write the low 16-bits of the high element into the high 16-bits of the
1775       // low element.
1776       MachineInstr *MovSDWA =
1777         BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
1778         .addImm(0)                             // $src0_modifiers
1779         .addReg(HiReg)                         // $src0
1780         .addImm(0)                             // $clamp
1781         .addImm(AMDGPU::SDWA::WORD_1)          // $dst_sel
1782         .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
1783         .addImm(AMDGPU::SDWA::WORD_0)          // $src0_sel
1784         .addReg(LoReg, RegState::Implicit);
1785       MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
1786     } else {
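      // Without SDWA, pack the halves manually: shift the high element into
      // the high 16 bits, mask the low element, and OR them together.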
1787       Register TmpReg0 = MRI->createVirtualRegister(DstRC);
1788       Register TmpReg1 = MRI->createVirtualRegister(DstRC);
1789       Register ImmReg = MRI->createVirtualRegister(DstRC);
1790       if (IsVALU) {
1791         BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
1792           .addImm(16)
1793           .addReg(HiReg);
1794       } else {
1795         BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
1796           .addReg(HiReg)
1797           .addImm(16);
1798       }
1799 
1800       unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
1801       unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
1802       unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
1803 
1804       BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
1805         .addImm(0xffff);
1806       BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
1807         .addReg(LoReg)
1808         .addReg(ImmReg);
1809       BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
1810         .addReg(TmpReg0)
1811         .addReg(TmpReg1);
1812     }
1813 
1814     I.eraseFromParent();
1815     return true;
1816   }
1817 
1818   if (!DstTy.isScalar())
1819     return false;
1820 
1821   if (SrcSize > 32) {
1822     int SubRegIdx = sizeToSubRegIndex(DstSize);
1823     if (SubRegIdx == -1)
1824       return false;
1825 
1826     // Deal with weird cases where the class only partially supports the subreg
1827     // index.
1828     const TargetRegisterClass *SrcWithSubRC
1829       = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
1830     if (!SrcWithSubRC)
1831       return false;
1832 
1833     if (SrcWithSubRC != SrcRC) {
1834       if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
1835         return false;
1836     }
1837 
1838     I.getOperand(1).setSubReg(SubRegIdx);
1839   }
1840 
1841   I.setDesc(TII.get(TargetOpcode::COPY));
1842   return true;
1843 }
1844 
1845 /// \returns true if a bitmask for \p Size bits will be an inline immediate.
1846 static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
1847   Mask = maskTrailingOnes<unsigned>(Size);
1848   int SignedMask = static_cast<int>(Mask);
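  // Integer inline immediates cover the range [-16, 64].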
1849   return SignedMask >= -16 && SignedMask <= 64;
1850 }
1851 
1852 // Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
1853 const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
1854   Register Reg, const MachineRegisterInfo &MRI,
1855   const TargetRegisterInfo &TRI) const {
1856   const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
1857   if (auto *RB = RegClassOrBank.dyn_cast<const RegisterBank *>())
1858     return RB;
1859 
1860   // Ignore the type, since we don't use vcc in artifacts.
1861   if (auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>())
1862     return &RBI.getRegBankFromRegClass(*RC, LLT());
1863   return nullptr;
1864 }
1865 
1866 bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
1867   bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
1868   bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
1869   const DebugLoc &DL = I.getDebugLoc();
1870   MachineBasicBlock &MBB = *I.getParent();
1871   const Register DstReg = I.getOperand(0).getReg();
1872   const Register SrcReg = I.getOperand(1).getReg();
1873 
1874   const LLT DstTy = MRI->getType(DstReg);
1875   const LLT SrcTy = MRI->getType(SrcReg);
1876   const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
1877     I.getOperand(2).getImm() : SrcTy.getSizeInBits();
1878   const unsigned DstSize = DstTy.getSizeInBits();
1879   if (!DstTy.isScalar())
1880     return false;
1881 
1882   // Artifact casts should never use vcc.
1883   const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
1884 
1885   // FIXME: This should probably be illegal and split earlier.
1886   if (I.getOpcode() == AMDGPU::G_ANYEXT) {
1887     if (DstSize <= 32)
1888       return selectCOPY(I);
1889 
1890     const TargetRegisterClass *SrcRC =
1891         TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank, *MRI);
1892     const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1893     const TargetRegisterClass *DstRC =
1894         TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
1895 
1896     Register UndefReg = MRI->createVirtualRegister(SrcRC);
1897     BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
1898     BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
1899       .addReg(SrcReg)
1900       .addImm(AMDGPU::sub0)
1901       .addReg(UndefReg)
1902       .addImm(AMDGPU::sub1);
1903     I.eraseFromParent();
1904 
1905     return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
1906            RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
1907   }
1908 
1909   if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
    // A 64-bit source should have been split up in RegBankSelect.
1911 
1912     // Try to use an and with a mask if it will save code size.
1913     unsigned Mask;
1914     if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
1915       MachineInstr *ExtI =
1916       BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
1917         .addImm(Mask)
1918         .addReg(SrcReg);
1919       I.eraseFromParent();
1920       return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
1921     }
1922 
1923     const unsigned BFE = Signed ? AMDGPU::V_BFE_I32 : AMDGPU::V_BFE_U32;
1924     MachineInstr *ExtI =
1925       BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
1926       .addReg(SrcReg)
1927       .addImm(0) // Offset
1928       .addImm(SrcSize); // Width
1929     I.eraseFromParent();
1930     return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
1931   }
1932 
1933   if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
1934     const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
1935       AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
1936     if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
1937       return false;
1938 
1939     if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
1940       const unsigned SextOpc = SrcSize == 8 ?
1941         AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
1942       BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
1943         .addReg(SrcReg);
1944       I.eraseFromParent();
1945       return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
1946     }
1947 
1948     const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
1949     const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
1950 
    // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16] = width.
1952     if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
1953       // We need a 64-bit register source, but the high bits don't matter.
1954       Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
1955       Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1956       unsigned SubReg = InReg ? AMDGPU::sub0 : 0;
1957 
1958       BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
1959       BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
1960         .addReg(SrcReg, 0, SubReg)
1961         .addImm(AMDGPU::sub0)
1962         .addReg(UndefReg)
1963         .addImm(AMDGPU::sub1);
1964 
1965       BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
1966         .addReg(ExtReg)
1967         .addImm(SrcSize << 16);
1968 
1969       I.eraseFromParent();
1970       return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
1971     }
1972 
1973     unsigned Mask;
1974     if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
1975       BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
1976         .addReg(SrcReg)
1977         .addImm(Mask);
1978     } else {
1979       BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
1980         .addReg(SrcReg)
1981         .addImm(SrcSize << 16);
1982     }
1983 
1984     I.eraseFromParent();
1985     return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
1986   }
1987 
1988   return false;
1989 }
1990 
1991 bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
1992   MachineBasicBlock *BB = I.getParent();
1993   MachineOperand &ImmOp = I.getOperand(1);
1994 
1995   // The AMDGPU backend only supports Imm operands and not CImm or FPImm.
1996   if (ImmOp.isFPImm()) {
1997     const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt();
1998     ImmOp.ChangeToImmediate(Imm.getZExtValue());
1999   } else if (ImmOp.isCImm()) {
2000     ImmOp.ChangeToImmediate(ImmOp.getCImm()->getSExtValue());
2001   }
2002 
2003   Register DstReg = I.getOperand(0).getReg();
2004   unsigned Size;
2005   bool IsSgpr;
2006   const RegisterBank *RB = MRI->getRegBankOrNull(I.getOperand(0).getReg());
2007   if (RB) {
2008     IsSgpr = RB->getID() == AMDGPU::SGPRRegBankID;
2009     Size = MRI->getType(DstReg).getSizeInBits();
2010   } else {
2011     const TargetRegisterClass *RC = TRI.getRegClassForReg(*MRI, DstReg);
2012     IsSgpr = TRI.isSGPRClass(RC);
2013     Size = TRI.getRegSizeInBits(*RC);
2014   }
2015 
2016   if (Size != 32 && Size != 64)
2017     return false;
2018 
2019   unsigned Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
2020   if (Size == 32) {
2021     I.setDesc(TII.get(Opcode));
2022     I.addImplicitDefUseOperands(*MF);
2023     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2024   }
2025 
2026   const DebugLoc &DL = I.getDebugLoc();
2027 
2028   APInt Imm(Size, I.getOperand(1).getImm());
2029 
2030   MachineInstr *ResInst;
2031   if (IsSgpr && TII.isInlineConstant(Imm)) {
2032     ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
2033       .addImm(I.getOperand(1).getImm());
2034   } else {
2035     const TargetRegisterClass *RC = IsSgpr ?
2036       &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass;
2037     Register LoReg = MRI->createVirtualRegister(RC);
2038     Register HiReg = MRI->createVirtualRegister(RC);
2039 
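    // Materialize the constant in two 32-bit halves and combine them with a
    // REG_SEQUENCE.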
2040     BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg)
2041       .addImm(Imm.trunc(32).getZExtValue());
2042 
2043     BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg)
2044       .addImm(Imm.ashr(32).getZExtValue());
2045 
2046     ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2047       .addReg(LoReg)
2048       .addImm(AMDGPU::sub0)
2049       .addReg(HiReg)
2050       .addImm(AMDGPU::sub1);
2051   }
2052 
  // We can't call constrainSelectedInstRegOperands here, because it doesn't
  // work for target-independent opcodes.
2055   I.eraseFromParent();
2056   const TargetRegisterClass *DstRC =
2057     TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI);
2058   if (!DstRC)
2059     return true;
2060   return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI);
2061 }
2062 
2063 bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2064   // Only manually handle the f64 SGPR case.
2065   //
2066   // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2067   // the bit ops theoretically have a second result due to the implicit def of
2068   // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2069   // that is easy by disabling the check. The result works, but uses a
2070   // nonsensical sreg32orlds_and_sreg_1 regclass.
2071   //
  // The DAG emitter is more problematic, and incorrectly adds both results of
  // the S_XOR_B32 to the variadic REG_SEQUENCE operands.
2074 
2075   Register Dst = MI.getOperand(0).getReg();
2076   const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2077   if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2078       MRI->getType(Dst) != LLT::scalar(64))
2079     return false;
2080 
2081   Register Src = MI.getOperand(1).getReg();
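  // Fold an fneg(fabs(x)) source: look through the fabs and set the sign bit
  // unconditionally with S_OR_B32 below instead of toggling it.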
2082   MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
2083   if (Fabs)
2084     Src = Fabs->getOperand(1).getReg();
2085 
2086   if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2087       !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2088     return false;
2089 
2090   MachineBasicBlock *BB = MI.getParent();
2091   const DebugLoc &DL = MI.getDebugLoc();
2092   Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2093   Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2094   Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2095   Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2096 
2097   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2098     .addReg(Src, 0, AMDGPU::sub0);
2099   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2100     .addReg(Src, 0, AMDGPU::sub1);
2101   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2102     .addImm(0x80000000);
2103 
2104   // Set or toggle sign bit.
2105   unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2106   BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
2107     .addReg(HiReg)
2108     .addReg(ConstReg);
2109   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2110     .addReg(LoReg)
2111     .addImm(AMDGPU::sub0)
2112     .addReg(OpReg)
2113     .addImm(AMDGPU::sub1);
2114   MI.eraseFromParent();
2115   return true;
2116 }
2117 
2118 // FIXME: This is a workaround for the same tablegen problems as G_FNEG
2119 bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2120   Register Dst = MI.getOperand(0).getReg();
2121   const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2122   if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2123       MRI->getType(Dst) != LLT::scalar(64))
2124     return false;
2125 
2126   Register Src = MI.getOperand(1).getReg();
2127   MachineBasicBlock *BB = MI.getParent();
2128   const DebugLoc &DL = MI.getDebugLoc();
2129   Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2130   Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2131   Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2132   Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2133 
2134   if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2135       !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2136     return false;
2137 
2138   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2139     .addReg(Src, 0, AMDGPU::sub0);
2140   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2141     .addReg(Src, 0, AMDGPU::sub1);
2142   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2143     .addImm(0x7fffffff);
2144 
2145   // Clear sign bit.
  // TODO: Should this use S_BITSET0_*?
2147   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
2148     .addReg(HiReg)
2149     .addReg(ConstReg);
2150   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2151     .addReg(LoReg)
2152     .addImm(AMDGPU::sub0)
2153     .addReg(OpReg)
2154     .addImm(AMDGPU::sub1);
2155 
2156   MI.eraseFromParent();
2157   return true;
2158 }
2159 
2160 static bool isConstant(const MachineInstr &MI) {
2161   return MI.getOpcode() == TargetOpcode::G_CONSTANT;
2162 }
2163 
2164 void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
2165     const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
2166 
2167   const MachineInstr *PtrMI = MRI.getUniqueVRegDef(Load.getOperand(1).getReg());
2168 
2169   assert(PtrMI);
2170 
2171   if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
2172     return;
2173 
2174   GEPInfo GEPInfo(*PtrMI);
2175 
2176   for (unsigned i = 1; i != 3; ++i) {
2177     const MachineOperand &GEPOp = PtrMI->getOperand(i);
2178     const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
2179     assert(OpDef);
2180     if (i == 2 && isConstant(*OpDef)) {
2181       // TODO: Could handle constant base + variable offset, but a combine
2182       // probably should have commuted it.
2183       assert(GEPInfo.Imm == 0);
2184       GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
2185       continue;
2186     }
2187     const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
2188     if (OpBank->getID() == AMDGPU::SGPRRegBankID)
2189       GEPInfo.SgprParts.push_back(GEPOp.getReg());
2190     else
2191       GEPInfo.VgprParts.push_back(GEPOp.getReg());
2192   }
2193 
2194   AddrInfo.push_back(GEPInfo);
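  // Recurse on the base pointer to collect further G_PTR_ADDs in the chain.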
2195   getAddrModeInfo(*PtrMI, MRI, AddrInfo);
2196 }
2197 
2198 bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
2199   if (!MI.hasOneMemOperand())
2200     return false;
2201 
2202   const MachineMemOperand *MMO = *MI.memoperands_begin();
2203   const Value *Ptr = MMO->getValue();
2204 
2205   // UndefValue means this is a load of a kernel input.  These are uniform.
2206   // Sometimes LDS instructions have constant pointers.
2207   // If Ptr is null, then that means this mem operand contains a
2208   // PseudoSourceValue like GOT.
2209   if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
2210       isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
2211     return true;
2212 
2213   if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
2214     return true;
2215 
2216   const Instruction *I = dyn_cast<Instruction>(Ptr);
2217   return I && I->getMetadata("amdgpu.uniform");
2218 }
2219 
2220 bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
2221   for (const GEPInfo &GEPInfo : AddrInfo) {
2222     if (!GEPInfo.VgprParts.empty())
2223       return true;
2224   }
2225   return false;
2226 }
2227 
2228 void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
2229   MachineBasicBlock *BB = I.getParent();
2230 
2231   const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
2232   unsigned AS = PtrTy.getAddressSpace();
2233   if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
2234       STI.ldsRequiresM0Init()) {
    // If DS instructions require M0 initialization, insert it before selecting.
2236     BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2237       .addImm(-1);
2238   }
2239 }
2240 
2241 bool AMDGPUInstructionSelector::selectG_LOAD_ATOMICRMW(MachineInstr &I) const {
2242   initM0(I);
2243   return selectImpl(I, *CoverageInfo);
2244 }
2245 
2246 // TODO: No rtn optimization.
2247 bool AMDGPUInstructionSelector::selectG_AMDGPU_ATOMIC_CMPXCHG(
2248   MachineInstr &MI) const {
2249   Register PtrReg = MI.getOperand(1).getReg();
2250   const LLT PtrTy = MRI->getType(PtrReg);
2251   if (PtrTy.getAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
2252       STI.useFlatForGlobal())
2253     return selectImpl(MI, *CoverageInfo);
2254 
2255   Register DstReg = MI.getOperand(0).getReg();
2256   const LLT Ty = MRI->getType(DstReg);
2257   const bool Is64 = Ty.getSizeInBits() == 64;
2258   const unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
2259   Register TmpReg = MRI->createVirtualRegister(
2260     Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
2261 
2262   const DebugLoc &DL = MI.getDebugLoc();
2263   MachineBasicBlock *BB = MI.getParent();
2264 
2265   Register VAddr, RSrcReg, SOffset;
2266   int64_t Offset = 0;
2267 
2268   unsigned Opcode;
2269   if (selectMUBUFOffsetImpl(MI.getOperand(1), RSrcReg, SOffset, Offset)) {
2270     Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN :
2271                              AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN;
2272   } else if (selectMUBUFAddr64Impl(MI.getOperand(1), VAddr,
2273                                    RSrcReg, SOffset, Offset)) {
2274     Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN :
2275                     AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN;
2276   } else
2277     return selectImpl(MI, *CoverageInfo);
2278 
2279   auto MIB = BuildMI(*BB, &MI, DL, TII.get(Opcode), TmpReg)
2280     .addReg(MI.getOperand(2).getReg());
2281 
2282   if (VAddr)
2283     MIB.addReg(VAddr);
2284 
2285   MIB.addReg(RSrcReg);
2286   if (SOffset)
2287     MIB.addReg(SOffset);
2288   else
2289     MIB.addImm(0);
2290 
2291   MIB.addImm(Offset);
2292   MIB.addImm(0); // slc
2293   MIB.cloneMemRefs(MI);
2294 
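  // The original memory value is returned in the low half of the wide vdata
  // register, so copy that subregister into the result.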
2295   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), DstReg)
2296     .addReg(TmpReg, RegState::Kill, SubReg);
2297 
2298   MI.eraseFromParent();
2299 
2300   MRI->setRegClass(
2301     DstReg, Is64 ? &AMDGPU::VReg_64RegClass : &AMDGPU::VGPR_32RegClass);
2302   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2303 }
2304 
2305 bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
2306   MachineBasicBlock *BB = I.getParent();
2307   MachineOperand &CondOp = I.getOperand(0);
2308   Register CondReg = CondOp.getReg();
2309   const DebugLoc &DL = I.getDebugLoc();
2310 
2311   unsigned BrOpcode;
2312   Register CondPhysReg;
2313   const TargetRegisterClass *ConstrainRC;
2314 
2315   // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
2316   // whether the branch is uniform when selecting the instruction. In
2317   // GlobalISel, we should push that decision into RegBankSelect. Assume for now
2318   // RegBankSelect knows what it's doing if the branch condition is scc, even
2319   // though it currently does not.
2320   if (!isVCC(CondReg, *MRI)) {
2321     if (MRI->getType(CondReg) != LLT::scalar(32))
2322       return false;
2323 
2324     CondPhysReg = AMDGPU::SCC;
2325     BrOpcode = AMDGPU::S_CBRANCH_SCC1;
2326     ConstrainRC = &AMDGPU::SReg_32RegClass;
2327   } else {
2328     // FIXME: Do we have to insert an and with exec here, like in SelectionDAG?
    // We sort of know, based on the register bank, that a VCC producer ands
    // inactive lanes with 0. What if there was a logical operation with vcc
    // producers in different blocks/with different exec masks?
2332     // FIXME: Should scc->vcc copies and with exec?
2333     CondPhysReg = TRI.getVCC();
2334     BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
2335     ConstrainRC = TRI.getBoolRC();
2336   }
2337 
2338   if (!MRI->getRegClassOrNull(CondReg))
2339     MRI->setRegClass(CondReg, ConstrainRC);
2340 
2341   BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
2342     .addReg(CondReg);
2343   BuildMI(*BB, &I, DL, TII.get(BrOpcode))
2344     .addMBB(I.getOperand(1).getMBB());
2345 
2346   I.eraseFromParent();
2347   return true;
2348 }
2349 
2350 bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
2351   MachineInstr &I) const {
2352   Register DstReg = I.getOperand(0).getReg();
2353   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2354   const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2355   I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
2356   if (IsVGPR)
2357     I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
2358 
2359   return RBI.constrainGenericRegister(
2360     DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
2361 }
2362 
2363 bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
2364   Register DstReg = I.getOperand(0).getReg();
2365   Register SrcReg = I.getOperand(1).getReg();
2366   Register MaskReg = I.getOperand(2).getReg();
2367   LLT Ty = MRI->getType(DstReg);
2368   LLT MaskTy = MRI->getType(MaskReg);
2369 
2370   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2371   const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2372   const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
2373   const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2374   if (DstRB != SrcRB) // Should only happen for hand written MIR.
2375     return false;
2376 
2377   unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2378   const TargetRegisterClass &RegRC
2379     = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
2380 
2381   const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB,
2382                                                                   *MRI);
2383   const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB,
2384                                                                   *MRI);
2385   const TargetRegisterClass *MaskRC =
2386       TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB, *MRI);
2387 
2388   if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
2389       !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2390       !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
2391     return false;
2392 
2393   MachineBasicBlock *BB = I.getParent();
2394   const DebugLoc &DL = I.getDebugLoc();
2395   if (Ty.getSizeInBits() == 32) {
2396     assert(MaskTy.getSizeInBits() == 32 &&
2397            "ptrmask should have been narrowed during legalize");
2398 
2399     BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
2400       .addReg(SrcReg)
2401       .addReg(MaskReg);
2402     I.eraseFromParent();
2403     return true;
2404   }
2405 
2406   Register HiReg = MRI->createVirtualRegister(&RegRC);
2407   Register LoReg = MRI->createVirtualRegister(&RegRC);
2408 
2409   // Extract the subregisters from the source pointer.
2410   BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
2411     .addReg(SrcReg, 0, AMDGPU::sub0);
2412   BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
2413     .addReg(SrcReg, 0, AMDGPU::sub1);
2414 
2415   Register MaskedLo, MaskedHi;
2416 
2417   // Try to avoid emitting a bit operation when we only need to touch half of
2418   // the 64-bit pointer.
2419   APInt MaskOnes = KnownBits->getKnownOnes(MaskReg).zextOrSelf(64);
2420 
2421   const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
2422   const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
2423   if ((MaskOnes & MaskLo32) == MaskLo32) {
2424     // If all the bits in the low half are 1, we only need a copy for it.
2425     MaskedLo = LoReg;
2426   } else {
2427     // Extract the mask subregister and apply the and.
2428     Register MaskLo = MRI->createVirtualRegister(&RegRC);
2429     MaskedLo = MRI->createVirtualRegister(&RegRC);
2430 
2431     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
2432       .addReg(MaskReg, 0, AMDGPU::sub0);
2433     BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
2434       .addReg(LoReg)
2435       .addReg(MaskLo);
2436   }
2437 
2438   if ((MaskOnes & MaskHi32) == MaskHi32) {
2439     // If all the bits in the high half are 1, we only need a copy for it.
2440     MaskedHi = HiReg;
2441   } else {
2442     Register MaskHi = MRI->createVirtualRegister(&RegRC);
2443     MaskedHi = MRI->createVirtualRegister(&RegRC);
2444 
2445     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
2446       .addReg(MaskReg, 0, AMDGPU::sub1);
2447     BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
2448       .addReg(HiReg)
2449       .addReg(MaskHi);
2450   }
2451 
2452   BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2453     .addReg(MaskedLo)
2454     .addImm(AMDGPU::sub0)
2455     .addReg(MaskedHi)
2456     .addImm(AMDGPU::sub1);
2457   I.eraseFromParent();
2458   return true;
2459 }
2460 
2461 /// Return the register to use for the index value, and the subregister to use
2462 /// for the indirectly accessed register.
2463 static std::pair<Register, unsigned>
2464 computeIndirectRegIndex(MachineRegisterInfo &MRI,
2465                         const SIRegisterInfo &TRI,
2466                         const TargetRegisterClass *SuperRC,
2467                         Register IdxReg,
2468                         unsigned EltSize) {
2469   Register IdxBaseReg;
2470   int Offset;
2471   MachineInstr *Unused;
2472 
2473   std::tie(IdxBaseReg, Offset, Unused)
2474     = AMDGPU::getBaseWithConstantOffset(MRI, IdxReg);
2475   if (IdxBaseReg == AMDGPU::NoRegister) {
2476     // This will happen if the index is a known constant. This should ordinarily
2477     // be legalized out, but handle it as a register just in case.
2478     assert(Offset == 0);
2479     IdxBaseReg = IdxReg;
2480   }
2481 
2482   ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
2483 
2484   // Skip out of bounds offsets, or else we would end up using an undefined
2485   // register.
2486   if (static_cast<unsigned>(Offset) >= SubRegs.size())
2487     return std::make_pair(IdxReg, SubRegs[0]);
2488   return std::make_pair(IdxBaseReg, SubRegs[Offset]);
2489 }
2490 
2491 bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
2492   MachineInstr &MI) const {
2493   Register DstReg = MI.getOperand(0).getReg();
2494   Register SrcReg = MI.getOperand(1).getReg();
2495   Register IdxReg = MI.getOperand(2).getReg();
2496 
2497   LLT DstTy = MRI->getType(DstReg);
2498   LLT SrcTy = MRI->getType(SrcReg);
2499 
2500   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2501   const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2502   const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
2503 
  // The index must be scalar. If it wasn't, RegBankSelect should have moved
  // this into a waterfall loop.
2506   if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
2507     return false;
2508 
2509   const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB,
2510                                                                   *MRI);
2511   const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(DstTy, *DstRB,
2512                                                                   *MRI);
2513   if (!SrcRC || !DstRC)
2514     return false;
2515   if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2516       !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
2517       !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
2518     return false;
2519 
2520   MachineBasicBlock *BB = MI.getParent();
2521   const DebugLoc &DL = MI.getDebugLoc();
2522   const bool Is64 = DstTy.getSizeInBits() == 64;
2523 
2524   unsigned SubReg;
2525   std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, SrcRC, IdxReg,
2526                                                      DstTy.getSizeInBits() / 8);
2527 
2528   if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
2529     if (DstTy.getSizeInBits() != 32 && !Is64)
2530       return false;
2531 
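    // Indirect SGPR reads take the index in M0.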
2532     BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2533       .addReg(IdxReg);
2534 
2535     unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
2536     BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
2537       .addReg(SrcReg, 0, SubReg)
2538       .addReg(SrcReg, RegState::Implicit);
2539     MI.eraseFromParent();
2540     return true;
2541   }
2542 
2543   if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
2544     return false;
2545 
2546   if (!STI.useVGPRIndexMode()) {
2547     BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2548       .addReg(IdxReg);
2549     BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
2550       .addReg(SrcReg, 0, SubReg)
2551       .addReg(SrcReg, RegState::Implicit);
2552     MI.eraseFromParent();
2553     return true;
2554   }
2555 
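  // Use VGPR indexing mode: enable the index, read the selected element with
  // a plain V_MOV_B32, then turn the mode back off.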
2556   BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_ON))
2557     .addReg(IdxReg)
2558     .addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE);
2559   BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), DstReg)
2560     .addReg(SrcReg, 0, SubReg)
2561     .addReg(SrcReg, RegState::Implicit)
2562     .addReg(AMDGPU::M0, RegState::Implicit);
2563   BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_OFF));
2564 
2565   MI.eraseFromParent();
2566   return true;
2567 }
2568 
2569 // TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
2570 bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
2571   MachineInstr &MI) const {
2572   Register DstReg = MI.getOperand(0).getReg();
2573   Register VecReg = MI.getOperand(1).getReg();
2574   Register ValReg = MI.getOperand(2).getReg();
2575   Register IdxReg = MI.getOperand(3).getReg();
2576 
2577   LLT VecTy = MRI->getType(DstReg);
2578   LLT ValTy = MRI->getType(ValReg);
2579   unsigned VecSize = VecTy.getSizeInBits();
2580   unsigned ValSize = ValTy.getSizeInBits();
2581 
2582   const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
2583   const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
2584   const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
2585 
2586   assert(VecTy.getElementType() == ValTy);
2587 
  // The index must be scalar. If it wasn't, RegBankSelect should have moved
  // this into a waterfall loop.
2590   if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
2591     return false;
2592 
2593   const TargetRegisterClass *VecRC = TRI.getRegClassForTypeOnBank(VecTy, *VecRB,
2594                                                                   *MRI);
2595   const TargetRegisterClass *ValRC = TRI.getRegClassForTypeOnBank(ValTy, *ValRB,
2596                                                                   *MRI);
2597 
2598   if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
2599       !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
2600       !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
2601       !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
2602     return false;
2603 
2604   if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
2605     return false;
2606 
2607   unsigned SubReg;
2608   std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg,
2609                                                      ValSize / 8);
2610 
2611   const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
2612                          STI.useVGPRIndexMode();
2613 
2614   MachineBasicBlock *BB = MI.getParent();
2615   const DebugLoc &DL = MI.getDebugLoc();
2616 
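  // The index is routed either through the VGPR index mode or through M0.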
2617   if (IndexMode) {
2618     BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_ON))
2619       .addReg(IdxReg)
2620       .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE);
2621   } else {
2622     BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2623       .addReg(IdxReg);
2624   }
2625 
2626   const MCInstrDesc &RegWriteOp
2627     = TII.getIndirectRegWritePseudo(VecSize, ValSize,
2628                                     VecRB->getID() == AMDGPU::SGPRRegBankID);
2629   BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
2630     .addReg(VecReg)
2631     .addReg(ValReg)
2632     .addImm(SubReg);
2633 
2634   if (IndexMode)
2635     BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_OFF));
2636 
2637   MI.eraseFromParent();
2638   return true;
2639 }
2640 
2641 static bool isZeroOrUndef(int X) {
2642   return X == 0 || X == -1;
2643 }
2644 
2645 static bool isOneOrUndef(int X) {
2646   return X == 1 || X == -1;
2647 }
2648 
2649 static bool isZeroOrOneOrUndef(int X) {
2650   return X == 0 || X == 1 || X == -1;
2651 }
2652 
2653 // Normalize a VOP3P shuffle mask to refer to the low/high half of a single
2654 // 32-bit register.
2655 static Register normalizeVOP3PMask(int NewMask[2], Register Src0, Register Src1,
2656                                    ArrayRef<int> Mask) {
2657   NewMask[0] = Mask[0];
2658   NewMask[1] = Mask[1];
2659   if (isZeroOrOneOrUndef(Mask[0]) && isZeroOrOneOrUndef(Mask[1]))
2660     return Src0;
2661 
2662   assert(NewMask[0] == 2 || NewMask[0] == 3 || NewMask[0] == -1);
2663   assert(NewMask[1] == 2 || NewMask[1] == 3 || NewMask[1] == -1);
2664 
  // Remap the mask inputs from 2/3 to 0/1.
2666   NewMask[0] = NewMask[0] == -1 ? -1 : NewMask[0] - 2;
2667   NewMask[1] = NewMask[1] == -1 ? -1 : NewMask[1] - 2;
2668   return Src1;
2669 }
2670 
2671 // This is only legal with VOP3P instructions as an aid to op_sel matching.
2672 bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR(
2673   MachineInstr &MI) const {
2674   Register DstReg = MI.getOperand(0).getReg();
2675   Register Src0Reg = MI.getOperand(1).getReg();
2676   Register Src1Reg = MI.getOperand(2).getReg();
2677   ArrayRef<int> ShufMask = MI.getOperand(3).getShuffleMask();
2678 
2679   const LLT V2S16 = LLT::vector(2, 16);
2680   if (MRI->getType(DstReg) != V2S16 || MRI->getType(Src0Reg) != V2S16)
2681     return false;
2682 
2683   if (!AMDGPU::isLegalVOP3PShuffleMask(ShufMask))
2684     return false;
2685 
2686   assert(ShufMask.size() == 2);
2687   assert(STI.hasSDWA() && "no target has VOP3P but not SDWA");
2688 
2689   MachineBasicBlock *MBB = MI.getParent();
2690   const DebugLoc &DL = MI.getDebugLoc();
2691 
2692   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2693   const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2694   const TargetRegisterClass &RC = IsVALU ?
2695     AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
2696 
2697   // Handle the degenerate case which should have folded out.
2698   if (ShufMask[0] == -1 && ShufMask[1] == -1) {
2699     BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), DstReg);
2700 
2701     MI.eraseFromParent();
2702     return RBI.constrainGenericRegister(DstReg, RC, *MRI);
2703   }
2704 
2705   // A legal VOP3P mask only reads one of the sources.
2706   int Mask[2];
2707   Register SrcVec = normalizeVOP3PMask(Mask, Src0Reg, Src1Reg, ShufMask);
2708 
2709   if (!RBI.constrainGenericRegister(DstReg, RC, *MRI) ||
2710       !RBI.constrainGenericRegister(SrcVec, RC, *MRI))
2711     return false;
2712 
2713   // TODO: This also should have been folded out
2714   if (isZeroOrUndef(Mask[0]) && isOneOrUndef(Mask[1])) {
2715     BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), DstReg)
2716       .addReg(SrcVec);
2717 
2718     MI.eraseFromParent();
2719     return true;
2720   }
2721 
2722   if (Mask[0] == 1 && Mask[1] == -1) {
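    // <1, u>: extract the high half by shifting it down into the low half.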
2723     if (IsVALU) {
2724       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
2725         .addImm(16)
2726         .addReg(SrcVec);
2727     } else {
2728       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
2729         .addReg(SrcVec)
2730         .addImm(16);
2731     }
2732   } else if (Mask[0] == -1 && Mask[1] == 0) {
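    // <u, 0>: move the low half up into the high half with a left shift by 16.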
2733     if (IsVALU) {
2734       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), DstReg)
2735         .addImm(16)
2736         .addReg(SrcVec);
2737     } else {
2738       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHL_B32), DstReg)
2739         .addReg(SrcVec)
2740         .addImm(16);
2741     }
2742   } else if (Mask[0] == 0 && Mask[1] == 0) {
2743     if (IsVALU) {
2744       // Write low half of the register into the high half.
2745       MachineInstr *MovSDWA =
2746         BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2747         .addImm(0)                             // $src0_modifiers
2748         .addReg(SrcVec)                        // $src0
2749         .addImm(0)                             // $clamp
2750         .addImm(AMDGPU::SDWA::WORD_1)          // $dst_sel
2751         .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2752         .addImm(AMDGPU::SDWA::WORD_0)          // $src0_sel
2753         .addReg(SrcVec, RegState::Implicit);
2754       MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2755     } else {
2756       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg)
2757         .addReg(SrcVec)
2758         .addReg(SrcVec);
2759     }
2760   } else if (Mask[0] == 1 && Mask[1] == 1) {
2761     if (IsVALU) {
2762       // Write high half of the register into the low half.
2763       MachineInstr *MovSDWA =
2764         BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2765         .addImm(0)                             // $src0_modifiers
2766         .addReg(SrcVec)                        // $src0
2767         .addImm(0)                             // $clamp
2768         .addImm(AMDGPU::SDWA::WORD_0)          // $dst_sel
2769         .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2770         .addImm(AMDGPU::SDWA::WORD_1)          // $src0_sel
2771         .addReg(SrcVec, RegState::Implicit);
2772       MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2773     } else {
2774       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_HH_B32_B16), DstReg)
2775         .addReg(SrcVec)
2776         .addReg(SrcVec);
2777     }
2778   } else if (Mask[0] == 1 && Mask[1] == 0) {
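    // <1, 0>: swap the halves. On the VALU path, an alignbit of the register
    // with itself by 16 is a rotate that swaps the halves.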
2779     if (IsVALU) {
2780       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_ALIGNBIT_B32), DstReg)
2781         .addReg(SrcVec)
2782         .addReg(SrcVec)
2783         .addImm(16);
2784     } else {
2785       Register TmpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2786       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg)
2787         .addReg(SrcVec)
2788         .addImm(16);
2789       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg)
2790         .addReg(TmpReg)
2791         .addReg(SrcVec);
2792     }
2793   } else
2794     llvm_unreachable("all shuffle masks should be handled");
2795 
2796   MI.eraseFromParent();
2797   return true;
2798 }
2799 
2800 bool AMDGPUInstructionSelector::select(MachineInstr &I) {
2801   if (I.isPHI())
2802     return selectPHI(I);
2803 
2804   if (!I.isPreISelOpcode()) {
2805     if (I.isCopy())
2806       return selectCOPY(I);
2807     return true;
2808   }
2809 
2810   switch (I.getOpcode()) {
2811   case TargetOpcode::G_AND:
2812   case TargetOpcode::G_OR:
2813   case TargetOpcode::G_XOR:
2814     if (selectImpl(I, *CoverageInfo))
2815       return true;
2816     return selectG_AND_OR_XOR(I);
2817   case TargetOpcode::G_ADD:
2818   case TargetOpcode::G_SUB:
2819     if (selectImpl(I, *CoverageInfo))
2820       return true;
2821     return selectG_ADD_SUB(I);
2822   case TargetOpcode::G_UADDO:
2823   case TargetOpcode::G_USUBO:
2824   case TargetOpcode::G_UADDE:
2825   case TargetOpcode::G_USUBE:
2826     return selectG_UADDO_USUBO_UADDE_USUBE(I);
2827   case TargetOpcode::G_INTTOPTR:
2828   case TargetOpcode::G_BITCAST:
2829   case TargetOpcode::G_PTRTOINT:
2830     return selectCOPY(I);
2831   case TargetOpcode::G_CONSTANT:
2832   case TargetOpcode::G_FCONSTANT:
2833     return selectG_CONSTANT(I);
2834   case TargetOpcode::G_FNEG:
2835     if (selectImpl(I, *CoverageInfo))
2836       return true;
2837     return selectG_FNEG(I);
2838   case TargetOpcode::G_FABS:
2839     if (selectImpl(I, *CoverageInfo))
2840       return true;
2841     return selectG_FABS(I);
2842   case TargetOpcode::G_EXTRACT:
2843     return selectG_EXTRACT(I);
2844   case TargetOpcode::G_MERGE_VALUES:
2845   case TargetOpcode::G_BUILD_VECTOR:
2846   case TargetOpcode::G_CONCAT_VECTORS:
2847     return selectG_MERGE_VALUES(I);
2848   case TargetOpcode::G_UNMERGE_VALUES:
2849     return selectG_UNMERGE_VALUES(I);
2850   case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2851     return selectG_BUILD_VECTOR_TRUNC(I);
2852   case TargetOpcode::G_PTR_ADD:
2853     return selectG_PTR_ADD(I);
2854   case TargetOpcode::G_IMPLICIT_DEF:
2855     return selectG_IMPLICIT_DEF(I);
2856   case TargetOpcode::G_FREEZE:
2857     return selectCOPY(I);
2858   case TargetOpcode::G_INSERT:
2859     return selectG_INSERT(I);
2860   case TargetOpcode::G_INTRINSIC:
2861     return selectG_INTRINSIC(I);
2862   case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
2863     return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
2864   case TargetOpcode::G_ICMP:
2865     if (selectG_ICMP(I))
2866       return true;
2867     return selectImpl(I, *CoverageInfo);
2868   case TargetOpcode::G_LOAD:
2869   case TargetOpcode::G_ATOMIC_CMPXCHG:
2870   case TargetOpcode::G_ATOMICRMW_XCHG:
2871   case TargetOpcode::G_ATOMICRMW_ADD:
2872   case TargetOpcode::G_ATOMICRMW_SUB:
2873   case TargetOpcode::G_ATOMICRMW_AND:
2874   case TargetOpcode::G_ATOMICRMW_OR:
2875   case TargetOpcode::G_ATOMICRMW_XOR:
2876   case TargetOpcode::G_ATOMICRMW_MIN:
2877   case TargetOpcode::G_ATOMICRMW_MAX:
2878   case TargetOpcode::G_ATOMICRMW_UMIN:
2879   case TargetOpcode::G_ATOMICRMW_UMAX:
2880   case TargetOpcode::G_ATOMICRMW_FADD:
2881   case AMDGPU::G_AMDGPU_ATOMIC_INC:
2882   case AMDGPU::G_AMDGPU_ATOMIC_DEC:
2883   case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
2884   case AMDGPU::G_AMDGPU_ATOMIC_FMAX:
2885     return selectG_LOAD_ATOMICRMW(I);
2886   case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
2887     return selectG_AMDGPU_ATOMIC_CMPXCHG(I);
2888   case TargetOpcode::G_SELECT:
2889     return selectG_SELECT(I);
2890   case TargetOpcode::G_STORE:
2891     return selectG_STORE(I);
2892   case TargetOpcode::G_TRUNC:
2893     return selectG_TRUNC(I);
2894   case TargetOpcode::G_SEXT:
2895   case TargetOpcode::G_ZEXT:
2896   case TargetOpcode::G_ANYEXT:
2897   case TargetOpcode::G_SEXT_INREG:
2898     if (selectImpl(I, *CoverageInfo))
2899       return true;
2900     return selectG_SZA_EXT(I);
2901   case TargetOpcode::G_BRCOND:
2902     return selectG_BRCOND(I);
2903   case TargetOpcode::G_GLOBAL_VALUE:
2904     return selectG_GLOBAL_VALUE(I);
2905   case TargetOpcode::G_PTRMASK:
2906     return selectG_PTRMASK(I);
2907   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2908     return selectG_EXTRACT_VECTOR_ELT(I);
2909   case TargetOpcode::G_INSERT_VECTOR_ELT:
2910     return selectG_INSERT_VECTOR_ELT(I);
2911   case TargetOpcode::G_SHUFFLE_VECTOR:
2912     return selectG_SHUFFLE_VECTOR(I);
2913   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
2914   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
2915     const AMDGPU::ImageDimIntrinsicInfo *Intr
2916       = AMDGPU::getImageDimIntrinsicInfo(I.getIntrinsicID());
2917     assert(Intr && "not an image intrinsic with image pseudo");
2918     return selectImageIntrinsic(I, Intr);
2919   }
2920   default:
2921     return selectImpl(I, *CoverageInfo);
2922   }
2923   return false;
2924 }
2925 
2926 InstructionSelector::ComplexRendererFns
2927 AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
2928   return {{
2929       [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
2930   }};
}
2933 
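// Fold source modifiers for a VOP3 operand: look through G_FNEG and G_FABS
// (and intervening copies) and accumulate the corresponding SISrcMods flags.
// For example, a source defined by %s = G_FNEG (G_FABS %x) is selected as %x
// with NEG | ABS. If the folded source turns out to be an SGPR, a copy to a
// VGPR is inserted to avoid violating the constant bus restriction.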
2934 std::pair<Register, unsigned>
2935 AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root) const {
2936   Register Src = Root.getReg();
2937   Register OrigSrc = Src;
2938   unsigned Mods = 0;
2939   MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
2940 
2941   if (MI && MI->getOpcode() == AMDGPU::G_FNEG) {
2942     Src = MI->getOperand(1).getReg();
2943     Mods |= SISrcMods::NEG;
2944     MI = getDefIgnoringCopies(Src, *MRI);
2945   }
2946 
2947   if (MI && MI->getOpcode() == AMDGPU::G_FABS) {
2948     Src = MI->getOperand(1).getReg();
2949     Mods |= SISrcMods::ABS;
2950   }
2951 
2952   if (Mods != 0 &&
2953       RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
2954     MachineInstr *UseMI = Root.getParent();
2955 
2956     // If we looked through copies to find source modifiers on an SGPR operand,
2957     // we now have an SGPR register source. To avoid potentially violating the
2958     // constant bus restriction, we need to insert a copy to a VGPR.
2959     Register VGPRSrc = MRI->cloneVirtualRegister(OrigSrc);
2960     BuildMI(*UseMI->getParent(), UseMI, UseMI->getDebugLoc(),
2961             TII.get(AMDGPU::COPY), VGPRSrc)
2962       .addReg(Src);
2963     Src = VGPRSrc;
2964   }
2965 
2966   return std::make_pair(Src, Mods);
2967 }
2968 
2970 /// This will select either an SGPR or VGPR operand and will save us from
2971 /// having to write an extra tablegen pattern.
2972 InstructionSelector::ComplexRendererFns
2973 AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
2974   return {{
2975       [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
2976   }};
2977 }
2978 
2979 InstructionSelector::ComplexRendererFns
2980 AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
2981   Register Src;
2982   unsigned Mods;
2983   std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
2984 
2985   return {{
2986       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
2987       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
2988       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },    // clamp
2989       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }     // omod
2990   }};
2991 }
2992 
2993 InstructionSelector::ComplexRendererFns
2994 AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
2995   return {{
2996       [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
2997       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
2998       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // omod
2999   }};
3000 }
3001 
3002 InstructionSelector::ComplexRendererFns
3003 AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
3004   Register Src;
3005   unsigned Mods;
3006   std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3007 
3008   return {{
3009       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3010       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }  // src_mods
3011   }};
3012 }
3013 
3014 InstructionSelector::ComplexRendererFns
3015 AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
3016   Register Reg = Root.getReg();
3017   const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3018   if (Def && (Def->getOpcode() == AMDGPU::G_FNEG ||
3019               Def->getOpcode() == AMDGPU::G_FABS))
3020     return {};
3021   return {{
3022       [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3023   }};
3024 }
3025 
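// Packed (VOP3P) source modifiers: a G_FNEG of a whole <2 x s16> source
// toggles both NEG and NEG_HI. op_sel_hi (OP_SEL_1) is always set, which is
// presumably the standard packed behavior of taking the high half of the
// operation from the high 16 bits of the source.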
3026 std::pair<Register, unsigned>
3027 AMDGPUInstructionSelector::selectVOP3PModsImpl(
3028   Register Src, const MachineRegisterInfo &MRI) const {
3029   unsigned Mods = 0;
3030   MachineInstr *MI = MRI.getVRegDef(Src);
3031 
3032   if (MI && MI->getOpcode() == AMDGPU::G_FNEG &&
3033       // It's possible to see an f32 fneg here, but unlikely.
3034       // TODO: Treat f32 fneg as only high bit.
3035       MRI.getType(Src) == LLT::vector(2, 16)) {
3036     Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
3037     Src = MI->getOperand(1).getReg();
3038     MI = MRI.getVRegDef(Src);
3039   }
3040 
3041   // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
3042 
3043   // Packed instructions do not have abs modifiers.
3044   Mods |= SISrcMods::OP_SEL_1;
3045 
3046   return std::make_pair(Src, Mods);
3047 }
3048 
3049 InstructionSelector::ComplexRendererFns
3050 AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
3051   MachineRegisterInfo &MRI
3052     = Root.getParent()->getParent()->getParent()->getRegInfo();
3053 
3054   Register Src;
3055   unsigned Mods;
3056   std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI);
3057 
3058   return {{
3059       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3060       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }  // src_mods
3061   }};
3062 }
3063 
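// Like selectVOP3Mods, but only matches when the source is known not to be a
// NaN (or when NoNaNsFPMath is enabled).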
3064 InstructionSelector::ComplexRendererFns
3065 AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const {
3066   Register Src;
3067   unsigned Mods;
3068   std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3069   if (!TM.Options.NoNaNsFPMath && !isKnownNeverNaN(Src, *MRI))
3070     return None;
3071 
3072   return {{
3073       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3074       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }  // src_mods
3075   }};
3076 }
3077 
3078 InstructionSelector::ComplexRendererFns
3079 AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
3080   // FIXME: Handle op_sel
3081   return {{
3082       [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
3083       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // src_mods
3084   }};
3085 }
3086 
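// SMRD addressing: fold a (uniform base + constant offset) address into the
// scalar load's immediate offset field when the constant is encodable by the
// subtarget's SMRD offset encoding.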
3087 InstructionSelector::ComplexRendererFns
3088 AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
3089   SmallVector<GEPInfo, 4> AddrInfo;
3090   getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
3091 
3092   if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3093     return None;
3094 
3095   const GEPInfo &GEPInfo = AddrInfo[0];
3096   Optional<int64_t> EncodedImm =
3097       AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm, false);
3098   if (!EncodedImm)
3099     return None;
3100 
  Register PtrReg = GEPInfo.SgprParts[0];
3102   return {{
3103     [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3104     [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
3105   }};
3106 }
3107 
3108 InstructionSelector::ComplexRendererFns
3109 AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
3110   SmallVector<GEPInfo, 4> AddrInfo;
3111   getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
3112 
3113   if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3114     return None;
3115 
3116   const GEPInfo &GEPInfo = AddrInfo[0];
3117   Register PtrReg = GEPInfo.SgprParts[0];
3118   Optional<int64_t> EncodedImm =
3119       AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
3120   if (!EncodedImm)
3121     return None;
3122 
3123   return {{
3124     [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3125     [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
3126   }};
3127 }
3128 
3129 InstructionSelector::ComplexRendererFns
3130 AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
3131   MachineInstr *MI = Root.getParent();
3132   MachineBasicBlock *MBB = MI->getParent();
3133 
3134   SmallVector<GEPInfo, 4> AddrInfo;
3135   getAddrModeInfo(*MI, *MRI, AddrInfo);
3136 
  // FIXME: We should shrink the GEP if the offset is known to fit in 32 bits,
  // then we could select all ptr + 32-bit offsets, not just immediate offsets.
3139   if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3140     return None;
3141 
3142   const GEPInfo &GEPInfo = AddrInfo[0];
3143   // SGPR offset is unsigned.
3144   if (!GEPInfo.Imm || GEPInfo.Imm < 0 || !isUInt<32>(GEPInfo.Imm))
3145     return None;
3146 
  // If we make it this far we have a load with a 32-bit immediate offset.
  // It is OK to select this using an SGPR offset, because we have already
  // failed trying to select this load into one of the _IMM variants since
  // the _IMM patterns are considered before the _SGPR patterns.
3151   Register PtrReg = GEPInfo.SgprParts[0];
3152   Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3153   BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg)
3154           .addImm(GEPInfo.Imm);
3155   return {{
3156     [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3157     [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); }
3158   }};
3159 }
3160 
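// FLAT addressing: if the address is a G_PTR_ADD with a constant that is a
// legal FLAT offset for the access's address space, select (base, offset);
// otherwise fall back to the full pointer with a zero offset.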
3161 template <bool Signed>
3162 InstructionSelector::ComplexRendererFns
3163 AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const {
3164   MachineInstr *MI = Root.getParent();
3165 
3166   InstructionSelector::ComplexRendererFns Default = {{
3167       [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
3168       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },  // offset
3169       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // slc
3170     }};
3171 
3172   if (!STI.hasFlatInstOffsets())
3173     return Default;
3174 
3175   const MachineInstr *OpDef = MRI->getVRegDef(Root.getReg());
3176   if (!OpDef || OpDef->getOpcode() != AMDGPU::G_PTR_ADD)
3177     return Default;
3178 
3179   Optional<int64_t> Offset =
3180     getConstantVRegVal(OpDef->getOperand(2).getReg(), *MRI);
3181   if (!Offset.hasValue())
3182     return Default;
3183 
3184   unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
3185   if (!TII.isLegalFLATOffset(Offset.getValue(), AddrSpace, Signed))
3186     return Default;
3187 
3188   Register BasePtr = OpDef->getOperand(1).getReg();
3189 
3190   return {{
3191       [=](MachineInstrBuilder &MIB) { MIB.addReg(BasePtr); },
3192       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset.getValue()); },
3193       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // slc
3194     }};
3195 }
3196 
3197 InstructionSelector::ComplexRendererFns
3198 AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
3199   return selectFlatOffsetImpl<false>(Root);
3200 }
3201 
3202 InstructionSelector::ComplexRendererFns
3203 AMDGPUInstructionSelector::selectFlatOffsetSigned(MachineOperand &Root) const {
3204   return selectFlatOffsetImpl<true>(Root);
3205 }
3206 
3207 static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) {
3208   auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>();
3209   return PSV && PSV->isStack();
3210 }
3211 
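// MUBUF scratch (offen) addressing. For an absolute constant address, the
// low 12 bits go in the immediate offset field and the remaining high bits
// are materialized into a VGPR used as vaddr. Otherwise, try to fold a frame
// index and/or a legal immediate offset out of a (base + constant) address,
// using the scratch resource descriptor as rsrc.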
3212 InstructionSelector::ComplexRendererFns
3213 AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
3214   MachineInstr *MI = Root.getParent();
3215   MachineBasicBlock *MBB = MI->getParent();
3216   MachineFunction *MF = MBB->getParent();
3217   const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3218 
3219   int64_t Offset = 0;
3220   if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
3221       Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) {
3222     Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3223 
3224     // TODO: Should this be inside the render function? The iterator seems to
3225     // move.
3226     BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
3227             HighBits)
3228       .addImm(Offset & ~4095);
3229 
3230     return {{[=](MachineInstrBuilder &MIB) { // rsrc
3231                MIB.addReg(Info->getScratchRSrcReg());
3232              },
3233              [=](MachineInstrBuilder &MIB) { // vaddr
3234                MIB.addReg(HighBits);
3235              },
3236              [=](MachineInstrBuilder &MIB) { // soffset
3237                const MachineMemOperand *MMO = *MI->memoperands_begin();
3238                const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();
3239 
3240                if (isStackPtrRelative(PtrInfo))
3241                  MIB.addReg(Info->getStackPtrOffsetReg());
3242                else
3243                  MIB.addImm(0);
3244              },
3245              [=](MachineInstrBuilder &MIB) { // offset
3246                MIB.addImm(Offset & 4095);
3247              }}};
3248   }
3249 
3250   assert(Offset == 0 || Offset == -1);
3251 
3252   // Try to fold a frame index directly into the MUBUF vaddr field, and any
3253   // offsets.
3254   Optional<int> FI;
3255   Register VAddr = Root.getReg();
3256   if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) {
3257     if (isBaseWithConstantOffset(Root, *MRI)) {
3258       const MachineOperand &LHS = RootDef->getOperand(1);
3259       const MachineOperand &RHS = RootDef->getOperand(2);
3260       const MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg());
3261       const MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg());
3262       if (LHSDef && RHSDef) {
3263         int64_t PossibleOffset =
3264             RHSDef->getOperand(1).getCImm()->getSExtValue();
3265         if (SIInstrInfo::isLegalMUBUFImmOffset(PossibleOffset) &&
3266             (!STI.privateMemoryResourceIsRangeChecked() ||
3267              KnownBits->signBitIsZero(LHS.getReg()))) {
3268           if (LHSDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
3269             FI = LHSDef->getOperand(1).getIndex();
3270           else
3271             VAddr = LHS.getReg();
3272           Offset = PossibleOffset;
3273         }
3274       }
3275     } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
3276       FI = RootDef->getOperand(1).getIndex();
3277     }
3278   }
3279 
3280   return {{[=](MachineInstrBuilder &MIB) { // rsrc
3281              MIB.addReg(Info->getScratchRSrcReg());
3282            },
3283            [=](MachineInstrBuilder &MIB) { // vaddr
3284              if (FI.hasValue())
3285                MIB.addFrameIndex(FI.getValue());
3286              else
3287                MIB.addReg(VAddr);
3288            },
3289            [=](MachineInstrBuilder &MIB) { // soffset
3290              // If we don't know this private access is a local stack object, it
3291              // needs to be relative to the entry point's scratch wave offset.
3292              // TODO: Should split large offsets that don't fit like above.
3293              // TODO: Don't use scratch wave offset just because the offset
3294              // didn't fit.
3295              if (!Info->isEntryFunction() && FI.hasValue())
3296                MIB.addReg(Info->getStackPtrOffsetReg());
3297              else
3298                MIB.addImm(0);
3299            },
3300            [=](MachineInstrBuilder &MIB) { // offset
3301              MIB.addImm(Offset);
3302            }}};
3303 }
3304 
3305 bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
3306                                                 int64_t Offset,
3307                                                 unsigned OffsetBits) const {
3308   if ((OffsetBits == 16 && !isUInt<16>(Offset)) ||
3309       (OffsetBits == 8 && !isUInt<8>(Offset)))
3310     return false;
3311 
3312   if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
3313     return true;
3314 
  // On Southern Islands, instructions with a negative base value and an offset
  // don't seem to work.
3317   return KnownBits->signBitIsZero(Base);
3318 }
3319 
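// MUBUF scratch addressing with no vaddr: only applies when the entire
// address is a constant that fits the 12-bit MUBUF immediate offset field.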
3320 InstructionSelector::ComplexRendererFns
3321 AMDGPUInstructionSelector::selectMUBUFScratchOffset(
3322     MachineOperand &Root) const {
3323   MachineInstr *MI = Root.getParent();
3324   MachineBasicBlock *MBB = MI->getParent();
3325 
3326   int64_t Offset = 0;
3327   if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
3328       !SIInstrInfo::isLegalMUBUFImmOffset(Offset))
3329     return {};
3330 
3331   const MachineFunction *MF = MBB->getParent();
3332   const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3333   const MachineMemOperand *MMO = *MI->memoperands_begin();
3334   const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();
3335 
3336   return {{
3337       [=](MachineInstrBuilder &MIB) { // rsrc
3338         MIB.addReg(Info->getScratchRSrcReg());
3339       },
3340       [=](MachineInstrBuilder &MIB) { // soffset
3341         if (isStackPtrRelative(PtrInfo))
3342           MIB.addReg(Info->getStackPtrOffsetReg());
3343         else
3344           MIB.addImm(0);
3345       },
3346       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
3347   }};
3348 }
3349 
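// DS addressing with a single 16-bit offset: fold (ptr_add base, const) into
// base plus immediate when the constant fits the 16-bit DS offset field.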
3350 std::pair<Register, unsigned>
3351 AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
3352   const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
3353   if (!RootDef)
3354     return std::make_pair(Root.getReg(), 0);
3355 
3356   int64_t ConstAddr = 0;
3357 
3358   Register PtrBase;
3359   int64_t Offset;
3360   std::tie(PtrBase, Offset) =
3361     getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
3362 
3363   if (Offset) {
3364     if (isDSOffsetLegal(PtrBase, Offset, 16)) {
3365       // (add n0, c0)
3366       return std::make_pair(PtrBase, Offset);
3367     }
  } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
    // TODO: Handle the G_SUB case.
  } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
    // TODO: Handle a constant address.
  }
3376 
3377   return std::make_pair(Root.getReg(), 0);
3378 }
3379 
3380 InstructionSelector::ComplexRendererFns
3381 AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
3382   Register Reg;
3383   unsigned Offset;
3384   std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
3385   return {{
3386       [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3387       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
3388     }};
3389 }
3390 
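// DS addressing with two 8-bit, dword-scaled offsets, e.g. for ds_read2 /
// ds_write2 style operands: the renderer emits Offset and Offset + 1, and the
// Impl below divides the byte offset by 4 and checks that the second dword
// offset still fits in 8 bits.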
3391 InstructionSelector::ComplexRendererFns
3392 AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
3393   Register Reg;
3394   unsigned Offset;
3395   std::tie(Reg, Offset) = selectDS64Bit4ByteAlignedImpl(Root);
3396   return {{
3397       [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3398       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
3399       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
3400     }};
3401 }
3402 
3403 std::pair<Register, unsigned>
3404 AMDGPUInstructionSelector::selectDS64Bit4ByteAlignedImpl(MachineOperand &Root) const {
3405   const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
3406   if (!RootDef)
3407     return std::make_pair(Root.getReg(), 0);
3408 
3409   int64_t ConstAddr = 0;
3410 
3411   Register PtrBase;
3412   int64_t Offset;
3413   std::tie(PtrBase, Offset) =
3414     getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
3415 
3416   if (Offset) {
3417     int64_t DWordOffset0 = Offset / 4;
3418     int64_t DWordOffset1 = DWordOffset0 + 1;
3419     if (isDSOffsetLegal(PtrBase, DWordOffset1, 8)) {
3420       // (add n0, c0)
3421       return std::make_pair(PtrBase, DWordOffset0);
3422     }
  } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
    // TODO: Handle the G_SUB case.
  } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
    // TODO: Handle a constant address.
  }
3430 
3431   return std::make_pair(Root.getReg(), 0);
3432 }
3433 
3434 /// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
3435 /// the base value with the constant offset. There may be intervening copies
3436 /// between \p Root and the identified constant. Returns \p Root, 0 if this does
3437 /// not match the pattern.
3438 std::pair<Register, int64_t>
3439 AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
3440   Register Root, const MachineRegisterInfo &MRI) const {
3441   MachineInstr *RootI = MRI.getVRegDef(Root);
3442   if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
3443     return {Root, 0};
3444 
3445   MachineOperand &RHS = RootI->getOperand(2);
3446   Optional<ValueAndVReg> MaybeOffset
3447     = getConstantVRegValWithLookThrough(RHS.getReg(), MRI, true);
3448   if (!MaybeOffset)
3449     return {Root, 0};
3450   return {RootI->getOperand(1).getReg(), MaybeOffset->Value};
3451 }
3452 
3453 static void addZeroImm(MachineInstrBuilder &MIB) {
3454   MIB.addImm(0);
3455 }
3456 
3457 /// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
3458 /// BasePtr is not valid, a null base pointer will be used.
3459 static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
3460                           uint32_t FormatLo, uint32_t FormatHi,
3461                           Register BasePtr) {
3462   Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
3463   Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
3464   Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3465   Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
3466 
3467   B.buildInstr(AMDGPU::S_MOV_B32)
3468     .addDef(RSrc2)
3469     .addImm(FormatLo);
3470   B.buildInstr(AMDGPU::S_MOV_B32)
3471     .addDef(RSrc3)
3472     .addImm(FormatHi);
3473 
  // Build the subregister half that holds the constants before building the
  // full 128-bit register. If we are building multiple resource descriptors,
  // this will allow CSEing of the 2-component register.
3477   B.buildInstr(AMDGPU::REG_SEQUENCE)
3478     .addDef(RSrcHi)
3479     .addReg(RSrc2)
3480     .addImm(AMDGPU::sub0)
3481     .addReg(RSrc3)
3482     .addImm(AMDGPU::sub1);
3483 
3484   Register RSrcLo = BasePtr;
3485   if (!BasePtr) {
3486     RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3487     B.buildInstr(AMDGPU::S_MOV_B64)
3488       .addDef(RSrcLo)
3489       .addImm(0);
3490   }
3491 
3492   B.buildInstr(AMDGPU::REG_SEQUENCE)
3493     .addDef(RSrc)
3494     .addReg(RSrcLo)
3495     .addImm(AMDGPU::sub0_sub1)
3496     .addReg(RSrcHi)
3497     .addImm(AMDGPU::sub2_sub3);
3498 
3499   return RSrc;
3500 }
3501 
3502 static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
3503                                 const SIInstrInfo &TII, Register BasePtr) {
3504   uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
3505 
3506   // FIXME: Why are half the "default" bits ignored based on the addressing
3507   // mode?
3508   return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
3509 }
3510 
3511 static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
3512                                const SIInstrInfo &TII, Register BasePtr) {
3513   uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
3514 
3515   // FIXME: Why are half the "default" bits ignored based on the addressing
3516   // mode?
3517   return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
3518 }
3519 
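// Decompose a MUBUF address for selection: N0 is the pointer after stripping
// a constant offset that fits in 32 bits; if N0 is itself a G_PTR_ADD, its
// two operands are recorded as N2 and N3 (looking through copies).
// shouldUseAddr64() then chooses the addr64 form when there is an inner add
// or when N0 is divergent (in a VGPR).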
3520 AMDGPUInstructionSelector::MUBUFAddressData
3521 AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
3522   MUBUFAddressData Data;
3523   Data.N0 = Src;
3524 
3525   Register PtrBase;
3526   int64_t Offset;
3527 
3528   std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI);
3529   if (isUInt<32>(Offset)) {
3530     Data.N0 = PtrBase;
3531     Data.Offset = Offset;
3532   }
3533 
3534   if (MachineInstr *InputAdd
3535       = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
3536     Data.N2 = InputAdd->getOperand(1).getReg();
3537     Data.N3 = InputAdd->getOperand(2).getReg();
3538 
    // FIXME: Need to fix extra SGPR->VGPR copies inserted
    // FIXME: We don't actually know that this value was defined by operand 0
3541     //
3542     // TODO: Remove this when we have copy folding optimizations after
3543     // RegBankSelect.
3544     Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
3545     Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
3546   }
3547 
3548   return Data;
3549 }
3550 
/// Return true if the addr64 MUBUF mode should be used for the given address.
3552 bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
3553   // (ptr_add N2, N3) -> addr64, or
3554   // (ptr_add (ptr_add N2, N3), C1) -> addr64
3555   if (Addr.N2)
3556     return true;
3557 
3558   const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
3559   return N0Bank->getID() == AMDGPU::VGPRRegBankID;
3560 }
3561 
3562 /// Split an immediate offset \p ImmOffset depending on whether it fits in the
3563 /// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
3564 /// component.
3565 void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
3566   MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
3567   if (SIInstrInfo::isLegalMUBUFImmOffset(ImmOffset))
3568     return;
3569 
3570   // Illegal offset, store it in soffset.
3571   SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3572   B.buildInstr(AMDGPU::S_MOV_B32)
3573     .addDef(SOffset)
3574     .addImm(ImmOffset);
3575   ImmOffset = 0;
3576 }
3577 
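// Select the addr64 MUBUF form: the divergent piece of the address becomes
// vaddr, a uniform 64-bit pointer (if any) becomes the SRD base pointer, and
// an immediate offset that does not fit the 12-bit field is moved into
// soffset by splitIllegalMUBUFOffset().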
3578 bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
3579   MachineOperand &Root, Register &VAddr, Register &RSrcReg,
3580   Register &SOffset, int64_t &Offset) const {
  // FIXME: Predicates should stop this from reaching here.
  // The addr64 bit was removed for Volcanic Islands.
3583   if (!STI.hasAddr64() || STI.useFlatForGlobal())
3584     return false;
3585 
3586   MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
3587   if (!shouldUseAddr64(AddrData))
3588     return false;
3589 
3590   Register N0 = AddrData.N0;
3591   Register N2 = AddrData.N2;
3592   Register N3 = AddrData.N3;
3593   Offset = AddrData.Offset;
3594 
3595   // Base pointer for the SRD.
3596   Register SRDPtr;
3597 
3598   if (N2) {
3599     if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
3600       assert(N3);
3601       if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
3602         // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
3603         // addr64, and construct the default resource from a 0 address.
3604         VAddr = N0;
3605       } else {
3606         SRDPtr = N3;
3607         VAddr = N2;
3608       }
3609     } else {
3610       // N2 is not divergent.
3611       SRDPtr = N2;
3612       VAddr = N3;
3613     }
3614   } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
3615     // Use the default null pointer in the resource
3616     VAddr = N0;
3617   } else {
3618     // N0 -> offset, or
3619     // (N0 + C1) -> offset
3620     SRDPtr = N0;
3621   }
3622 
3623   MachineIRBuilder B(*Root.getParent());
3624   RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
3625   splitIllegalMUBUFOffset(B, SOffset, Offset);
3626   return true;
3627 }
3628 
3629 bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
3630   MachineOperand &Root, Register &RSrcReg, Register &SOffset,
3631   int64_t &Offset) const {
3632   MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
3633   if (shouldUseAddr64(AddrData))
3634     return false;
3635 
3636   // N0 -> offset, or
3637   // (N0 + C1) -> offset
3638   Register SRDPtr = AddrData.N0;
3639   Offset = AddrData.Offset;
3640 
3641   // TODO: Look through extensions for 32-bit soffset.
3642   MachineIRBuilder B(*Root.getParent());
3643 
3644   RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
3645   splitIllegalMUBUFOffset(B, SOffset, Offset);
3646   return true;
3647 }
3648 
3649 InstructionSelector::ComplexRendererFns
3650 AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
3651   Register VAddr;
3652   Register RSrcReg;
3653   Register SOffset;
3654   int64_t Offset = 0;
3655 
3656   if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
3657     return {};
3658 
3659   // FIXME: Use defaulted operands for trailing 0s and remove from the complex
3660   // pattern.
3661   return {{
3662       [=](MachineInstrBuilder &MIB) {  // rsrc
3663         MIB.addReg(RSrcReg);
3664       },
3665       [=](MachineInstrBuilder &MIB) { // vaddr
3666         MIB.addReg(VAddr);
3667       },
3668       [=](MachineInstrBuilder &MIB) { // soffset
3669         if (SOffset)
3670           MIB.addReg(SOffset);
3671         else
3672           MIB.addImm(0);
3673       },
3674       [=](MachineInstrBuilder &MIB) { // offset
3675         MIB.addImm(Offset);
3676       },
3677       addZeroImm, //  glc
3678       addZeroImm, //  slc
3679       addZeroImm, //  tfe
3680       addZeroImm, //  dlc
3681       addZeroImm  //  swz
3682     }};
3683 }
3684 
3685 InstructionSelector::ComplexRendererFns
3686 AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
3687   Register RSrcReg;
3688   Register SOffset;
3689   int64_t Offset = 0;
3690 
3691   if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
3692     return {};
3693 
3694   return {{
3695       [=](MachineInstrBuilder &MIB) {  // rsrc
3696         MIB.addReg(RSrcReg);
3697       },
3698       [=](MachineInstrBuilder &MIB) { // soffset
3699         if (SOffset)
3700           MIB.addReg(SOffset);
3701         else
3702           MIB.addImm(0);
3703       },
3704       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
3705       addZeroImm, //  glc
3706       addZeroImm, //  slc
3707       addZeroImm, //  tfe
3708       addZeroImm, //  dlc
3709       addZeroImm  //  swz
3710     }};
3711 }
3712 
3713 InstructionSelector::ComplexRendererFns
3714 AMDGPUInstructionSelector::selectMUBUFAddr64Atomic(MachineOperand &Root) const {
3715   Register VAddr;
3716   Register RSrcReg;
3717   Register SOffset;
3718   int64_t Offset = 0;
3719 
3720   if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
3721     return {};
3722 
3723   // FIXME: Use defaulted operands for trailing 0s and remove from the complex
3724   // pattern.
3725   return {{
3726       [=](MachineInstrBuilder &MIB) {  // rsrc
3727         MIB.addReg(RSrcReg);
3728       },
3729       [=](MachineInstrBuilder &MIB) { // vaddr
3730         MIB.addReg(VAddr);
3731       },
3732       [=](MachineInstrBuilder &MIB) { // soffset
3733         if (SOffset)
3734           MIB.addReg(SOffset);
3735         else
3736           MIB.addImm(0);
3737       },
3738       [=](MachineInstrBuilder &MIB) { // offset
3739         MIB.addImm(Offset);
3740       },
3741       addZeroImm //  slc
3742     }};
3743 }
3744 
3745 InstructionSelector::ComplexRendererFns
3746 AMDGPUInstructionSelector::selectMUBUFOffsetAtomic(MachineOperand &Root) const {
3747   Register RSrcReg;
3748   Register SOffset;
3749   int64_t Offset = 0;
3750 
3751   if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
3752     return {};
3753 
3754   return {{
3755       [=](MachineInstrBuilder &MIB) {  // rsrc
3756         MIB.addReg(RSrcReg);
3757       },
3758       [=](MachineInstrBuilder &MIB) { // soffset
3759         if (SOffset)
3760           MIB.addReg(SOffset);
3761         else
3762           MIB.addImm(0);
3763       },
3764       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
3765       addZeroImm //  slc
3766     }};
3767 }
3768 
3769 /// Get an immediate that must be 32-bits, and treated as zero extended.
3770 static Optional<uint64_t> getConstantZext32Val(Register Reg,
3771                                                const MachineRegisterInfo &MRI) {
  // getConstantVRegVal sign extends any constant it finds, so make sure the
  // value actually fits in 32 bits before taking the low half.
3773   Optional<int64_t> OffsetVal = getConstantVRegVal(Reg, MRI);
3774   if (!OffsetVal || !isInt<32>(*OffsetVal))
3775     return None;
3776   return Lo_32(*OffsetVal);
3777 }
3778 
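// SMRD buffer offsets (presumably for s_buffer_load patterns): the operand
// must be a constant that fits in 32 bits and is encodable by the subtarget's
// SMRD offset encoding. The *Imm32 variant below handles the SEA_ISLANDS-only
// 32-bit literal form.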
3779 InstructionSelector::ComplexRendererFns
3780 AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
3781   Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
3782   if (!OffsetVal)
3783     return {};
3784 
3785   Optional<int64_t> EncodedImm =
3786       AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
3787   if (!EncodedImm)
3788     return {};
3789 
3790   return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }  }};
3791 }
3792 
3793 InstructionSelector::ComplexRendererFns
3794 AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
3795   assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
3796 
3797   Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
3798   if (!OffsetVal)
3799     return {};
3800 
3801   Optional<int64_t> EncodedImm
3802     = AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
3803   if (!EncodedImm)
3804     return {};
3805 
3806   return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }  }};
3807 }
3808 
3809 void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
3810                                                  const MachineInstr &MI,
3811                                                  int OpIdx) const {
3812   assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
3813          "Expected G_CONSTANT");
3814   MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
3815 }
3816 
3817 void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
3818                                                 const MachineInstr &MI,
3819                                                 int OpIdx) const {
3820   assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
3821          "Expected G_CONSTANT");
3822   MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
3823 }
3824 
3825 void AMDGPUInstructionSelector::renderBitcastImm(MachineInstrBuilder &MIB,
3826                                                  const MachineInstr &MI,
3827                                                  int OpIdx) const {
3828   assert(OpIdx == -1);
3829 
3830   const MachineOperand &Op = MI.getOperand(1);
3831   if (MI.getOpcode() == TargetOpcode::G_FCONSTANT)
3832     MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
3833   else {
3834     assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
3835     MIB.addImm(Op.getCImm()->getSExtValue());
3836   }
3837 }
3838 
3839 void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
3840                                                 const MachineInstr &MI,
3841                                                 int OpIdx) const {
3842   assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
3843          "Expected G_CONSTANT");
3844   MIB.addImm(MI.getOperand(1).getCImm()->getValue().countPopulation());
3845 }
3846 
/// This only really exists to satisfy DAG type checking machinery, so it is a
/// no-op here.
3849 void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
3850                                                 const MachineInstr &MI,
3851                                                 int OpIdx) const {
3852   MIB.addImm(MI.getOperand(OpIdx).getImm());
3853 }
3854 
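// The renderExtract* helpers below extract individual cache policy bits from
// a combined immediate operand (presumably a buffer intrinsic's
// cachepolicy/aux argument): glc is bit 0, slc bit 1, dlc bit 2, swz bit 3.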
3855 void AMDGPUInstructionSelector::renderExtractGLC(MachineInstrBuilder &MIB,
3856                                                  const MachineInstr &MI,
3857                                                  int OpIdx) const {
3858   assert(OpIdx >= 0 && "expected to match an immediate operand");
3859   MIB.addImm(MI.getOperand(OpIdx).getImm() & 1);
3860 }
3861 
3862 void AMDGPUInstructionSelector::renderExtractSLC(MachineInstrBuilder &MIB,
3863                                                  const MachineInstr &MI,
3864                                                  int OpIdx) const {
3865   assert(OpIdx >= 0 && "expected to match an immediate operand");
3866   MIB.addImm((MI.getOperand(OpIdx).getImm() >> 1) & 1);
3867 }
3868 
3869 void AMDGPUInstructionSelector::renderExtractDLC(MachineInstrBuilder &MIB,
3870                                                  const MachineInstr &MI,
3871                                                  int OpIdx) const {
3872   assert(OpIdx >= 0 && "expected to match an immediate operand");
3873   MIB.addImm((MI.getOperand(OpIdx).getImm() >> 2) & 1);
3874 }
3875 
3876 void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
3877                                                  const MachineInstr &MI,
3878                                                  int OpIdx) const {
3879   assert(OpIdx >= 0 && "expected to match an immediate operand");
3880   MIB.addImm((MI.getOperand(OpIdx).getImm() >> 3) & 1);
3881 }
3882 
3883 void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
3884                                                  const MachineInstr &MI,
3885                                                  int OpIdx) const {
  MIB.addFrameIndex(MI.getOperand(1).getIndex());
3887 }
3888 
3889 bool AMDGPUInstructionSelector::isInlineImmediate16(int64_t Imm) const {
3890   return AMDGPU::isInlinableLiteral16(Imm, STI.hasInv2PiInlineImm());
3891 }
3892 
3893 bool AMDGPUInstructionSelector::isInlineImmediate32(int64_t Imm) const {
3894   return AMDGPU::isInlinableLiteral32(Imm, STI.hasInv2PiInlineImm());
3895 }
3896 
3897 bool AMDGPUInstructionSelector::isInlineImmediate64(int64_t Imm) const {
3898   return AMDGPU::isInlinableLiteral64(Imm, STI.hasInv2PiInlineImm());
3899 }
3900 
3901 bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
3902   return TII.isInlineConstant(Imm);
3903 }
3904