1 //===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the InstructionSelector class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPUInstructionSelector.h"
15 #include "AMDGPUInstrInfo.h"
16 #include "AMDGPUGlobalISelUtils.h"
17 #include "AMDGPURegisterBankInfo.h"
18 #include "AMDGPUSubtarget.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
21 #include "SIMachineFunctionInfo.h"
22 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
23 #include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
24 #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
25 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
26 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
27 #include "llvm/CodeGen/GlobalISel/Utils.h"
28 #include "llvm/CodeGen/MachineBasicBlock.h"
29 #include "llvm/CodeGen/MachineFunction.h"
30 #include "llvm/CodeGen/MachineInstr.h"
31 #include "llvm/CodeGen/MachineInstrBuilder.h"
32 #include "llvm/CodeGen/MachineRegisterInfo.h"
33 #include "llvm/IR/Type.h"
34 #include "llvm/Support/Debug.h"
35 #include "llvm/Support/raw_ostream.h"
36 
37 #define DEBUG_TYPE "amdgpu-isel"
38 
39 using namespace llvm;
40 using namespace MIPatternMatch;
41 
42 static cl::opt<bool> AllowRiskySelect(
43   "amdgpu-global-isel-risky-select",
44   cl::desc("Allow GlobalISel to select cases that are likely to not work yet"),
45   cl::init(false),
46   cl::ReallyHidden);
47 
48 #define GET_GLOBALISEL_IMPL
49 #define AMDGPUSubtarget GCNSubtarget
50 #include "AMDGPUGenGlobalISel.inc"
51 #undef GET_GLOBALISEL_IMPL
52 #undef AMDGPUSubtarget
53 
54 AMDGPUInstructionSelector::AMDGPUInstructionSelector(
55     const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
56     const AMDGPUTargetMachine &TM)
57     : InstructionSelector(), TII(*STI.getInstrInfo()),
58       TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
59       STI(STI),
60       EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG),
61 #define GET_GLOBALISEL_PREDICATES_INIT
62 #include "AMDGPUGenGlobalISel.inc"
63 #undef GET_GLOBALISEL_PREDICATES_INIT
64 #define GET_GLOBALISEL_TEMPORARIES_INIT
65 #include "AMDGPUGenGlobalISel.inc"
66 #undef GET_GLOBALISEL_TEMPORARIES_INIT
67 {
68 }
69 
70 const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
71 
72 void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits &KB,
73                                         CodeGenCoverage &CoverageInfo) {
74   MRI = &MF.getRegInfo();
75   InstructionSelector::setupMF(MF, KB, CoverageInfo);
76 }
77 
78 bool AMDGPUInstructionSelector::isVCC(Register Reg,
79                                       const MachineRegisterInfo &MRI) const {
80   // The verifier is oblivious to s1 being a valid value for wavesize registers.
81   if (Reg.isPhysical())
82     return false;
83 
84   auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
85   const TargetRegisterClass *RC =
86       RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
87   if (RC) {
88     const LLT Ty = MRI.getType(Reg);
89     return RC->hasSuperClassEq(TRI.getBoolRC()) &&
90            Ty.isValid() && Ty.getSizeInBits() == 1;
91   }
92 
93   const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
94   return RB->getID() == AMDGPU::VCCRegBankID;
95 }
96 
97 bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
98                                                         unsigned NewOpc) const {
99   MI.setDesc(TII.get(NewOpc));
100   MI.RemoveOperand(1); // Remove intrinsic ID.
101   MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
102 
103   MachineOperand &Dst = MI.getOperand(0);
104   MachineOperand &Src = MI.getOperand(1);
105 
106   // TODO: This should be legalized to s32 if needed
107   if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
108     return false;
109 
110   const TargetRegisterClass *DstRC
111     = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
112   const TargetRegisterClass *SrcRC
113     = TRI.getConstrainedRegClassForOperand(Src, *MRI);
114   if (!DstRC || DstRC != SrcRC)
115     return false;
116 
117   return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
118          RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
119 }
120 
121 bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
122   const DebugLoc &DL = I.getDebugLoc();
123   MachineBasicBlock *BB = I.getParent();
124   I.setDesc(TII.get(TargetOpcode::COPY));
125 
126   const MachineOperand &Src = I.getOperand(1);
127   MachineOperand &Dst = I.getOperand(0);
128   Register DstReg = Dst.getReg();
129   Register SrcReg = Src.getReg();
130 
131   if (isVCC(DstReg, *MRI)) {
132     if (SrcReg == AMDGPU::SCC) {
133       const TargetRegisterClass *RC
134         = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
135       if (!RC)
136         return true;
137       return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
138     }
139 
140     if (!isVCC(SrcReg, *MRI)) {
141       // TODO: Should probably leave the copy and let copyPhysReg expand it.
142       if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
143         return false;
144 
145       const TargetRegisterClass *SrcRC
146         = TRI.getConstrainedRegClassForOperand(Src, *MRI);
147 
148       Register MaskedReg = MRI->createVirtualRegister(SrcRC);
149 
150       // We can't trust the high bits at this point, so clear them.
151 
152       // TODO: Skip masking high bits if def is known boolean.
153 
154       unsigned AndOpc = TRI.isSGPRClass(SrcRC) ?
155         AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
156       BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
157         .addImm(1)
158         .addReg(SrcReg);
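      // Compare the masked value against zero so each active lane writes its
      // bit of the wave-wide boolean into DstReg.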
159       BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
160         .addImm(0)
161         .addReg(MaskedReg);
162 
163       if (!MRI->getRegClassOrNull(SrcReg))
164         MRI->setRegClass(SrcReg, SrcRC);
165       I.eraseFromParent();
166       return true;
167     }
168 
169     const TargetRegisterClass *RC =
170       TRI.getConstrainedRegClassForOperand(Dst, *MRI);
171     if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
172       return false;
173 
174     return true;
175   }
176 
177   for (const MachineOperand &MO : I.operands()) {
178     if (MO.getReg().isPhysical())
179       continue;
180 
181     const TargetRegisterClass *RC =
182             TRI.getConstrainedRegClassForOperand(MO, *MRI);
183     if (!RC)
184       continue;
185     RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
186   }
187   return true;
188 }
189 
190 bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
191   const Register DefReg = I.getOperand(0).getReg();
192   const LLT DefTy = MRI->getType(DefReg);
193   if (DefTy == LLT::scalar(1)) {
194     if (!AllowRiskySelect) {
195       LLVM_DEBUG(dbgs() << "Skipping risky boolean phi\n");
196       return false;
197     }
198 
199     LLVM_DEBUG(dbgs() << "Selecting risky boolean phi\n");
200   }
201 
202   // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
203 
204   const RegClassOrRegBank &RegClassOrBank =
205     MRI->getRegClassOrRegBank(DefReg);
206 
207   const TargetRegisterClass *DefRC
208     = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
209   if (!DefRC) {
210     if (!DefTy.isValid()) {
211       LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
212       return false;
213     }
214 
215     const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
216     DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, *MRI);
217     if (!DefRC) {
218       LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
219       return false;
220     }
221   }
222 
223   // TODO: Verify that all registers have the same bank
224   I.setDesc(TII.get(TargetOpcode::PHI));
225   return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
226 }
227 
228 MachineOperand
229 AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
230                                            const TargetRegisterClass &SubRC,
231                                            unsigned SubIdx) const {
232 
233   MachineInstr *MI = MO.getParent();
234   MachineBasicBlock *BB = MO.getParent()->getParent();
235   Register DstReg = MRI->createVirtualRegister(&SubRC);
236 
237   if (MO.isReg()) {
238     unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
239     Register Reg = MO.getReg();
240     BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
241             .addReg(Reg, 0, ComposedSubIdx);
242 
243     return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
244                                      MO.isKill(), MO.isDead(), MO.isUndef(),
245                                      MO.isEarlyClobber(), 0, MO.isDebug(),
246                                      MO.isInternalRead());
247   }
248 
249   assert(MO.isImm());
250 
251   APInt Imm(64, MO.getImm());
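  // Split the 64-bit immediate per 32-bit half: sub0 selects the low 32 bits
  // and sub1 the high 32 bits.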
252 
253   switch (SubIdx) {
254   default:
255     llvm_unreachable("do not know to split immediate with this sub index.");
256   case AMDGPU::sub0:
257     return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
258   case AMDGPU::sub1:
259     return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
260   }
261 }
262 
263 static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
264   switch (Opc) {
265   case AMDGPU::G_AND:
266     return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
267   case AMDGPU::G_OR:
268     return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
269   case AMDGPU::G_XOR:
270     return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
271   default:
272     llvm_unreachable("not a bit op");
273   }
274 }
275 
276 bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
277   Register DstReg = I.getOperand(0).getReg();
278   unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
279 
280   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
281   if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
282       DstRB->getID() != AMDGPU::VCCRegBankID)
283     return false;
284 
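  // Boolean (VCC-bank) operations work on the wave mask, so they need the
  // 64-bit opcodes on wave64 targets even though the IR type is only 1 bit.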
285   bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
286                             STI.isWave64());
287   I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));
288 
289   // Dead implicit-def of scc
290   I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
291                                          true, // isImp
292                                          false, // isKill
293                                          true)); // isDead
294   return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
295 }
296 
297 bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
298   MachineBasicBlock *BB = I.getParent();
299   MachineFunction *MF = BB->getParent();
300   Register DstReg = I.getOperand(0).getReg();
301   const DebugLoc &DL = I.getDebugLoc();
302   LLT Ty = MRI->getType(DstReg);
303   if (Ty.isVector())
304     return false;
305 
306   unsigned Size = Ty.getSizeInBits();
307   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
308   const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
309   const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;
310 
311   if (Size == 32) {
312     if (IsSALU) {
313       const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
314       MachineInstr *Add =
315         BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
316         .add(I.getOperand(1))
317         .add(I.getOperand(2));
318       I.eraseFromParent();
319       return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
320     }
321 
322     if (STI.hasAddNoCarry()) {
323       const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
324       I.setDesc(TII.get(Opc));
325       I.addOperand(*MF, MachineOperand::CreateImm(0));
326       I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
327       return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
328     }
329 
330     const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;
331 
332     Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
333     MachineInstr *Add
334       = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
335       .addDef(UnusedCarry, RegState::Dead)
336       .add(I.getOperand(1))
337       .add(I.getOperand(2))
338       .addImm(0);
339     I.eraseFromParent();
340     return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
341   }
342 
343   assert(!Sub && "illegal sub should not reach here");
344 
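  // 64-bit case: split into a low 32-bit add and a high add-with-carry. The
  // carry is threaded through SCC on the SALU path or a wave-mask register on
  // the VALU path, and the halves are recombined with a REG_SEQUENCE.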
345   const TargetRegisterClass &RC
346     = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
347   const TargetRegisterClass &HalfRC
348     = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;
349 
350   MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
351   MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
352   MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
353   MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));
354 
355   Register DstLo = MRI->createVirtualRegister(&HalfRC);
356   Register DstHi = MRI->createVirtualRegister(&HalfRC);
357 
358   if (IsSALU) {
359     BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
360       .add(Lo1)
361       .add(Lo2);
362     BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
363       .add(Hi1)
364       .add(Hi2);
365   } else {
366     const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
367     Register CarryReg = MRI->createVirtualRegister(CarryRC);
368     BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
369       .addDef(CarryReg)
370       .add(Lo1)
371       .add(Lo2)
372       .addImm(0);
373     MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
374       .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
375       .add(Hi1)
376       .add(Hi2)
377       .addReg(CarryReg, RegState::Kill)
378       .addImm(0);
379 
380     if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
381       return false;
382   }
383 
384   BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
385     .addReg(DstLo)
386     .addImm(AMDGPU::sub0)
387     .addReg(DstHi)
388     .addImm(AMDGPU::sub1);
389 
391   if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
392     return false;
393 
394   I.eraseFromParent();
395   return true;
396 }
397 
398 bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
399   MachineInstr &I) const {
400   MachineBasicBlock *BB = I.getParent();
401   MachineFunction *MF = BB->getParent();
402   const DebugLoc &DL = I.getDebugLoc();
403   Register Dst0Reg = I.getOperand(0).getReg();
404   Register Dst1Reg = I.getOperand(1).getReg();
405   const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
406                      I.getOpcode() == AMDGPU::G_UADDE;
407   const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
408                           I.getOpcode() == AMDGPU::G_USUBE;
409 
410   if (isVCC(Dst1Reg, *MRI)) {
411     unsigned NoCarryOpc =
412         IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
413     unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
414     I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
415     I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
416     I.addOperand(*MF, MachineOperand::CreateImm(0));
417     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
418   }
419 
420   Register Src0Reg = I.getOperand(2).getReg();
421   Register Src1Reg = I.getOperand(3).getReg();
422 
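  // Scalar path: carries are modeled through SCC, so materialize any carry-in
  // into SCC first and copy the carry-out back out of SCC afterwards.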
423   if (HasCarryIn) {
424     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
425       .addReg(I.getOperand(4).getReg());
426   }
427 
428   unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
429   unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
430 
431   BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
432     .add(I.getOperand(2))
433     .add(I.getOperand(3));
434   BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
435     .addReg(AMDGPU::SCC);
436 
437   if (!MRI->getRegClassOrNull(Dst1Reg))
438     MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
439 
440   if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
441       !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
442       !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
443     return false;
444 
445   if (HasCarryIn &&
446       !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
447                                     AMDGPU::SReg_32RegClass, *MRI))
448     return false;
449 
450   I.eraseFromParent();
451   return true;
452 }
453 
454 // TODO: We should probably legalize these to only using 32-bit results.
455 bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
456   MachineBasicBlock *BB = I.getParent();
457   Register DstReg = I.getOperand(0).getReg();
458   Register SrcReg = I.getOperand(1).getReg();
459   LLT DstTy = MRI->getType(DstReg);
460   LLT SrcTy = MRI->getType(SrcReg);
461   const unsigned SrcSize = SrcTy.getSizeInBits();
462   unsigned DstSize = DstTy.getSizeInBits();
463 
464   // TODO: Should handle any multiple of 32 offset.
465   unsigned Offset = I.getOperand(2).getImm();
466   if (Offset % 32 != 0 || DstSize > 128)
467     return false;
468 
469   // 16-bit operations really use 32-bit registers.
470   // FIXME: Probably should not allow 16-bit G_EXTRACT results.
471   if (DstSize == 16)
472     DstSize = 32;
473 
474   const TargetRegisterClass *DstRC =
475     TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
476   if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
477     return false;
478 
479   const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
480   const TargetRegisterClass *SrcRC =
481     TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
482   if (!SrcRC)
483     return false;
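  // Map the dword offset and width onto a register-tuple subindex, e.g. a
  // 32-bit extract at bit offset 32 becomes a copy from sub1.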
484   unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
485                                                          DstSize / 32);
486   SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
487   if (!SrcRC)
488     return false;
489 
490   SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
491                                     *SrcRC, I.getOperand(1));
492   const DebugLoc &DL = I.getDebugLoc();
493   BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
494     .addReg(SrcReg, 0, SubReg);
495 
496   I.eraseFromParent();
497   return true;
498 }
499 
500 bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
501   MachineBasicBlock *BB = MI.getParent();
502   Register DstReg = MI.getOperand(0).getReg();
503   LLT DstTy = MRI->getType(DstReg);
504   LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());
505 
506   const unsigned SrcSize = SrcTy.getSizeInBits();
507   if (SrcSize < 32)
508     return selectImpl(MI, *CoverageInfo);
509 
510   const DebugLoc &DL = MI.getDebugLoc();
511   const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
512   const unsigned DstSize = DstTy.getSizeInBits();
513   const TargetRegisterClass *DstRC =
514     TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
515   if (!DstRC)
516     return false;
517 
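  // getRegSplitParts returns the subregister indices that tile DstRC in
  // SrcSize-bit pieces; each source operand fills one piece of the
  // REG_SEQUENCE.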
518   ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
519   MachineInstrBuilder MIB =
520     BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
521   for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
522     MachineOperand &Src = MI.getOperand(I + 1);
523     MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
524     MIB.addImm(SubRegs[I]);
525 
526     const TargetRegisterClass *SrcRC
527       = TRI.getConstrainedRegClassForOperand(Src, *MRI);
528     if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
529       return false;
530   }
531 
532   if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
533     return false;
534 
535   MI.eraseFromParent();
536   return true;
537 }
538 
539 bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
540   MachineBasicBlock *BB = MI.getParent();
541   const int NumDst = MI.getNumOperands() - 1;
542 
543   MachineOperand &Src = MI.getOperand(NumDst);
544 
545   Register SrcReg = Src.getReg();
546   Register DstReg0 = MI.getOperand(0).getReg();
547   LLT DstTy = MRI->getType(DstReg0);
548   LLT SrcTy = MRI->getType(SrcReg);
549 
550   const unsigned DstSize = DstTy.getSizeInBits();
551   const unsigned SrcSize = SrcTy.getSizeInBits();
552   const DebugLoc &DL = MI.getDebugLoc();
553   const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
554 
555   const TargetRegisterClass *SrcRC =
556     TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
557   if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
558     return false;
559 
560   // Note we could have mixed SGPR and VGPR destination banks for an SGPR
561   // source, and this relies on the fact that the same subregister indices are
562   // used for both.
563   ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
564   for (int I = 0, E = NumDst; I != E; ++I) {
565     MachineOperand &Dst = MI.getOperand(I);
566     BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
567       .addReg(SrcReg, 0, SubRegs[I]);
568 
569     // Make sure the subregister index is valid for the source register.
570     SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
571     if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
572       return false;
573 
574     const TargetRegisterClass *DstRC =
575       TRI.getConstrainedRegClassForOperand(Dst, *MRI);
576     if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
577       return false;
578   }
579 
580   MI.eraseFromParent();
581   return true;
582 }
583 
584 bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC(
585   MachineInstr &MI) const {
586   if (selectImpl(MI, *CoverageInfo))
587     return true;
588 
589   const LLT S32 = LLT::scalar(32);
590   const LLT V2S16 = LLT::vector(2, 16);
591 
592   Register Dst = MI.getOperand(0).getReg();
593   if (MRI->getType(Dst) != V2S16)
594     return false;
595 
596   const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
597   if (DstBank->getID() != AMDGPU::SGPRRegBankID)
598     return false;
599 
600   Register Src0 = MI.getOperand(1).getReg();
601   Register Src1 = MI.getOperand(2).getReg();
602   if (MRI->getType(Src0) != S32)
603     return false;
604 
605   const DebugLoc &DL = MI.getDebugLoc();
606   MachineBasicBlock *BB = MI.getParent();
607 
608   auto ConstSrc1 = getConstantVRegValWithLookThrough(Src1, *MRI, true, true);
609   if (ConstSrc1) {
610     auto ConstSrc0 = getConstantVRegValWithLookThrough(Src0, *MRI, true, true);
611     if (ConstSrc0) {
612       uint32_t Lo16 = static_cast<uint32_t>(ConstSrc0->Value) & 0xffff;
613       uint32_t Hi16 = static_cast<uint32_t>(ConstSrc1->Value) & 0xffff;
614 
615       BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst)
616         .addImm(Lo16 | (Hi16 << 16));
617       MI.eraseFromParent();
618       return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
619     }
620   }
621 
622   // TODO: This should probably be a combine somewhere
623   // (build_vector_trunc $src0, undef -> copy $src0
624   MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
625   if (Src1Def && Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
626     MI.setDesc(TII.get(AMDGPU::COPY));
627     MI.RemoveOperand(2);
628     return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI) &&
629            RBI.constrainGenericRegister(Src0, AMDGPU::SReg_32RegClass, *MRI);
630   }
631 
632   Register ShiftSrc0;
633   Register ShiftSrc1;
634   int64_t ShiftAmt;
635 
636   // With multiple uses of the shift, this will duplicate the shift and
637   // increase register pressure.
638   //
  // (build_vector_trunc (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16))
640   //  => (S_PACK_HH_B32_B16 $src0, $src1)
641   // (build_vector_trunc $src0, (lshr_oneuse SReg_32:$src1, 16))
642   //  => (S_PACK_LH_B32_B16 $src0, $src1)
643   // (build_vector_trunc $src0, $src1)
644   //  => (S_PACK_LL_B32_B16 $src0, $src1)
645 
646   // FIXME: This is an inconvenient way to check a specific value
647   bool Shift0 = mi_match(
648     Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_ICst(ShiftAmt)))) &&
649     ShiftAmt == 16;
650 
651   bool Shift1 = mi_match(
652     Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_ICst(ShiftAmt)))) &&
653     ShiftAmt == 16;
654 
655   unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
656   if (Shift0 && Shift1) {
657     Opc = AMDGPU::S_PACK_HH_B32_B16;
658     MI.getOperand(1).setReg(ShiftSrc0);
659     MI.getOperand(2).setReg(ShiftSrc1);
660   } else if (Shift1) {
661     Opc = AMDGPU::S_PACK_LH_B32_B16;
662     MI.getOperand(2).setReg(ShiftSrc1);
663   } else if (Shift0 && ConstSrc1 && ConstSrc1->Value == 0) {
664     // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
665     auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
666       .addReg(ShiftSrc0)
667       .addImm(16);
668 
669     MI.eraseFromParent();
670     return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
671   }
672 
673   MI.setDesc(TII.get(Opc));
674   return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
675 }
676 
677 bool AMDGPUInstructionSelector::selectG_PTR_ADD(MachineInstr &I) const {
678   return selectG_ADD_SUB(I);
679 }
680 
681 bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
682   const MachineOperand &MO = I.getOperand(0);
683 
684   // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
685   // regbank check here is to know why getConstrainedRegClassForOperand failed.
686   const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
687   if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
688       (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
689     I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
690     return true;
691   }
692 
693   return false;
694 }
695 
696 bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
697   MachineBasicBlock *BB = I.getParent();
698 
699   Register DstReg = I.getOperand(0).getReg();
700   Register Src0Reg = I.getOperand(1).getReg();
701   Register Src1Reg = I.getOperand(2).getReg();
702   LLT Src1Ty = MRI->getType(Src1Reg);
703 
704   unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
705   unsigned InsSize = Src1Ty.getSizeInBits();
706 
707   int64_t Offset = I.getOperand(3).getImm();
708 
709   // FIXME: These cases should have been illegal and unnecessary to check here.
710   if (Offset % 32 != 0 || InsSize % 32 != 0)
711     return false;
712 
713   // Currently not handled by getSubRegFromChannel.
714   if (InsSize > 128)
715     return false;
716 
717   unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
718   if (SubReg == AMDGPU::NoSubRegister)
719     return false;
720 
721   const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
722   const TargetRegisterClass *DstRC =
723     TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
724   if (!DstRC)
725     return false;
726 
727   const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
728   const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
729   const TargetRegisterClass *Src0RC =
730     TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank, *MRI);
731   const TargetRegisterClass *Src1RC =
732     TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank, *MRI);
733 
734   // Deal with weird cases where the class only partially supports the subreg
735   // index.
736   Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
737   if (!Src0RC || !Src1RC)
738     return false;
739 
740   if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
741       !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
742       !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
743     return false;
744 
745   const DebugLoc &DL = I.getDebugLoc();
746   BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
747     .addReg(Src0Reg)
748     .addReg(Src1Reg)
749     .addImm(SubReg);
750 
751   I.eraseFromParent();
752   return true;
753 }
754 
755 bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
756   if (STI.getLDSBankCount() != 16)
757     return selectImpl(MI, *CoverageInfo);
758 
759   Register Dst = MI.getOperand(0).getReg();
760   Register Src0 = MI.getOperand(2).getReg();
761   Register M0Val = MI.getOperand(6).getReg();
762   if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
763       !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
764       !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
765     return false;
766 
767   // This requires 2 instructions. It is possible to write a pattern to support
768   // this, but the generated isel emitter doesn't correctly deal with multiple
769   // output instructions using the same physical register input. The copy to m0
770   // is incorrectly placed before the second instruction.
771   //
772   // TODO: Match source modifiers.
773 
774   Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
775   const DebugLoc &DL = MI.getDebugLoc();
776   MachineBasicBlock *MBB = MI.getParent();
777 
778   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
779     .addReg(M0Val);
780   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
781     .addImm(2)
782     .addImm(MI.getOperand(4).getImm())  // $attr
783     .addImm(MI.getOperand(3).getImm()); // $attrchan
784 
785   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
786     .addImm(0)                          // $src0_modifiers
787     .addReg(Src0)                       // $src0
788     .addImm(MI.getOperand(4).getImm())  // $attr
789     .addImm(MI.getOperand(3).getImm())  // $attrchan
790     .addImm(0)                          // $src2_modifiers
791     .addReg(InterpMov)                  // $src2 - 2 f16 values selected by high
792     .addImm(MI.getOperand(5).getImm())  // $high
793     .addImm(0)                          // $clamp
794     .addImm(0);                         // $omod
795 
796   MI.eraseFromParent();
797   return true;
798 }
799 
800 // Writelane is special in that it can use SGPR and M0 (which would normally
801 // count as using the constant bus twice - but in this case it is allowed since
802 // the lane selector doesn't count as a use of the constant bus). However, it is
803 // still required to abide by the 1 SGPR rule. Fix this up if we might have
804 // multiple SGPRs.
805 bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
806   // With a constant bus limit of at least 2, there's no issue.
807   if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
808     return selectImpl(MI, *CoverageInfo);
809 
810   MachineBasicBlock *MBB = MI.getParent();
811   const DebugLoc &DL = MI.getDebugLoc();
812   Register VDst = MI.getOperand(0).getReg();
813   Register Val = MI.getOperand(2).getReg();
814   Register LaneSelect = MI.getOperand(3).getReg();
815   Register VDstIn = MI.getOperand(4).getReg();
816 
817   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);
818 
819   Optional<ValueAndVReg> ConstSelect =
820     getConstantVRegValWithLookThrough(LaneSelect, *MRI, true, true);
821   if (ConstSelect) {
822     // The selector has to be an inline immediate, so we can use whatever for
823     // the other operands.
824     MIB.addReg(Val);
825     MIB.addImm(ConstSelect->Value &
826                maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
827   } else {
828     Optional<ValueAndVReg> ConstVal =
829       getConstantVRegValWithLookThrough(Val, *MRI, true, true);
830 
831     // If the value written is an inline immediate, we can get away without a
832     // copy to m0.
833     if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value,
834                                                  STI.hasInv2PiInlineImm())) {
835       MIB.addImm(ConstVal->Value);
836       MIB.addReg(LaneSelect);
837     } else {
838       MIB.addReg(Val);
839 
840       // If the lane selector was originally in a VGPR and copied with
841       // readfirstlane, there's a hazard to read the same SGPR from the
842       // VALU. Constrain to a different SGPR to help avoid needing a nop later.
843       RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);
844 
845       BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
846         .addReg(LaneSelect);
847       MIB.addReg(AMDGPU::M0);
848     }
849   }
850 
851   MIB.addReg(VDstIn);
852 
853   MI.eraseFromParent();
854   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
855 }
856 
857 // We need to handle this here because tablegen doesn't support matching
858 // instructions with multiple outputs.
859 bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
860   Register Dst0 = MI.getOperand(0).getReg();
861   Register Dst1 = MI.getOperand(1).getReg();
862 
863   LLT Ty = MRI->getType(Dst0);
864   unsigned Opc;
865   if (Ty == LLT::scalar(32))
866     Opc = AMDGPU::V_DIV_SCALE_F32;
867   else if (Ty == LLT::scalar(64))
868     Opc = AMDGPU::V_DIV_SCALE_F64;
869   else
870     return false;
871 
872   const DebugLoc &DL = MI.getDebugLoc();
873   MachineBasicBlock *MBB = MI.getParent();
874 
875   Register Numer = MI.getOperand(3).getReg();
876   Register Denom = MI.getOperand(4).getReg();
877   unsigned ChooseDenom = MI.getOperand(5).getImm();
878 
879   Register Src0 = ChooseDenom != 0 ? Numer : Denom;
880 
881   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
882     .addDef(Dst1)
883     .addUse(Src0)
884     .addUse(Denom)
885     .addUse(Numer);
886 
887   MI.eraseFromParent();
888   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
889 }
890 
891 bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
892   unsigned IntrinsicID = I.getIntrinsicID();
893   switch (IntrinsicID) {
894   case Intrinsic::amdgcn_if_break: {
895     MachineBasicBlock *BB = I.getParent();
896 
    // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
898     // SelectionDAG uses for wave32 vs wave64.
899     BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
900       .add(I.getOperand(0))
901       .add(I.getOperand(2))
902       .add(I.getOperand(3));
903 
904     Register DstReg = I.getOperand(0).getReg();
905     Register Src0Reg = I.getOperand(2).getReg();
906     Register Src1Reg = I.getOperand(3).getReg();
907 
908     I.eraseFromParent();
909 
910     for (Register Reg : { DstReg, Src0Reg, Src1Reg })
911       MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
912 
913     return true;
914   }
915   case Intrinsic::amdgcn_interp_p1_f16:
916     return selectInterpP1F16(I);
917   case Intrinsic::amdgcn_wqm:
918     return constrainCopyLikeIntrin(I, AMDGPU::WQM);
919   case Intrinsic::amdgcn_softwqm:
920     return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
921   case Intrinsic::amdgcn_wwm:
922     return constrainCopyLikeIntrin(I, AMDGPU::WWM);
923   case Intrinsic::amdgcn_writelane:
924     return selectWritelane(I);
925   case Intrinsic::amdgcn_div_scale:
926     return selectDivScale(I);
927   case Intrinsic::amdgcn_icmp:
928     return selectIntrinsicIcmp(I);
929   case Intrinsic::amdgcn_ballot:
930     return selectBallot(I);
931   case Intrinsic::amdgcn_reloc_constant:
932     return selectRelocConstant(I);
933   case Intrinsic::amdgcn_groupstaticsize:
934     return selectGroupStaticSize(I);
935   case Intrinsic::returnaddress:
936     return selectReturnAddress(I);
937   default:
938     return selectImpl(I, *CoverageInfo);
939   }
940 }
941 
942 static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size) {
943   if (Size != 32 && Size != 64)
944     return -1;
945   switch (P) {
946   default:
947     llvm_unreachable("Unknown condition code!");
948   case CmpInst::ICMP_NE:
949     return Size == 32 ? AMDGPU::V_CMP_NE_U32_e64 : AMDGPU::V_CMP_NE_U64_e64;
950   case CmpInst::ICMP_EQ:
951     return Size == 32 ? AMDGPU::V_CMP_EQ_U32_e64 : AMDGPU::V_CMP_EQ_U64_e64;
952   case CmpInst::ICMP_SGT:
953     return Size == 32 ? AMDGPU::V_CMP_GT_I32_e64 : AMDGPU::V_CMP_GT_I64_e64;
954   case CmpInst::ICMP_SGE:
955     return Size == 32 ? AMDGPU::V_CMP_GE_I32_e64 : AMDGPU::V_CMP_GE_I64_e64;
956   case CmpInst::ICMP_SLT:
957     return Size == 32 ? AMDGPU::V_CMP_LT_I32_e64 : AMDGPU::V_CMP_LT_I64_e64;
958   case CmpInst::ICMP_SLE:
959     return Size == 32 ? AMDGPU::V_CMP_LE_I32_e64 : AMDGPU::V_CMP_LE_I64_e64;
960   case CmpInst::ICMP_UGT:
961     return Size == 32 ? AMDGPU::V_CMP_GT_U32_e64 : AMDGPU::V_CMP_GT_U64_e64;
962   case CmpInst::ICMP_UGE:
963     return Size == 32 ? AMDGPU::V_CMP_GE_U32_e64 : AMDGPU::V_CMP_GE_U64_e64;
964   case CmpInst::ICMP_ULT:
965     return Size == 32 ? AMDGPU::V_CMP_LT_U32_e64 : AMDGPU::V_CMP_LT_U64_e64;
966   case CmpInst::ICMP_ULE:
967     return Size == 32 ? AMDGPU::V_CMP_LE_U32_e64 : AMDGPU::V_CMP_LE_U64_e64;
968   }
969 }
970 
971 int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
972                                               unsigned Size) const {
973   if (Size == 64) {
974     if (!STI.hasScalarCompareEq64())
975       return -1;
976 
977     switch (P) {
978     case CmpInst::ICMP_NE:
979       return AMDGPU::S_CMP_LG_U64;
980     case CmpInst::ICMP_EQ:
981       return AMDGPU::S_CMP_EQ_U64;
982     default:
983       return -1;
984     }
985   }
986 
987   if (Size != 32)
988     return -1;
989 
990   switch (P) {
991   case CmpInst::ICMP_NE:
992     return AMDGPU::S_CMP_LG_U32;
993   case CmpInst::ICMP_EQ:
994     return AMDGPU::S_CMP_EQ_U32;
995   case CmpInst::ICMP_SGT:
996     return AMDGPU::S_CMP_GT_I32;
997   case CmpInst::ICMP_SGE:
998     return AMDGPU::S_CMP_GE_I32;
999   case CmpInst::ICMP_SLT:
1000     return AMDGPU::S_CMP_LT_I32;
1001   case CmpInst::ICMP_SLE:
1002     return AMDGPU::S_CMP_LE_I32;
1003   case CmpInst::ICMP_UGT:
1004     return AMDGPU::S_CMP_GT_U32;
1005   case CmpInst::ICMP_UGE:
1006     return AMDGPU::S_CMP_GE_U32;
1007   case CmpInst::ICMP_ULT:
1008     return AMDGPU::S_CMP_LT_U32;
1009   case CmpInst::ICMP_ULE:
1010     return AMDGPU::S_CMP_LE_U32;
1011   default:
1012     llvm_unreachable("Unknown condition code!");
1013   }
1014 }
1015 
1016 bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const {
1017   MachineBasicBlock *BB = I.getParent();
1018   const DebugLoc &DL = I.getDebugLoc();
1019 
1020   Register SrcReg = I.getOperand(2).getReg();
1021   unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1022 
1023   auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
1024 
1025   Register CCReg = I.getOperand(0).getReg();
1026   if (!isVCC(CCReg, *MRI)) {
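    // Scalar compare: S_CMP_* writes its result to SCC, so copy SCC into the
    // 32-bit result register afterwards.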
1027     int Opcode = getS_CMPOpcode(Pred, Size);
1028     if (Opcode == -1)
1029       return false;
1030     MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
1031             .add(I.getOperand(2))
1032             .add(I.getOperand(3));
1033     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
1034       .addReg(AMDGPU::SCC);
1035     bool Ret =
1036         constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
1037         RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
1038     I.eraseFromParent();
1039     return Ret;
1040   }
1041 
1042   int Opcode = getV_CMPOpcode(Pred, Size);
1043   if (Opcode == -1)
1044     return false;
1045 
1046   MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode),
1047             I.getOperand(0).getReg())
1048             .add(I.getOperand(2))
1049             .add(I.getOperand(3));
1050   RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
1051                                *TRI.getBoolRC(), *MRI);
1052   bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1053   I.eraseFromParent();
1054   return Ret;
1055 }
1056 
1057 bool AMDGPUInstructionSelector::selectIntrinsicIcmp(MachineInstr &I) const {
1058   Register Dst = I.getOperand(0).getReg();
1059   if (isVCC(Dst, *MRI))
1060     return false;
1061 
1062   if (MRI->getType(Dst).getSizeInBits() != STI.getWavefrontSize())
1063     return false;
1064 
1065   MachineBasicBlock *BB = I.getParent();
1066   const DebugLoc &DL = I.getDebugLoc();
1067   Register SrcReg = I.getOperand(2).getReg();
1068   unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1069   auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
1070 
1071   int Opcode = getV_CMPOpcode(Pred, Size);
1072   if (Opcode == -1)
1073     return false;
1074 
1075   MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst)
1076                            .add(I.getOperand(2))
1077                            .add(I.getOperand(3));
1078   RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(), *TRI.getBoolRC(),
1079                                *MRI);
1080   bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1081   I.eraseFromParent();
1082   return Ret;
1083 }
1084 
1085 bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1086   MachineBasicBlock *BB = I.getParent();
1087   const DebugLoc &DL = I.getDebugLoc();
1088   Register DstReg = I.getOperand(0).getReg();
1089   const unsigned Size = MRI->getType(DstReg).getSizeInBits();
1090   const bool Is64 = Size == 64;
1091 
1092   if (Size != STI.getWavefrontSize())
1093     return false;
1094 
1095   Optional<ValueAndVReg> Arg =
1096       getConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI, true);
1097 
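  // Fold constant arguments: ballot(0) is a zero mask and ballot(-1), i.e. a
  // constant true, is a copy of EXEC. Other constants fail selection, and a
  // non-constant source is copied through unchanged.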
1098   if (Arg.hasValue()) {
1099     const int64_t Value = Arg.getValue().Value;
1100     if (Value == 0) {
1101       unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1102       BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
1103     } else if (Value == -1) { // all ones
1104       Register SrcReg = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
1105       BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
1106     } else
1107       return false;
1108   } else {
1109     Register SrcReg = I.getOperand(2).getReg();
1110     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
1111   }
1112 
1113   I.eraseFromParent();
1114   return true;
1115 }
1116 
1117 bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1118   Register DstReg = I.getOperand(0).getReg();
1119   const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1120   const TargetRegisterClass *DstRC =
1121     TRI.getRegClassForSizeOnBank(32, *DstBank, *MRI);
1122   if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
1123     return false;
1124 
1125   const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1126 
1127   Module *M = MF->getFunction().getParent();
1128   const MDNode *Metadata = I.getOperand(2).getMetadata();
1129   auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
1130   auto RelocSymbol = cast<GlobalVariable>(
1131     M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
1132 
1133   MachineBasicBlock *BB = I.getParent();
1134   BuildMI(*BB, &I, I.getDebugLoc(),
1135           TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
1136     .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);
1137 
1138   I.eraseFromParent();
1139   return true;
1140 }
1141 
1142 bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
1143   Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();
1144 
1145   Register DstReg = I.getOperand(0).getReg();
1146   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1147   unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
1148     AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1149 
1150   MachineBasicBlock *MBB = I.getParent();
1151   const DebugLoc &DL = I.getDebugLoc();
1152 
1153   auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg);
1154 
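  // For AMDHSA and AMDPAL the LDS size is already known here; for other OSes
  // emit an absolute 32-bit relocation against the groupstaticsize symbol to
  // be resolved later.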
1155   if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
1156     const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1157     MIB.addImm(MFI->getLDSSize());
1158   } else {
1159     Module *M = MF->getFunction().getParent();
1160     const GlobalValue *GV
1161       = Intrinsic::getDeclaration(M, Intrinsic::amdgcn_groupstaticsize);
1162     MIB.addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
1163   }
1164 
1165   I.eraseFromParent();
1166   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1167 }
1168 
1169 bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
1170   MachineBasicBlock *MBB = I.getParent();
1171   MachineFunction &MF = *MBB->getParent();
1172   const DebugLoc &DL = I.getDebugLoc();
1173 
1174   MachineOperand &Dst = I.getOperand(0);
1175   Register DstReg = Dst.getReg();
1176   unsigned Depth = I.getOperand(2).getImm();
1177 
1178   const TargetRegisterClass *RC
1179     = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
1180   if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
1181       !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
1182     return false;
1183 
1184   // Check for kernel and shader functions
1185   if (Depth != 0 ||
1186       MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
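    // Entry functions have no return address, and nonzero depths are not
    // supported, so return a null address.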
1187     BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
1188       .addImm(0);
1189     I.eraseFromParent();
1190     return true;
1191   }
1192 
1193   MachineFrameInfo &MFI = MF.getFrameInfo();
1194   // There is a call to @llvm.returnaddress in this function
1195   MFI.setReturnAddressIsTaken(true);
1196 
1197   // Get the return address reg and mark it as an implicit live-in
1198   Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
1199   Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
1200                                              AMDGPU::SReg_64RegClass);
1201   BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1202     .addReg(LiveIn);
1203   I.eraseFromParent();
1204   return true;
1205 }
1206 
1207 bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
  // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1209   // SelectionDAG uses for wave32 vs wave64.
1210   MachineBasicBlock *BB = MI.getParent();
1211   BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1212       .add(MI.getOperand(1));
1213 
1214   Register Reg = MI.getOperand(1).getReg();
1215   MI.eraseFromParent();
1216 
1217   if (!MRI->getRegClassOrNull(Reg))
1218     MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1219   return true;
1220 }
1221 
1222 bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1223   MachineInstr &MI, Intrinsic::ID IntrID) const {
1224   MachineBasicBlock *MBB = MI.getParent();
1225   MachineFunction *MF = MBB->getParent();
1226   const DebugLoc &DL = MI.getDebugLoc();
1227 
1228   unsigned IndexOperand = MI.getOperand(7).getImm();
1229   bool WaveRelease = MI.getOperand(8).getImm() != 0;
1230   bool WaveDone = MI.getOperand(9).getImm() != 0;
1231 
1232   if (WaveDone && !WaveRelease)
1233     report_fatal_error("ds_ordered_count: wave_done requires wave_release");
1234 
1235   unsigned OrderedCountIndex = IndexOperand & 0x3f;
1236   IndexOperand &= ~0x3f;
1237   unsigned CountDw = 0;
1238 
1239   if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
1240     CountDw = (IndexOperand >> 24) & 0xf;
1241     IndexOperand &= ~(0xf << 24);
1242 
1243     if (CountDw < 1 || CountDw > 4) {
1244       report_fatal_error(
1245         "ds_ordered_count: dword count must be between 1 and 4");
1246     }
1247   }
1248 
1249   if (IndexOperand)
1250     report_fatal_error("ds_ordered_count: bad index operand");
1251 
1252   unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1253   unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);
1254 
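  // Pack the DS_ORDERED_COUNT offset field: offset0 holds the ordered-count
  // index scaled by 4; offset1 holds wave_release (bit 0), wave_done (bit 1),
  // the shader type (bits 2-3), the add/swap selector (bit 4) and, on GFX10+,
  // dword count - 1 (bits 6-7).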
1255   unsigned Offset0 = OrderedCountIndex << 2;
1256   unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) |
1257                      (Instruction << 4);
1258 
1259   if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
1260     Offset1 |= (CountDw - 1) << 6;
1261 
1262   unsigned Offset = Offset0 | (Offset1 << 8);
1263 
1264   Register M0Val = MI.getOperand(2).getReg();
1265   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1266     .addReg(M0Val);
1267 
1268   Register DstReg = MI.getOperand(0).getReg();
1269   Register ValReg = MI.getOperand(3).getReg();
1270   MachineInstrBuilder DS =
1271     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
1272       .addReg(ValReg)
1273       .addImm(Offset)
1274       .cloneMemRefs(MI);
1275 
1276   if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
1277     return false;
1278 
1279   bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
1280   MI.eraseFromParent();
1281   return Ret;
1282 }
1283 
1284 static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1285   switch (IntrID) {
1286   case Intrinsic::amdgcn_ds_gws_init:
1287     return AMDGPU::DS_GWS_INIT;
1288   case Intrinsic::amdgcn_ds_gws_barrier:
1289     return AMDGPU::DS_GWS_BARRIER;
1290   case Intrinsic::amdgcn_ds_gws_sema_v:
1291     return AMDGPU::DS_GWS_SEMA_V;
1292   case Intrinsic::amdgcn_ds_gws_sema_br:
1293     return AMDGPU::DS_GWS_SEMA_BR;
1294   case Intrinsic::amdgcn_ds_gws_sema_p:
1295     return AMDGPU::DS_GWS_SEMA_P;
1296   case Intrinsic::amdgcn_ds_gws_sema_release_all:
1297     return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1298   default:
1299     llvm_unreachable("not a gws intrinsic");
1300   }
1301 }
1302 
1303 bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
1304                                                      Intrinsic::ID IID) const {
1305   if (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1306       !STI.hasGWSSemaReleaseAll())
1307     return false;
1308 
1309   // intrinsic ID, vsrc, offset
1310   const bool HasVSrc = MI.getNumOperands() == 3;
1311   assert(HasVSrc || MI.getNumOperands() == 2);
1312 
1313   Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
1314   const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
1315   if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
1316     return false;
1317 
1318   MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1319   assert(OffsetDef);
1320 
1321   unsigned ImmOffset;
1322 
1323   MachineBasicBlock *MBB = MI.getParent();
1324   const DebugLoc &DL = MI.getDebugLoc();
1325 
1326   MachineInstr *Readfirstlane = nullptr;
1327 
1328   // If we legalized the VGPR input, strip out the readfirstlane to analyze the
1329   // incoming offset, in case there's an add of a constant. We'll have to put it
1330   // back later.
1331   if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
1332     Readfirstlane = OffsetDef;
1333     BaseOffset = OffsetDef->getOperand(1).getReg();
1334     OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1335   }
1336 
1337   if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
1338     // If we have a constant offset, try to use the 0 in m0 as the base.
1339     // TODO: Look into changing the default m0 initialization value. If the
    // default -1 only set the low 16 bits, we could leave it as-is and add 1 to
1341     // the immediate offset.
1342 
1343     ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
1344     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1345       .addImm(0);
1346   } else {
1347     std::tie(BaseOffset, ImmOffset, OffsetDef)
1348       = AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset);
1349 
1350     if (Readfirstlane) {
1351       // We have the constant offset now, so put the readfirstlane back on the
1352       // variable component.
1353       if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
1354         return false;
1355 
1356       Readfirstlane->getOperand(1).setReg(BaseOffset);
1357       BaseOffset = Readfirstlane->getOperand(0).getReg();
1358     } else {
1359       if (!RBI.constrainGenericRegister(BaseOffset,
1360                                         AMDGPU::SReg_32RegClass, *MRI))
1361         return false;
1362     }
1363 
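    // The GWS resource base lives in M0[21:16], so shift the variable part of
    // the offset up into that field before copying it into M0.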
1364     Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1365     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
1366       .addReg(BaseOffset)
1367       .addImm(16);
1368 
1369     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1370       .addReg(M0Base);
1371   }
1372 
1373   // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
1374   // offset field) % 64. Some versions of the programming guide omit the m0
1375   // part, or claim it's from offset 0.
1376   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));
1377 
1378   if (HasVSrc) {
1379     Register VSrc = MI.getOperand(1).getReg();
1380     MIB.addReg(VSrc);
1381     if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
1382       return false;
1383   }
1384 
1385   MIB.addImm(ImmOffset)
1386      .addImm(-1) // $gds
1387      .cloneMemRefs(MI);
1388 
1389   MI.eraseFromParent();
1390   return true;
1391 }
1392 
1393 bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
1394                                                       bool IsAppend) const {
1395   Register PtrBase = MI.getOperand(2).getReg();
1396   LLT PtrTy = MRI->getType(PtrBase);
1397   bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
1398 
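  // Try to fold a constant offset into the 16-bit DS offset field; whatever
  // base remains is copied into M0.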
1399   unsigned Offset;
1400   std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
1401 
1402   // TODO: Should this try to look through readfirstlane like GWS?
1403   if (!isDSOffsetLegal(PtrBase, Offset, 16)) {
1404     PtrBase = MI.getOperand(2).getReg();
1405     Offset = 0;
1406   }
1407 
1408   MachineBasicBlock *MBB = MI.getParent();
1409   const DebugLoc &DL = MI.getDebugLoc();
1410   const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
1411 
1412   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1413     .addReg(PtrBase);
1414   if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
1415     return false;
1416 
1417   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
1418     .addImm(Offset)
1419     .addImm(IsGDS ? -1 : 0)
1420     .cloneMemRefs(MI);
1421   MI.eraseFromParent();
1422   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1423 }
1424 
1425 static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
1426                          bool &IsTexFail) {
1427   if (TexFailCtrl)
1428     IsTexFail = true;
1429 
1430   TFE = (TexFailCtrl & 0x1) ? 1 : 0;
1431   TexFailCtrl &= ~(uint64_t)0x1;
1432   LWE = (TexFailCtrl & 0x2) ? 1 : 0;
1433   TexFailCtrl &= ~(uint64_t)0x2;
1434 
1435   return TexFailCtrl == 0;
1436 }
1437 
1438 static bool parseCachePolicy(uint64_t Value,
1439                              bool *GLC, bool *SLC, bool *DLC) {
1440   if (GLC) {
1441     *GLC = (Value & 0x1) ? 1 : 0;
1442     Value &= ~(uint64_t)0x1;
1443   }
1444   if (SLC) {
1445     *SLC = (Value & 0x2) ? 1 : 0;
1446     Value &= ~(uint64_t)0x2;
1447   }
1448   if (DLC) {
1449     *DLC = (Value & 0x4) ? 1 : 0;
1450     Value &= ~(uint64_t)0x4;
1451   }
1452 
1453   return Value == 0;
1454 }
1455 
1456 bool AMDGPUInstructionSelector::selectImageIntrinsic(
1457   MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
1458   MachineBasicBlock *MBB = MI.getParent();
1459   const DebugLoc &DL = MI.getDebugLoc();
1460 
1461   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
1462     AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1463 
1464   const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
1465   const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
1466       AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
1467   const AMDGPU::MIMGMIPMappingInfo *MIPMappingInfo =
1468       AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode);
1469   unsigned IntrOpcode = Intr->BaseOpcode;
1470   const bool IsGFX10 = STI.getGeneration() >= AMDGPUSubtarget::GFX10;
1471 
1472   const int VAddrIdx = getImageVAddrIdxBegin(BaseOpcode,
1473                                              MI.getNumExplicitDefs());
1474   int NumVAddr, NumGradients;
1475   std::tie(NumVAddr, NumGradients) = getImageNumVAddr(Intr, BaseOpcode);
1476 
1477   Register VDataIn, VDataOut;
1478   LLT VDataTy;
1479   int NumVDataDwords = -1;
1480   bool IsD16 = false;
1481 
1482   // XXX - Can we just get the second to last argument for ctrl?
1483   unsigned CtrlIdx; // Index of texfailctrl argument
1484   bool Unorm;
1485   if (!BaseOpcode->Sampler) {
1486     Unorm = true;
1487     CtrlIdx = VAddrIdx + NumVAddr + 1;
1488   } else {
1489     Unorm = MI.getOperand(VAddrIdx + NumVAddr + 2).getImm() != 0;
1490     CtrlIdx = VAddrIdx + NumVAddr + 3;
1491   }
1492 
1493   bool TFE;
1494   bool LWE;
1495   bool IsTexFail = false;
1496   if (!parseTexFail(MI.getOperand(CtrlIdx).getImm(), TFE, LWE, IsTexFail))
1497     return false;
1498 
1499   const int Flags = MI.getOperand(CtrlIdx + 2).getImm();
1500   const bool IsA16 = (Flags & 1) != 0;
1501   const bool IsG16 = (Flags & 2) != 0;
1502 
1503   // A16 implies 16-bit gradients
1504   if (IsA16 && !IsG16)
1505     return false;
1506 
1507   unsigned DMask = 0;
1508   unsigned DMaskLanes = 0;
1509 
1510   if (BaseOpcode->Atomic) {
1511     VDataOut = MI.getOperand(0).getReg();
1512     VDataIn = MI.getOperand(2).getReg();
1513     LLT Ty = MRI->getType(VDataIn);
1514 
1515     // Be careful to allow atomic swap on 16-bit element vectors.
1516     const bool Is64Bit = BaseOpcode->AtomicX2 ?
1517       Ty.getSizeInBits() == 128 :
1518       Ty.getSizeInBits() == 64;
1519 
1520     if (BaseOpcode->AtomicX2) {
1521       assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
1522 
1523       DMask = Is64Bit ? 0xf : 0x3;
1524       NumVDataDwords = Is64Bit ? 4 : 2;
1525     } else {
1526       DMask = Is64Bit ? 0x3 : 0x1;
1527       NumVDataDwords = Is64Bit ? 2 : 1;
1528     }
1529   } else {
1530     const int DMaskIdx = 2; // Input/output + intrinsic ID.
1531 
1532     DMask = MI.getOperand(DMaskIdx).getImm();
1533     DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);
1534 
1535     if (BaseOpcode->Store) {
1536       VDataIn = MI.getOperand(1).getReg();
1537       VDataTy = MRI->getType(VDataIn);
1538       NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
1539     } else {
1540       VDataOut = MI.getOperand(0).getReg();
1541       VDataTy = MRI->getType(VDataOut);
1542       NumVDataDwords = DMaskLanes;
1543 
1544       // One memoperand is mandatory, except for getresinfo.
1545       // FIXME: Check this in verifier.
1546       if (!MI.memoperands_empty()) {
1547         const MachineMemOperand *MMO = *MI.memoperands_begin();
1548 
1549         // Infer d16 from the memory size, as the register type will be
1550         // mangled by unpacked subtargets, or by TFE.
1551         IsD16 = ((8 * MMO->getSize()) / DMaskLanes) < 32;
1552 
1553         if (IsD16 && !STI.hasUnpackedD16VMem())
1554           NumVDataDwords = (DMaskLanes + 1) / 2;
1555       }
1556     }
1557   }
1558 
1559   // Optimize _L to _LZ when the LOD argument is zero
1560   if (LZMappingInfo) {
1561     // The legalizer replaced the register with an immediate 0 if we need to
1562     // change the opcode.
1563     const MachineOperand &Lod = MI.getOperand(VAddrIdx + NumVAddr - 1);
1564     if (Lod.isImm()) {
1565       assert(Lod.getImm() == 0);
1566       IntrOpcode = LZMappingInfo->LZ;  // set new opcode to _lz variant of _l
1567     }
1568   }
1569 
1570   // Optimize _mip away when 'lod' is zero
1571   if (MIPMappingInfo) {
1572     const MachineOperand &Lod = MI.getOperand(VAddrIdx + NumVAddr - 1);
1573     if (Lod.isImm()) {
1574       assert(Lod.getImm() == 0);
1575       IntrOpcode = MIPMappingInfo->NONMIP;  // set new opcode to variant without _mip
1576     }
1577   }
1578 
1579   // Set G16 opcode
1580   if (IsG16 && !IsA16) {
1581     const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
1582         AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
1583     assert(G16MappingInfo);
1584     IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
1585   }
1586 
1587   // TODO: Check this in verifier.
1588   assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
1589 
1590   bool GLC = false;
1591   bool SLC = false;
1592   bool DLC = false;
1593   if (BaseOpcode->Atomic) {
1594     GLC = true; // TODO no-return optimization
1595     if (!parseCachePolicy(MI.getOperand(CtrlIdx + 1).getImm(), nullptr, &SLC,
1596                           IsGFX10 ? &DLC : nullptr))
1597       return false;
1598   } else {
1599     if (!parseCachePolicy(MI.getOperand(CtrlIdx + 1).getImm(), &GLC, &SLC,
1600                           IsGFX10 ? &DLC : nullptr))
1601       return false;
1602   }
1603 
1604   int NumVAddrRegs = 0;
1605   int NumVAddrDwords = 0;
1606   for (int I = 0; I < NumVAddr; ++I) {
1607     // Skip the $noregs and 0s inserted during legalization.
1608     MachineOperand &AddrOp = MI.getOperand(VAddrIdx + I);
1609     if (!AddrOp.isReg())
1610       continue; // XXX - Break?
1611 
1612     Register Addr = AddrOp.getReg();
1613     if (!Addr)
1614       break;
1615 
1616     ++NumVAddrRegs;
1617     NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
1618   }
1619 
1620   // The legalizer preprocessed the intrinsic arguments. If we aren't using
1621   // NSA, these should have been packed into a single value in the first
1622   // address register.
1623   const bool UseNSA = NumVAddrRegs != 1 && NumVAddrDwords == NumVAddrRegs;
1624   if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
1625     LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
1626     return false;
1627   }
1628 
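       // A texfail (TFE/LWE) result returns one extra dword for the status
       // value.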
1629   if (IsTexFail)
1630     ++NumVDataDwords;
1631 
1632   int Opcode = -1;
1633   if (IsGFX10) {
1634     Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
1635                                    UseNSA ? AMDGPU::MIMGEncGfx10NSA
1636                                           : AMDGPU::MIMGEncGfx10Default,
1637                                    NumVDataDwords, NumVAddrDwords);
1638   } else {
1639     if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
1640       Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
1641                                      NumVDataDwords, NumVAddrDwords);
1642     if (Opcode == -1)
1643       Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
1644                                      NumVDataDwords, NumVAddrDwords);
1645   }
1646   assert(Opcode != -1);
1647 
1648   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
1649     .cloneMemRefs(MI);
1650 
1651   if (VDataOut) {
1652     if (BaseOpcode->AtomicX2) {
1653       const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
1654 
1655       Register TmpReg = MRI->createVirtualRegister(
1656         Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
1657       unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
1658 
1659       MIB.addDef(TmpReg);
1660       BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
1661         .addReg(TmpReg, RegState::Kill, SubReg);
1662 
1663     } else {
1664       MIB.addDef(VDataOut); // vdata output
1665     }
1666   }
1667 
1668   if (VDataIn)
1669     MIB.addReg(VDataIn); // vdata input
1670 
1671   for (int i = 0; i != NumVAddrRegs; ++i) {
1672     MachineOperand &SrcOp = MI.getOperand(VAddrIdx + i);
1673     if (SrcOp.isReg()) {
1674       assert(SrcOp.getReg() != 0);
1675       MIB.addReg(SrcOp.getReg());
1676     }
1677   }
1678 
1679   MIB.addReg(MI.getOperand(VAddrIdx + NumVAddr).getReg()); // rsrc
1680   if (BaseOpcode->Sampler)
1681     MIB.addReg(MI.getOperand(VAddrIdx + NumVAddr + 1).getReg()); // sampler
1682 
1683   MIB.addImm(DMask); // dmask
1684 
1685   if (IsGFX10)
1686     MIB.addImm(DimInfo->Encoding);
1687   MIB.addImm(Unorm);
1688   if (IsGFX10)
1689     MIB.addImm(DLC);
1690 
1691   MIB.addImm(GLC);
1692   MIB.addImm(SLC);
1693   MIB.addImm(IsA16 &&  // a16 or r128
1694              STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
1695   if (IsGFX10)
1696     MIB.addImm(IsA16 ? -1 : 0);
1697 
1698   MIB.addImm(TFE); // tfe
1699   MIB.addImm(LWE); // lwe
1700   if (!IsGFX10)
1701     MIB.addImm(DimInfo->DA ? -1 : 0);
1702   if (BaseOpcode->HasD16)
1703     MIB.addImm(IsD16 ? -1 : 0);
1704 
1705   MI.eraseFromParent();
1706   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1707 }
1708 
1709 bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
1710     MachineInstr &I) const {
1711   unsigned IntrinsicID = I.getIntrinsicID();
1712   switch (IntrinsicID) {
1713   case Intrinsic::amdgcn_end_cf:
1714     return selectEndCfIntrinsic(I);
1715   case Intrinsic::amdgcn_ds_ordered_add:
1716   case Intrinsic::amdgcn_ds_ordered_swap:
1717     return selectDSOrderedIntrinsic(I, IntrinsicID);
1718   case Intrinsic::amdgcn_ds_gws_init:
1719   case Intrinsic::amdgcn_ds_gws_barrier:
1720   case Intrinsic::amdgcn_ds_gws_sema_v:
1721   case Intrinsic::amdgcn_ds_gws_sema_br:
1722   case Intrinsic::amdgcn_ds_gws_sema_p:
1723   case Intrinsic::amdgcn_ds_gws_sema_release_all:
1724     return selectDSGWSIntrinsic(I, IntrinsicID);
1725   case Intrinsic::amdgcn_ds_append:
1726     return selectDSAppendConsume(I, true);
1727   case Intrinsic::amdgcn_ds_consume:
1728     return selectDSAppendConsume(I, false);
1729   default: {
1730     return selectImpl(I, *CoverageInfo);
1731   }
1732   }
1733 }
1734 
1735 bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
1736   if (selectImpl(I, *CoverageInfo))
1737     return true;
1738 
1739   MachineBasicBlock *BB = I.getParent();
1740   const DebugLoc &DL = I.getDebugLoc();
1741 
1742   Register DstReg = I.getOperand(0).getReg();
1743   unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
1744   assert(Size <= 32 || Size == 64);
1745   const MachineOperand &CCOp = I.getOperand(1);
1746   Register CCReg = CCOp.getReg();
1747   if (!isVCC(CCReg, *MRI)) {
1748     unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
1749                                          AMDGPU::S_CSELECT_B32;
1750     MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
1751             .addReg(CCReg);
1752 
1753     // The generic constrainSelectedInstRegOperands doesn't work for the scc
1754     // register bank, because it does not cover the register class we use to
1755     // represent it. So we need to manually set the register class here.
1756     if (!MRI->getRegClassOrNull(CCReg))
1757       MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
1758     MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
1759             .add(I.getOperand(2))
1760             .add(I.getOperand(3));
1761 
1762     bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI) |
1763                constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
1764     I.eraseFromParent();
1765     return Ret;
1766   }
1767 
1768   // Wide VGPR select should have been split in RegBankSelect.
1769   if (Size > 32)
1770     return false;
1771 
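       // The condition is in vcc: use v_cndmask_b32_e64. The zero immediates
       // are the (unused) source modifier operands.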
1772   MachineInstr *Select =
1773       BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1774               .addImm(0)
1775               .add(I.getOperand(3))
1776               .addImm(0)
1777               .add(I.getOperand(2))
1778               .add(I.getOperand(1));
1779 
1780   bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
1781   I.eraseFromParent();
1782   return Ret;
1783 }
1784 
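     // Return the subregister index covering the low bits of a value of the
     // given size, or -1 if there is no suitable subregister.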
1785 static int sizeToSubRegIndex(unsigned Size) {
1786   switch (Size) {
1787   case 32:
1788     return AMDGPU::sub0;
1789   case 64:
1790     return AMDGPU::sub0_sub1;
1791   case 96:
1792     return AMDGPU::sub0_sub1_sub2;
1793   case 128:
1794     return AMDGPU::sub0_sub1_sub2_sub3;
1795   case 256:
1796     return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
1797   default:
1798     if (Size < 32)
1799       return AMDGPU::sub0;
1800     if (Size > 256)
1801       return -1;
1802     return sizeToSubRegIndex(PowerOf2Ceil(Size));
1803   }
1804 }
1805 
1806 bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
1807   Register DstReg = I.getOperand(0).getReg();
1808   Register SrcReg = I.getOperand(1).getReg();
1809   const LLT DstTy = MRI->getType(DstReg);
1810   const LLT SrcTy = MRI->getType(SrcReg);
1811   const LLT S1 = LLT::scalar(1);
1812 
1813   const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
1814   const RegisterBank *DstRB;
1815   if (DstTy == S1) {
1816     // This is a special case. We don't treat s1 for legalization artifacts as
1817     // vcc booleans.
1818     DstRB = SrcRB;
1819   } else {
1820     DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1821     if (SrcRB != DstRB)
1822       return false;
1823   }
1824 
1825   const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
1826 
1827   unsigned DstSize = DstTy.getSizeInBits();
1828   unsigned SrcSize = SrcTy.getSizeInBits();
1829 
1830   const TargetRegisterClass *SrcRC
1831     = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, *MRI);
1832   const TargetRegisterClass *DstRC
1833     = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, *MRI);
1834   if (!SrcRC || !DstRC)
1835     return false;
1836 
1837   if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
1838       !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
1839     LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
1840     return false;
1841   }
1842 
1843   if (DstTy == LLT::vector(2, 16) && SrcTy == LLT::vector(2, 32)) {
1844     MachineBasicBlock *MBB = I.getParent();
1845     const DebugLoc &DL = I.getDebugLoc();
1846 
1847     Register LoReg = MRI->createVirtualRegister(DstRC);
1848     Register HiReg = MRI->createVirtualRegister(DstRC);
1849     BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
1850       .addReg(SrcReg, 0, AMDGPU::sub0);
1851     BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
1852       .addReg(SrcReg, 0, AMDGPU::sub1);
1853 
1854     if (IsVALU && STI.hasSDWA()) {
1855       // Write the low 16-bits of the high element into the high 16-bits of the
1856       // low element.
1857       MachineInstr *MovSDWA =
1858         BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
1859         .addImm(0)                             // $src0_modifiers
1860         .addReg(HiReg)                         // $src0
1861         .addImm(0)                             // $clamp
1862         .addImm(AMDGPU::SDWA::WORD_1)          // $dst_sel
1863         .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
1864         .addImm(AMDGPU::SDWA::WORD_0)          // $src0_sel
1865         .addReg(LoReg, RegState::Implicit);
1866       MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
1867     } else {
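           // Without SDWA, materialize the packed result as
           // (HiReg << 16) | (LoReg & 0xffff).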
1868       Register TmpReg0 = MRI->createVirtualRegister(DstRC);
1869       Register TmpReg1 = MRI->createVirtualRegister(DstRC);
1870       Register ImmReg = MRI->createVirtualRegister(DstRC);
1871       if (IsVALU) {
1872         BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
1873           .addImm(16)
1874           .addReg(HiReg);
1875       } else {
1876         BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
1877           .addReg(HiReg)
1878           .addImm(16);
1879       }
1880 
1881       unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
1882       unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
1883       unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
1884 
1885       BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
1886         .addImm(0xffff);
1887       BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
1888         .addReg(LoReg)
1889         .addReg(ImmReg);
1890       BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
1891         .addReg(TmpReg0)
1892         .addReg(TmpReg1);
1893     }
1894 
1895     I.eraseFromParent();
1896     return true;
1897   }
1898 
1899   if (!DstTy.isScalar())
1900     return false;
1901 
1902   if (SrcSize > 32) {
1903     int SubRegIdx = sizeToSubRegIndex(DstSize);
1904     if (SubRegIdx == -1)
1905       return false;
1906 
1907     // Deal with weird cases where the class only partially supports the subreg
1908     // index.
1909     const TargetRegisterClass *SrcWithSubRC
1910       = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
1911     if (!SrcWithSubRC)
1912       return false;
1913 
1914     if (SrcWithSubRC != SrcRC) {
1915       if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
1916         return false;
1917     }
1918 
1919     I.getOperand(1).setSubReg(SubRegIdx);
1920   }
1921 
1922   I.setDesc(TII.get(TargetOpcode::COPY));
1923   return true;
1924 }
1925 
1926 /// \returns true if a bitmask for \p Size bits will be an inline immediate.
1927 static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
1928   Mask = maskTrailingOnes<unsigned>(Size);
1929   int SignedMask = static_cast<int>(Mask);
1930   return SignedMask >= -16 && SignedMask <= 64;
1931 }
1932 
1933 // Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
1934 const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
1935   Register Reg, const MachineRegisterInfo &MRI,
1936   const TargetRegisterInfo &TRI) const {
1937   const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
1938   if (auto *RB = RegClassOrBank.dyn_cast<const RegisterBank *>())
1939     return RB;
1940 
1941   // Ignore the type, since we don't use vcc in artifacts.
1942   if (auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>())
1943     return &RBI.getRegBankFromRegClass(*RC, LLT());
1944   return nullptr;
1945 }
1946 
1947 bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
1948   bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
1949   bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
1950   const DebugLoc &DL = I.getDebugLoc();
1951   MachineBasicBlock &MBB = *I.getParent();
1952   const Register DstReg = I.getOperand(0).getReg();
1953   const Register SrcReg = I.getOperand(1).getReg();
1954 
1955   const LLT DstTy = MRI->getType(DstReg);
1956   const LLT SrcTy = MRI->getType(SrcReg);
1957   const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
1958     I.getOperand(2).getImm() : SrcTy.getSizeInBits();
1959   const unsigned DstSize = DstTy.getSizeInBits();
1960   if (!DstTy.isScalar())
1961     return false;
1962 
1963   // Artifact casts should never use vcc.
1964   const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
1965 
1966   // FIXME: This should probably be illegal and split earlier.
1967   if (I.getOpcode() == AMDGPU::G_ANYEXT) {
1968     if (DstSize <= 32)
1969       return selectCOPY(I);
1970 
1971     const TargetRegisterClass *SrcRC =
1972         TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank, *MRI);
1973     const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1974     const TargetRegisterClass *DstRC =
1975         TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
1976 
1977     Register UndefReg = MRI->createVirtualRegister(SrcRC);
1978     BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
1979     BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
1980       .addReg(SrcReg)
1981       .addImm(AMDGPU::sub0)
1982       .addReg(UndefReg)
1983       .addImm(AMDGPU::sub1);
1984     I.eraseFromParent();
1985 
1986     return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
1987            RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
1988   }
1989 
1990   if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
1991     // 64-bit should have been split up in RegBankSelect
1992 
1993     // Try to use an and with a mask if it will save code size.
1994     unsigned Mask;
1995     if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
1996       MachineInstr *ExtI =
1997       BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
1998         .addImm(Mask)
1999         .addReg(SrcReg);
2000       I.eraseFromParent();
2001       return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2002     }
2003 
2004     const unsigned BFE = Signed ? AMDGPU::V_BFE_I32 : AMDGPU::V_BFE_U32;
2005     MachineInstr *ExtI =
2006       BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
2007       .addReg(SrcReg)
2008       .addImm(0) // Offset
2009       .addImm(SrcSize); // Width
2010     I.eraseFromParent();
2011     return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2012   }
2013 
2014   if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
2015     const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
2016       AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
2017     if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
2018       return false;
2019 
2020     if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
2021       const unsigned SextOpc = SrcSize == 8 ?
2022         AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
2023       BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
2024         .addReg(SrcReg);
2025       I.eraseFromParent();
2026       return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2027     }
2028 
2029     const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
2030     const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2031 
2032     // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16] = width.
2033     if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
2034       // We need a 64-bit register source, but the high bits don't matter.
2035       Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
2036       Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2037       unsigned SubReg = InReg ? AMDGPU::sub0 : 0;
2038 
2039       BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2040       BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
2041         .addReg(SrcReg, 0, SubReg)
2042         .addImm(AMDGPU::sub0)
2043         .addReg(UndefReg)
2044         .addImm(AMDGPU::sub1);
2045 
2046       BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
2047         .addReg(ExtReg)
2048         .addImm(SrcSize << 16);
2049 
2050       I.eraseFromParent();
2051       return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
2052     }
2053 
2054     unsigned Mask;
2055     if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2056       BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
2057         .addReg(SrcReg)
2058         .addImm(Mask);
2059     } else {
2060       BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
2061         .addReg(SrcReg)
2062         .addImm(SrcSize << 16);
2063     }
2064 
2065     I.eraseFromParent();
2066     return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2067   }
2068 
2069   return false;
2070 }
2071 
2072 bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
2073   MachineBasicBlock *BB = I.getParent();
2074   MachineOperand &ImmOp = I.getOperand(1);
2075   Register DstReg = I.getOperand(0).getReg();
2076   unsigned Size = MRI->getType(DstReg).getSizeInBits();
2077 
2078   // The AMDGPU backend only supports Imm operands and not CImm or FPImm.
2079   if (ImmOp.isFPImm()) {
2080     const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt();
2081     ImmOp.ChangeToImmediate(Imm.getZExtValue());
2082   } else if (ImmOp.isCImm()) {
2083     ImmOp.ChangeToImmediate(ImmOp.getCImm()->getSExtValue());
2084   } else {
2085     llvm_unreachable("Not supported by g_constants");
2086   }
2087 
2088   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2089   const bool IsSgpr = DstRB->getID() == AMDGPU::SGPRRegBankID;
2090 
2091   unsigned Opcode;
2092   if (DstRB->getID() == AMDGPU::VCCRegBankID) {
2093     Opcode = STI.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
2094   } else {
2095     Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
2096 
2097     // We should never produce s1 values on banks other than VCC. If the user of
2098     // this already constrained the register, we may incorrectly think it's VCC
2099     // if it wasn't originally.
2100     if (Size == 1)
2101       return false;
2102   }
2103 
2104   if (Size != 64) {
2105     I.setDesc(TII.get(Opcode));
2106     I.addImplicitDefUseOperands(*MF);
2107     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2108   }
2109 
2110   const DebugLoc &DL = I.getDebugLoc();
2111 
2112   APInt Imm(Size, I.getOperand(1).getImm());
2113 
2114   MachineInstr *ResInst;
2115   if (IsSgpr && TII.isInlineConstant(Imm)) {
2116     ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
2117       .addImm(I.getOperand(1).getImm());
2118   } else {
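         // Non-inline (or VGPR) 64-bit constants are built from two 32-bit
         // moves combined with a REG_SEQUENCE.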
2119     const TargetRegisterClass *RC = IsSgpr ?
2120       &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass;
2121     Register LoReg = MRI->createVirtualRegister(RC);
2122     Register HiReg = MRI->createVirtualRegister(RC);
2123 
2124     BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg)
2125       .addImm(Imm.trunc(32).getZExtValue());
2126 
2127     BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg)
2128       .addImm(Imm.ashr(32).getZExtValue());
2129 
2130     ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2131       .addReg(LoReg)
2132       .addImm(AMDGPU::sub0)
2133       .addReg(HiReg)
2134       .addImm(AMDGPU::sub1);
2135   }
2136 
2137   // We can't call constrainSelectedInstRegOperands here, because it doesn't
2138   // work for target-independent opcodes.
2139   I.eraseFromParent();
2140   const TargetRegisterClass *DstRC =
2141     TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI);
2142   if (!DstRC)
2143     return true;
2144   return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI);
2145 }
2146 
2147 bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2148   // Only manually handle the f64 SGPR case.
2149   //
2150   // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2151   // the bit ops theoretically have a second result due to the implicit def of
2152   // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2153   // that is easy by disabling the check. The result works, but uses a
2154   // nonsensical sreg32orlds_and_sreg_1 regclass.
2155   //
2156   // The DAG emitter is more problematic, and incorrectly adds both results of
2157   // the S_XOR_B32 to the variadic REG_SEQUENCE operands.
2158 
2159   Register Dst = MI.getOperand(0).getReg();
2160   const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2161   if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2162       MRI->getType(Dst) != LLT::scalar(64))
2163     return false;
2164 
2165   Register Src = MI.getOperand(1).getReg();
2166   MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
2167   if (Fabs)
2168     Src = Fabs->getOperand(1).getReg();
2169 
2170   if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2171       !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2172     return false;
2173 
2174   MachineBasicBlock *BB = MI.getParent();
2175   const DebugLoc &DL = MI.getDebugLoc();
2176   Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2177   Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2178   Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2179   Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2180 
2181   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2182     .addReg(Src, 0, AMDGPU::sub0);
2183   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2184     .addReg(Src, 0, AMDGPU::sub1);
2185   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2186     .addImm(0x80000000);
2187 
2188   // Set or toggle sign bit.
2189   unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2190   BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
2191     .addReg(HiReg)
2192     .addReg(ConstReg);
2193   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2194     .addReg(LoReg)
2195     .addImm(AMDGPU::sub0)
2196     .addReg(OpReg)
2197     .addImm(AMDGPU::sub1);
2198   MI.eraseFromParent();
2199   return true;
2200 }
2201 
2202 // FIXME: This is a workaround for the same tablegen problems as G_FNEG
2203 bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2204   Register Dst = MI.getOperand(0).getReg();
2205   const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2206   if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2207       MRI->getType(Dst) != LLT::scalar(64))
2208     return false;
2209 
2210   Register Src = MI.getOperand(1).getReg();
2211   MachineBasicBlock *BB = MI.getParent();
2212   const DebugLoc &DL = MI.getDebugLoc();
2213   Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2214   Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2215   Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2216   Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2217 
2218   if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2219       !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2220     return false;
2221 
2222   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2223     .addReg(Src, 0, AMDGPU::sub0);
2224   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2225     .addReg(Src, 0, AMDGPU::sub1);
2226   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2227     .addImm(0x7fffffff);
2228 
2229   // Clear sign bit.
2230   // TODO: Should this use S_BITSET0_*?
2231   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
2232     .addReg(HiReg)
2233     .addReg(ConstReg);
2234   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2235     .addReg(LoReg)
2236     .addImm(AMDGPU::sub0)
2237     .addReg(OpReg)
2238     .addImm(AMDGPU::sub1);
2239 
2240   MI.eraseFromParent();
2241   return true;
2242 }
2243 
2244 static bool isConstant(const MachineInstr &MI) {
2245   return MI.getOpcode() == TargetOpcode::G_CONSTANT;
2246 }
2247 
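     // Walk the chain of G_PTR_ADDs feeding the load's address, recording the
     // constant offset and the SGPR/VGPR parts at each level.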
2248 void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
2249     const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
2250 
2251   const MachineInstr *PtrMI = MRI.getUniqueVRegDef(Load.getOperand(1).getReg());
2252 
2253   assert(PtrMI);
2254 
2255   if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
2256     return;
2257 
2258   GEPInfo GEPInfo(*PtrMI);
2259 
2260   for (unsigned i = 1; i != 3; ++i) {
2261     const MachineOperand &GEPOp = PtrMI->getOperand(i);
2262     const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
2263     assert(OpDef);
2264     if (i == 2 && isConstant(*OpDef)) {
2265       // TODO: Could handle constant base + variable offset, but a combine
2266       // probably should have commuted it.
2267       assert(GEPInfo.Imm == 0);
2268       GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
2269       continue;
2270     }
2271     const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
2272     if (OpBank->getID() == AMDGPU::SGPRRegBankID)
2273       GEPInfo.SgprParts.push_back(GEPOp.getReg());
2274     else
2275       GEPInfo.VgprParts.push_back(GEPOp.getReg());
2276   }
2277 
2278   AddrInfo.push_back(GEPInfo);
2279   getAddrModeInfo(*PtrMI, MRI, AddrInfo);
2280 }
2281 
2282 bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
2283   if (!MI.hasOneMemOperand())
2284     return false;
2285 
2286   const MachineMemOperand *MMO = *MI.memoperands_begin();
2287   const Value *Ptr = MMO->getValue();
2288 
2289   // UndefValue means this is a load of a kernel input.  These are uniform.
2290   // Sometimes LDS instructions have constant pointers.
2291   // If Ptr is null, then that means this mem operand contains a
2292   // PseudoSourceValue like GOT.
2293   if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
2294       isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
2295     return true;
2296 
2297   if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
2298     return true;
2299 
2300   const Instruction *I = dyn_cast<Instruction>(Ptr);
2301   return I && I->getMetadata("amdgpu.uniform");
2302 }
2303 
2304 bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
2305   for (const GEPInfo &GEPInfo : AddrInfo) {
2306     if (!GEPInfo.VgprParts.empty())
2307       return true;
2308   }
2309   return false;
2310 }
2311 
2312 void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
2313   const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
2314   unsigned AS = PtrTy.getAddressSpace();
2315   if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
2316       STI.ldsRequiresM0Init()) {
2317     MachineBasicBlock *BB = I.getParent();
2318 
2319     // If DS instructions require M0 initialization, insert it before selecting.
2320     BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2321       .addImm(-1);
2322   }
2323 }
2324 
2325 bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
2326   MachineInstr &I) const {
2327   initM0(I);
2328   return selectImpl(I, *CoverageInfo);
2329 }
2330 
2331 // TODO: No rtn optimization.
2332 bool AMDGPUInstructionSelector::selectG_AMDGPU_ATOMIC_CMPXCHG(
2333   MachineInstr &MI) const {
2334   Register PtrReg = MI.getOperand(1).getReg();
2335   const LLT PtrTy = MRI->getType(PtrReg);
2336   if (PtrTy.getAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
2337       STI.useFlatForGlobal())
2338     return selectImpl(MI, *CoverageInfo);
2339 
2340   Register DstReg = MI.getOperand(0).getReg();
2341   const LLT Ty = MRI->getType(DstReg);
2342   const bool Is64 = Ty.getSizeInBits() == 64;
2343   const unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
2344   Register TmpReg = MRI->createVirtualRegister(
2345     Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
2346 
2347   const DebugLoc &DL = MI.getDebugLoc();
2348   MachineBasicBlock *BB = MI.getParent();
2349 
2350   Register VAddr, RSrcReg, SOffset;
2351   int64_t Offset = 0;
2352 
2353   unsigned Opcode;
2354   if (selectMUBUFOffsetImpl(MI.getOperand(1), RSrcReg, SOffset, Offset)) {
2355     Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN :
2356                              AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN;
2357   } else if (selectMUBUFAddr64Impl(MI.getOperand(1), VAddr,
2358                                    RSrcReg, SOffset, Offset)) {
2359     Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN :
2360                     AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN;
2361   } else
2362     return selectImpl(MI, *CoverageInfo);
2363 
2364   auto MIB = BuildMI(*BB, &MI, DL, TII.get(Opcode), TmpReg)
2365     .addReg(MI.getOperand(2).getReg());
2366 
2367   if (VAddr)
2368     MIB.addReg(VAddr);
2369 
2370   MIB.addReg(RSrcReg);
2371   if (SOffset)
2372     MIB.addReg(SOffset);
2373   else
2374     MIB.addImm(0);
2375 
2376   MIB.addImm(Offset);
2377   MIB.addImm(0); // slc
2378   MIB.cloneMemRefs(MI);
2379 
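       // The returned (original) memory value lands in the low half of the
       // combined data register; extract it with a subregister copy.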
2380   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), DstReg)
2381     .addReg(TmpReg, RegState::Kill, SubReg);
2382 
2383   MI.eraseFromParent();
2384 
2385   MRI->setRegClass(
2386     DstReg, Is64 ? &AMDGPU::VReg_64RegClass : &AMDGPU::VGPR_32RegClass);
2387   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2388 }
2389 
2390 bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
2391   MachineBasicBlock *BB = I.getParent();
2392   MachineOperand &CondOp = I.getOperand(0);
2393   Register CondReg = CondOp.getReg();
2394   const DebugLoc &DL = I.getDebugLoc();
2395 
2396   unsigned BrOpcode;
2397   Register CondPhysReg;
2398   const TargetRegisterClass *ConstrainRC;
2399 
2400   // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
2401   // whether the branch is uniform when selecting the instruction. In
2402   // GlobalISel, we should push that decision into RegBankSelect. Assume for now
2403   // RegBankSelect knows what it's doing if the branch condition is scc, even
2404   // though it currently does not.
2405   if (!isVCC(CondReg, *MRI)) {
2406     if (MRI->getType(CondReg) != LLT::scalar(32))
2407       return false;
2408 
2409     CondPhysReg = AMDGPU::SCC;
2410     BrOpcode = AMDGPU::S_CBRANCH_SCC1;
2411     ConstrainRC = &AMDGPU::SReg_32RegClass;
2412   } else {
2413     // FIXME: Do we have to insert an and with exec here, like in SelectionDAG?
2414     // We sort of know, based on the register bank, that a VCC producer ands
2415     // inactive lanes with 0. What if there was a logical operation with vcc
2416     // producers in different blocks/with different exec masks?
2417     // FIXME: Should scc->vcc copies and with exec?
2418     CondPhysReg = TRI.getVCC();
2419     BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
2420     ConstrainRC = TRI.getBoolRC();
2421   }
2422 
2423   if (!MRI->getRegClassOrNull(CondReg))
2424     MRI->setRegClass(CondReg, ConstrainRC);
2425 
2426   BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
2427     .addReg(CondReg);
2428   BuildMI(*BB, &I, DL, TII.get(BrOpcode))
2429     .addMBB(I.getOperand(1).getMBB());
2430 
2431   I.eraseFromParent();
2432   return true;
2433 }
2434 
2435 bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
2436   MachineInstr &I) const {
2437   Register DstReg = I.getOperand(0).getReg();
2438   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2439   const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2440   I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
2441   if (IsVGPR)
2442     I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
2443 
2444   return RBI.constrainGenericRegister(
2445     DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
2446 }
2447 
2448 bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
2449   Register DstReg = I.getOperand(0).getReg();
2450   Register SrcReg = I.getOperand(1).getReg();
2451   Register MaskReg = I.getOperand(2).getReg();
2452   LLT Ty = MRI->getType(DstReg);
2453   LLT MaskTy = MRI->getType(MaskReg);
2454 
2455   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2456   const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2457   const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
2458   const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2459   if (DstRB != SrcRB) // Should only happen for hand written MIR.
2460     return false;
2461 
2462   unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2463   const TargetRegisterClass &RegRC
2464     = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
2465 
2466   const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB,
2467                                                                   *MRI);
2468   const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB,
2469                                                                   *MRI);
2470   const TargetRegisterClass *MaskRC =
2471       TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB, *MRI);
2472 
2473   if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
2474       !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2475       !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
2476     return false;
2477 
2478   MachineBasicBlock *BB = I.getParent();
2479   const DebugLoc &DL = I.getDebugLoc();
2480   if (Ty.getSizeInBits() == 32) {
2481     assert(MaskTy.getSizeInBits() == 32 &&
2482            "ptrmask should have been narrowed during legalize");
2483 
2484     BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
2485       .addReg(SrcReg)
2486       .addReg(MaskReg);
2487     I.eraseFromParent();
2488     return true;
2489   }
2490 
2491   Register HiReg = MRI->createVirtualRegister(&RegRC);
2492   Register LoReg = MRI->createVirtualRegister(&RegRC);
2493 
2494   // Extract the subregisters from the source pointer.
2495   BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
2496     .addReg(SrcReg, 0, AMDGPU::sub0);
2497   BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
2498     .addReg(SrcReg, 0, AMDGPU::sub1);
2499 
2500   Register MaskedLo, MaskedHi;
2501 
2502   // Try to avoid emitting a bit operation when we only need to touch half of
2503   // the 64-bit pointer.
2504   APInt MaskOnes = KnownBits->getKnownOnes(MaskReg).zextOrSelf(64);
2505 
2506   const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
2507   const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
2508   if ((MaskOnes & MaskLo32) == MaskLo32) {
2509     // If all the bits in the low half are 1, we only need a copy for it.
2510     MaskedLo = LoReg;
2511   } else {
2512     // Extract the mask subregister and apply the and.
2513     Register MaskLo = MRI->createVirtualRegister(&RegRC);
2514     MaskedLo = MRI->createVirtualRegister(&RegRC);
2515 
2516     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
2517       .addReg(MaskReg, 0, AMDGPU::sub0);
2518     BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
2519       .addReg(LoReg)
2520       .addReg(MaskLo);
2521   }
2522 
2523   if ((MaskOnes & MaskHi32) == MaskHi32) {
2524     // If all the bits in the high half are 1, we only need a copy for it.
2525     MaskedHi = HiReg;
2526   } else {
2527     Register MaskHi = MRI->createVirtualRegister(&RegRC);
2528     MaskedHi = MRI->createVirtualRegister(&RegRC);
2529 
2530     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
2531       .addReg(MaskReg, 0, AMDGPU::sub1);
2532     BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
2533       .addReg(HiReg)
2534       .addReg(MaskHi);
2535   }
2536 
2537   BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2538     .addReg(MaskedLo)
2539     .addImm(AMDGPU::sub0)
2540     .addReg(MaskedHi)
2541     .addImm(AMDGPU::sub1);
2542   I.eraseFromParent();
2543   return true;
2544 }
2545 
2546 /// Return the register to use for the index value, and the subregister to use
2547 /// for the indirectly accessed register.
2548 static std::pair<Register, unsigned>
2549 computeIndirectRegIndex(MachineRegisterInfo &MRI,
2550                         const SIRegisterInfo &TRI,
2551                         const TargetRegisterClass *SuperRC,
2552                         Register IdxReg,
2553                         unsigned EltSize) {
2554   Register IdxBaseReg;
2555   int Offset;
2556   MachineInstr *Unused;
2557 
2558   std::tie(IdxBaseReg, Offset, Unused)
2559     = AMDGPU::getBaseWithConstantOffset(MRI, IdxReg);
2560   if (IdxBaseReg == AMDGPU::NoRegister) {
2561     // This will happen if the index is a known constant. This should ordinarily
2562     // be legalized out, but handle it as a register just in case.
2563     assert(Offset == 0);
2564     IdxBaseReg = IdxReg;
2565   }
2566 
2567   ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
2568 
2569   // Skip out-of-bounds offsets, or else we would end up using an undefined
2570   // register.
2571   if (static_cast<unsigned>(Offset) >= SubRegs.size())
2572     return std::make_pair(IdxReg, SubRegs[0]);
2573   return std::make_pair(IdxBaseReg, SubRegs[Offset]);
2574 }
2575 
2576 bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
2577   MachineInstr &MI) const {
2578   Register DstReg = MI.getOperand(0).getReg();
2579   Register SrcReg = MI.getOperand(1).getReg();
2580   Register IdxReg = MI.getOperand(2).getReg();
2581 
2582   LLT DstTy = MRI->getType(DstReg);
2583   LLT SrcTy = MRI->getType(SrcReg);
2584 
2585   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2586   const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2587   const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
2588 
2589   // The index must be scalar. If it wasn't, RegBankSelect should have moved
2590   // this into a waterfall loop.
2591   if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
2592     return false;
2593 
2594   const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB,
2595                                                                   *MRI);
2596   const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(DstTy, *DstRB,
2597                                                                   *MRI);
2598   if (!SrcRC || !DstRC)
2599     return false;
2600   if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2601       !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
2602       !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
2603     return false;
2604 
2605   MachineBasicBlock *BB = MI.getParent();
2606   const DebugLoc &DL = MI.getDebugLoc();
2607   const bool Is64 = DstTy.getSizeInBits() == 64;
2608 
2609   unsigned SubReg;
2610   std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, SrcRC, IdxReg,
2611                                                      DstTy.getSizeInBits() / 8);
2612 
2613   if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
2614     if (DstTy.getSizeInBits() != 32 && !Is64)
2615       return false;
2616 
2617     BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2618       .addReg(IdxReg);
2619 
2620     unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
2621     BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
2622       .addReg(SrcReg, 0, SubReg)
2623       .addReg(SrcReg, RegState::Implicit);
2624     MI.eraseFromParent();
2625     return true;
2626   }
2627 
2628   if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
2629     return false;
2630 
2631   if (!STI.useVGPRIndexMode()) {
2632     BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2633       .addReg(IdxReg);
2634     BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
2635       .addReg(SrcReg, 0, SubReg)
2636       .addReg(SrcReg, RegState::Implicit);
2637     MI.eraseFromParent();
2638     return true;
2639   }
2640 
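       // Use VGPR indexing mode: enable SRC0 indexing from IdxReg, read the
       // element with an indexed v_mov_b32, then disable indexing again.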
2641   BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_ON))
2642     .addReg(IdxReg)
2643     .addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE);
2644   BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), DstReg)
2645     .addReg(SrcReg, 0, SubReg)
2646     .addReg(SrcReg, RegState::Implicit)
2647     .addReg(AMDGPU::M0, RegState::Implicit);
2648   BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_OFF));
2649 
2650   MI.eraseFromParent();
2651   return true;
2652 }
2653 
2654 // TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
2655 bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
2656   MachineInstr &MI) const {
2657   Register DstReg = MI.getOperand(0).getReg();
2658   Register VecReg = MI.getOperand(1).getReg();
2659   Register ValReg = MI.getOperand(2).getReg();
2660   Register IdxReg = MI.getOperand(3).getReg();
2661 
2662   LLT VecTy = MRI->getType(DstReg);
2663   LLT ValTy = MRI->getType(ValReg);
2664   unsigned VecSize = VecTy.getSizeInBits();
2665   unsigned ValSize = ValTy.getSizeInBits();
2666 
2667   const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
2668   const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
2669   const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
2670 
2671   assert(VecTy.getElementType() == ValTy);
2672 
2673   // The index must be scalar. If it wasn't, RegBankSelect should have moved
2674   // this into a waterfall loop.
2675   if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
2676     return false;
2677 
2678   const TargetRegisterClass *VecRC = TRI.getRegClassForTypeOnBank(VecTy, *VecRB,
2679                                                                   *MRI);
2680   const TargetRegisterClass *ValRC = TRI.getRegClassForTypeOnBank(ValTy, *ValRB,
2681                                                                   *MRI);
2682 
2683   if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
2684       !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
2685       !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
2686       !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
2687     return false;
2688 
2689   if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
2690     return false;
2691 
2692   unsigned SubReg;
2693   std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg,
2694                                                      ValSize / 8);
2695 
2696   const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
2697                          STI.useVGPRIndexMode();
2698 
2699   MachineBasicBlock *BB = MI.getParent();
2700   const DebugLoc &DL = MI.getDebugLoc();
2701 
2702   if (IndexMode) {
2703     BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_ON))
2704       .addReg(IdxReg)
2705       .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE);
2706   } else {
2707     BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2708       .addReg(IdxReg);
2709   }
2710 
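       // The indirect write pseudo copies VecReg to DstReg with ValReg inserted
       // at the element selected by m0 (or the GPR index mode) relative to
       // SubReg.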
2711   const MCInstrDesc &RegWriteOp
2712     = TII.getIndirectRegWritePseudo(VecSize, ValSize,
2713                                     VecRB->getID() == AMDGPU::SGPRRegBankID);
2714   BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
2715     .addReg(VecReg)
2716     .addReg(ValReg)
2717     .addImm(SubReg);
2718 
2719   if (IndexMode)
2720     BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_OFF));
2721 
2722   MI.eraseFromParent();
2723   return true;
2724 }
2725 
2726 static bool isZeroOrUndef(int X) {
2727   return X == 0 || X == -1;
2728 }
2729 
2730 static bool isOneOrUndef(int X) {
2731   return X == 1 || X == -1;
2732 }
2733 
2734 static bool isZeroOrOneOrUndef(int X) {
2735   return X == 0 || X == 1 || X == -1;
2736 }
2737 
2738 // Normalize a VOP3P shuffle mask to refer to the low/high half of a single
2739 // 32-bit register.
2740 static Register normalizeVOP3PMask(int NewMask[2], Register Src0, Register Src1,
2741                                    ArrayRef<int> Mask) {
2742   NewMask[0] = Mask[0];
2743   NewMask[1] = Mask[1];
2744   if (isZeroOrOneOrUndef(Mask[0]) && isZeroOrOneOrUndef(Mask[1]))
2745     return Src0;
2746 
2747   assert(NewMask[0] == 2 || NewMask[0] == 3 || NewMask[0] == -1);
2748   assert(NewMask[1] == 2 || NewMask[1] == 3 || NewMask[1] == -1);
2749 
2750   // Shift the mask inputs to be 0/1.
2751   NewMask[0] = NewMask[0] == -1 ? -1 : NewMask[0] - 2;
2752   NewMask[1] = NewMask[1] == -1 ? -1 : NewMask[1] - 2;
2753   return Src1;
2754 }
2755 
2756 // This is only legal with VOP3P instructions as an aid to op_sel matching.
2757 bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR(
2758   MachineInstr &MI) const {
2759   Register DstReg = MI.getOperand(0).getReg();
2760   Register Src0Reg = MI.getOperand(1).getReg();
2761   Register Src1Reg = MI.getOperand(2).getReg();
2762   ArrayRef<int> ShufMask = MI.getOperand(3).getShuffleMask();
2763 
2764   const LLT V2S16 = LLT::vector(2, 16);
2765   if (MRI->getType(DstReg) != V2S16 || MRI->getType(Src0Reg) != V2S16)
2766     return false;
2767 
2768   if (!AMDGPU::isLegalVOP3PShuffleMask(ShufMask))
2769     return false;
2770 
2771   assert(ShufMask.size() == 2);
2772   assert(STI.hasSDWA() && "no target has VOP3P but not SDWA");
2773 
2774   MachineBasicBlock *MBB = MI.getParent();
2775   const DebugLoc &DL = MI.getDebugLoc();
2776 
2777   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2778   const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2779   const TargetRegisterClass &RC = IsVALU ?
2780     AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
2781 
2782   // Handle the degenerate case, which should have been folded out.
2783   if (ShufMask[0] == -1 && ShufMask[1] == -1) {
2784     BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), DstReg);
2785 
2786     MI.eraseFromParent();
2787     return RBI.constrainGenericRegister(DstReg, RC, *MRI);
2788   }
2789 
2790   // A legal VOP3P mask only reads one of the sources.
2791   int Mask[2];
2792   Register SrcVec = normalizeVOP3PMask(Mask, Src0Reg, Src1Reg, ShufMask);
2793 
2794   if (!RBI.constrainGenericRegister(DstReg, RC, *MRI) ||
2795       !RBI.constrainGenericRegister(SrcVec, RC, *MRI))
2796     return false;
2797 
2798   // TODO: This should also have been folded out.
2799   if (isZeroOrUndef(Mask[0]) && isOneOrUndef(Mask[1])) {
2800     BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), DstReg)
2801       .addReg(SrcVec);
2802 
2803     MI.eraseFromParent();
2804     return true;
2805   }
2806 
2807   if (Mask[0] == 1 && Mask[1] == -1) {
2808     if (IsVALU) {
2809       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
2810         .addImm(16)
2811         .addReg(SrcVec);
2812     } else {
2813       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
2814         .addReg(SrcVec)
2815         .addImm(16);
2816     }
2817   } else if (Mask[0] == -1 && Mask[1] == 0) {
2818     if (IsVALU) {
2819       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), DstReg)
2820         .addImm(16)
2821         .addReg(SrcVec);
2822     } else {
2823       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHL_B32), DstReg)
2824         .addReg(SrcVec)
2825         .addImm(16);
2826     }
2827   } else if (Mask[0] == 0 && Mask[1] == 0) {
2828     if (IsVALU) {
2829       // Write low half of the register into the high half.
2830       MachineInstr *MovSDWA =
2831         BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2832         .addImm(0)                             // $src0_modifiers
2833         .addReg(SrcVec)                        // $src0
2834         .addImm(0)                             // $clamp
2835         .addImm(AMDGPU::SDWA::WORD_1)          // $dst_sel
2836         .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2837         .addImm(AMDGPU::SDWA::WORD_0)          // $src0_sel
2838         .addReg(SrcVec, RegState::Implicit);
2839       MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2840     } else {
2841       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg)
2842         .addReg(SrcVec)
2843         .addReg(SrcVec);
2844     }
2845   } else if (Mask[0] == 1 && Mask[1] == 1) {
2846     if (IsVALU) {
2847       // Write high half of the register into the low half.
2848       MachineInstr *MovSDWA =
2849         BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2850         .addImm(0)                             // $src0_modifiers
2851         .addReg(SrcVec)                        // $src0
2852         .addImm(0)                             // $clamp
2853         .addImm(AMDGPU::SDWA::WORD_0)          // $dst_sel
2854         .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2855         .addImm(AMDGPU::SDWA::WORD_1)          // $src0_sel
2856         .addReg(SrcVec, RegState::Implicit);
2857       MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2858     } else {
2859       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_HH_B32_B16), DstReg)
2860         .addReg(SrcVec)
2861         .addReg(SrcVec);
2862     }
2863   } else if (Mask[0] == 1 && Mask[1] == 0) {
2864     if (IsVALU) {
2865       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_ALIGNBIT_B32), DstReg)
2866         .addReg(SrcVec)
2867         .addReg(SrcVec)
2868         .addImm(16);
2869     } else {
2870       Register TmpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2871       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg)
2872         .addReg(SrcVec)
2873         .addImm(16);
2874       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg)
2875         .addReg(TmpReg)
2876         .addReg(SrcVec);
2877     }
2878   } else
2879     llvm_unreachable("all shuffle masks should be handled");
2880 
2881   MI.eraseFromParent();
2882   return true;
2883 }
2884 
2885 bool AMDGPUInstructionSelector::select(MachineInstr &I) {
2886   if (I.isPHI())
2887     return selectPHI(I);
2888 
2889   if (!I.isPreISelOpcode()) {
2890     if (I.isCopy())
2891       return selectCOPY(I);
2892     return true;
2893   }
2894 
2895   switch (I.getOpcode()) {
2896   case TargetOpcode::G_AND:
2897   case TargetOpcode::G_OR:
2898   case TargetOpcode::G_XOR:
2899     if (selectImpl(I, *CoverageInfo))
2900       return true;
2901     return selectG_AND_OR_XOR(I);
2902   case TargetOpcode::G_ADD:
2903   case TargetOpcode::G_SUB:
2904     if (selectImpl(I, *CoverageInfo))
2905       return true;
2906     return selectG_ADD_SUB(I);
2907   case TargetOpcode::G_UADDO:
2908   case TargetOpcode::G_USUBO:
2909   case TargetOpcode::G_UADDE:
2910   case TargetOpcode::G_USUBE:
2911     return selectG_UADDO_USUBO_UADDE_USUBE(I);
2912   case TargetOpcode::G_INTTOPTR:
2913   case TargetOpcode::G_BITCAST:
2914   case TargetOpcode::G_PTRTOINT:
2915     return selectCOPY(I);
2916   case TargetOpcode::G_CONSTANT:
2917   case TargetOpcode::G_FCONSTANT:
2918     return selectG_CONSTANT(I);
2919   case TargetOpcode::G_FNEG:
2920     if (selectImpl(I, *CoverageInfo))
2921       return true;
2922     return selectG_FNEG(I);
2923   case TargetOpcode::G_FABS:
2924     if (selectImpl(I, *CoverageInfo))
2925       return true;
2926     return selectG_FABS(I);
2927   case TargetOpcode::G_EXTRACT:
2928     return selectG_EXTRACT(I);
2929   case TargetOpcode::G_MERGE_VALUES:
2930   case TargetOpcode::G_BUILD_VECTOR:
2931   case TargetOpcode::G_CONCAT_VECTORS:
2932     return selectG_MERGE_VALUES(I);
2933   case TargetOpcode::G_UNMERGE_VALUES:
2934     return selectG_UNMERGE_VALUES(I);
2935   case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2936     return selectG_BUILD_VECTOR_TRUNC(I);
2937   case TargetOpcode::G_PTR_ADD:
2938     return selectG_PTR_ADD(I);
2939   case TargetOpcode::G_IMPLICIT_DEF:
2940     return selectG_IMPLICIT_DEF(I);
2941   case TargetOpcode::G_FREEZE:
2942     return selectCOPY(I);
2943   case TargetOpcode::G_INSERT:
2944     return selectG_INSERT(I);
2945   case TargetOpcode::G_INTRINSIC:
2946     return selectG_INTRINSIC(I);
2947   case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
2948     return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
2949   case TargetOpcode::G_ICMP:
2950     if (selectG_ICMP(I))
2951       return true;
2952     return selectImpl(I, *CoverageInfo);
2953   case TargetOpcode::G_LOAD:
2954   case TargetOpcode::G_STORE:
2955   case TargetOpcode::G_ATOMIC_CMPXCHG:
2956   case TargetOpcode::G_ATOMICRMW_XCHG:
2957   case TargetOpcode::G_ATOMICRMW_ADD:
2958   case TargetOpcode::G_ATOMICRMW_SUB:
2959   case TargetOpcode::G_ATOMICRMW_AND:
2960   case TargetOpcode::G_ATOMICRMW_OR:
2961   case TargetOpcode::G_ATOMICRMW_XOR:
2962   case TargetOpcode::G_ATOMICRMW_MIN:
2963   case TargetOpcode::G_ATOMICRMW_MAX:
2964   case TargetOpcode::G_ATOMICRMW_UMIN:
2965   case TargetOpcode::G_ATOMICRMW_UMAX:
2966   case TargetOpcode::G_ATOMICRMW_FADD:
2967   case AMDGPU::G_AMDGPU_ATOMIC_INC:
2968   case AMDGPU::G_AMDGPU_ATOMIC_DEC:
2969   case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
2970   case AMDGPU::G_AMDGPU_ATOMIC_FMAX:
2971     return selectG_LOAD_STORE_ATOMICRMW(I);
2972   case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
2973     return selectG_AMDGPU_ATOMIC_CMPXCHG(I);
2974   case TargetOpcode::G_SELECT:
2975     return selectG_SELECT(I);
2976   case TargetOpcode::G_TRUNC:
2977     return selectG_TRUNC(I);
2978   case TargetOpcode::G_SEXT:
2979   case TargetOpcode::G_ZEXT:
2980   case TargetOpcode::G_ANYEXT:
2981   case TargetOpcode::G_SEXT_INREG:
2982     if (selectImpl(I, *CoverageInfo))
2983       return true;
2984     return selectG_SZA_EXT(I);
2985   case TargetOpcode::G_BRCOND:
2986     return selectG_BRCOND(I);
2987   case TargetOpcode::G_GLOBAL_VALUE:
2988     return selectG_GLOBAL_VALUE(I);
2989   case TargetOpcode::G_PTRMASK:
2990     return selectG_PTRMASK(I);
2991   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2992     return selectG_EXTRACT_VECTOR_ELT(I);
2993   case TargetOpcode::G_INSERT_VECTOR_ELT:
2994     return selectG_INSERT_VECTOR_ELT(I);
2995   case TargetOpcode::G_SHUFFLE_VECTOR:
2996     return selectG_SHUFFLE_VECTOR(I);
2997   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
2998   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
2999     const AMDGPU::ImageDimIntrinsicInfo *Intr
3000       = AMDGPU::getImageDimIntrinsicInfo(I.getIntrinsicID());
3001     assert(Intr && "not an image intrinsic with image pseudo");
3002     return selectImageIntrinsic(I, Intr);
3003   }
3004   default:
3005     return selectImpl(I, *CoverageInfo);
3006   }
3007   return false;
3008 }
3009 
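/// Pass the root operand through unchanged; no special handling is needed to
/// select a VCSRC source here.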
3010 InstructionSelector::ComplexRendererFns
3011 AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
3012   return {{
3013       [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
3014   }};
3015 
3016 }
3017 
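/// Fold VOP3 source modifiers: look through G_FNEG and G_FABS feeding \p Root
/// and return the underlying source register together with the accumulated
/// SISrcMods mask.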
3018 std::pair<Register, unsigned>
3019 AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root) const {
3020   Register Src = Root.getReg();
3021   Register OrigSrc = Src;
3022   unsigned Mods = 0;
3023   MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
3024 
3025   if (MI && MI->getOpcode() == AMDGPU::G_FNEG) {
3026     Src = MI->getOperand(1).getReg();
3027     Mods |= SISrcMods::NEG;
3028     MI = getDefIgnoringCopies(Src, *MRI);
3029   }
3030 
3031   if (MI && MI->getOpcode() == AMDGPU::G_FABS) {
3032     Src = MI->getOperand(1).getReg();
3033     Mods |= SISrcMods::ABS;
3034   }
3035 
3036   if (Mods != 0 &&
3037       RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
3038     MachineInstr *UseMI = Root.getParent();
3039 
3040     // If we looked through copies to find source modifiers on an SGPR operand,
3041     // we now have an SGPR register source. To avoid potentially violating the
3042     // constant bus restriction, we need to insert a copy to a VGPR.
3043     Register VGPRSrc = MRI->cloneVirtualRegister(OrigSrc);
3044     BuildMI(*UseMI->getParent(), UseMI, UseMI->getDebugLoc(),
3045             TII.get(AMDGPU::COPY), VGPRSrc)
3046       .addReg(Src);
3047     Src = VGPRSrc;
3048   }
3049 
3050   return std::make_pair(Src, Mods);
3051 }
3052 
3053 /// This will select either an SGPR or VGPR operand and will save us from
3054 /// having to write an extra TableGen pattern.
3056 InstructionSelector::ComplexRendererFns
3057 AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
3058   return {{
3059       [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
3060   }};
3061 }
3062 
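/// Select src0 with folded source modifiers plus default clamp and omod
/// operands.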
3063 InstructionSelector::ComplexRendererFns
3064 AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
3065   Register Src;
3066   unsigned Mods;
3067   std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3068 
3069   return {{
3070       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3071       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
3072       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },    // clamp
3073       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }     // omod
3074   }};
3075 }
3076 
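/// Pass the source through unchanged and add default clamp and omod operands.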
3077 InstructionSelector::ComplexRendererFns
3078 AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
3079   return {{
3080       [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
3081       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3082       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // omod
3083   }};
3084 }
3085 
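/// Select a VOP3 source with fneg/fabs folded into the src_mods operand.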
3086 InstructionSelector::ComplexRendererFns
3087 AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
3088   Register Src;
3089   unsigned Mods;
3090   std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3091 
3092   return {{
3093       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3094       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }  // src_mods
3095   }};
3096 }
3097 
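/// Reject sources produced by fneg/fabs so that a modifier-aware pattern can
/// match them instead.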
3098 InstructionSelector::ComplexRendererFns
3099 AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
3100   Register Reg = Root.getReg();
3101   const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3102   if (Def && (Def->getOpcode() == AMDGPU::G_FNEG ||
3103               Def->getOpcode() == AMDGPU::G_FABS))
3104     return {};
3105   return {{
3106       [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3107   }};
3108 }
3109 
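/// Fold a v2f16 fneg into the packed neg/neg_hi source modifiers. Packed
/// instructions have no abs modifier, so only OP_SEL_1 is added by default.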
3110 std::pair<Register, unsigned>
3111 AMDGPUInstructionSelector::selectVOP3PModsImpl(
3112   Register Src, const MachineRegisterInfo &MRI) const {
3113   unsigned Mods = 0;
3114   MachineInstr *MI = MRI.getVRegDef(Src);
3115 
3116   if (MI && MI->getOpcode() == AMDGPU::G_FNEG &&
3117       // It's possible to see an f32 fneg here, but unlikely.
3118       // TODO: Treat f32 fneg as only high bit.
3119       MRI.getType(Src) == LLT::vector(2, 16)) {
3120     Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
3121     Src = MI->getOperand(1).getReg();
3122     MI = MRI.getVRegDef(Src);
3123   }
3124 
3125   // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
3126 
3127   // Packed instructions do not have abs modifiers.
3128   Mods |= SISrcMods::OP_SEL_1;
3129 
3130   return std::make_pair(Src, Mods);
3131 }
3132 
3133 InstructionSelector::ComplexRendererFns
3134 AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
3135   MachineRegisterInfo &MRI
3136     = Root.getParent()->getParent()->getParent()->getRegInfo();
3137 
3138   Register Src;
3139   unsigned Mods;
3140   std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI);
3141 
3142   return {{
3143       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3144       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }  // src_mods
3145   }};
3146 }
3147 
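/// Like selectVOP3Mods, but only succeed when the source is known not to be a
/// NaN (or when no-NaNs FP math is enabled).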
3148 InstructionSelector::ComplexRendererFns
3149 AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const {
3150   Register Src;
3151   unsigned Mods;
3152   std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3153   if (!TM.Options.NoNaNsFPMath && !isKnownNeverNaN(Src, *MRI))
3154     return None;
3155 
3156   return {{
3157       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3158       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }  // src_mods
3159   }};
3160 }
3161 
3162 InstructionSelector::ComplexRendererFns
3163 AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
3164   // FIXME: Handle op_sel
3165   return {{
3166       [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
3167       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // src_mods
3168   }};
3169 }
3170 
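/// Match an SMRD base pointer plus a constant offset that fits the encoded
/// immediate offset field.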
3171 InstructionSelector::ComplexRendererFns
3172 AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
3173   SmallVector<GEPInfo, 4> AddrInfo;
3174   getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
3175 
3176   if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3177     return None;
3178 
3179   const GEPInfo &GEPInfo = AddrInfo[0];
3180   Optional<int64_t> EncodedImm =
3181       AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm, false);
3182   if (!EncodedImm)
3183     return None;
3184 
3185   Register PtrReg = GEPInfo.SgprParts[0];
3186   return {{
3187     [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3188     [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
3189   }};
3190 }
3191 
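/// Match an SMRD base pointer plus a constant offset encoded as a 32-bit
/// literal.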
3192 InstructionSelector::ComplexRendererFns
3193 AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
3194   SmallVector<GEPInfo, 4> AddrInfo;
3195   getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
3196 
3197   if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3198     return None;
3199 
3200   const GEPInfo &GEPInfo = AddrInfo[0];
3201   Register PtrReg = GEPInfo.SgprParts[0];
3202   Optional<int64_t> EncodedImm =
3203       AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
3204   if (!EncodedImm)
3205     return None;
3206 
3207   return {{
3208     [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3209     [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
3210   }};
3211 }
3212 
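/// Match an SMRD base pointer plus an offset that does not fit the immediate
/// field, materializing the offset into an SGPR instead.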
3213 InstructionSelector::ComplexRendererFns
3214 AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
3215   MachineInstr *MI = Root.getParent();
3216   MachineBasicBlock *MBB = MI->getParent();
3217 
3218   SmallVector<GEPInfo, 4> AddrInfo;
3219   getAddrModeInfo(*MI, *MRI, AddrInfo);
3220 
3221   // FIXME: We should shrink the GEP if the offset is known to be <= 32 bits;
3222   // then we can select all ptr + 32-bit offsets, not just immediate offsets.
3223   if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3224     return None;
3225 
3226   const GEPInfo &GEPInfo = AddrInfo[0];
3227   // SGPR offset is unsigned.
3228   if (!GEPInfo.Imm || GEPInfo.Imm < 0 || !isUInt<32>(GEPInfo.Imm))
3229     return None;
3230 
3231   // If we make it this far we have a load with a 32-bit immediate offset.
3232   // It is OK to select this using an SGPR offset, because we have already
3233   // failed trying to select this load into one of the _IMM variants since
3234   // the _IMM patterns are considered before the _SGPR patterns.
3235   Register PtrReg = GEPInfo.SgprParts[0];
3236   Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3237   BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg)
3238           .addImm(GEPInfo.Imm);
3239   return {{
3240     [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3241     [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); }
3242   }};
3243 }
3244 
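/// Try to fold a constant G_PTR_ADD offset into the FLAT instruction's
/// immediate offset field when it is legal; otherwise use the address as-is
/// with an offset of 0.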
3245 template <bool Signed>
3246 InstructionSelector::ComplexRendererFns
3247 AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const {
3248   MachineInstr *MI = Root.getParent();
3249 
3250   InstructionSelector::ComplexRendererFns Default = {{
3251       [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
3252       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },  // offset
3253     }};
3254 
3255   if (!STI.hasFlatInstOffsets())
3256     return Default;
3257 
3258   const MachineInstr *OpDef = MRI->getVRegDef(Root.getReg());
3259   if (!OpDef || OpDef->getOpcode() != AMDGPU::G_PTR_ADD)
3260     return Default;
3261 
3262   Optional<int64_t> Offset =
3263     getConstantVRegVal(OpDef->getOperand(2).getReg(), *MRI);
3264   if (!Offset.hasValue())
3265     return Default;
3266 
3267   unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
3268   if (!TII.isLegalFLATOffset(Offset.getValue(), AddrSpace, Signed))
3269     return Default;
3270 
3271   Register BasePtr = OpDef->getOperand(1).getReg();
3272 
3273   return {{
3274       [=](MachineInstrBuilder &MIB) { MIB.addReg(BasePtr); },
3275       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset.getValue()); },
3276     }};
3277 }
3278 
3279 InstructionSelector::ComplexRendererFns
3280 AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
3281   return selectFlatOffsetImpl<false>(Root);
3282 }
3283 
3284 InstructionSelector::ComplexRendererFns
3285 AMDGPUInstructionSelector::selectFlatOffsetSigned(MachineOperand &Root) const {
3286   return selectFlatOffsetImpl<true>(Root);
3287 }
3288 
3289 /// Match a zero extend from a 32-bit value to 64-bits.
3290 static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
3291   Register ZExtSrc;
3292   if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc))))
3293     return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
3294 
3295   // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
3296   const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
3297   if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3298     return Register();
3299 
3300   int64_t MergeRHS;
3301   if (mi_match(Def->getOperand(2).getReg(), MRI, m_ICst(MergeRHS)) &&
3302       MergeRHS == 0) {
3303     return Def->getOperand(1).getReg();
3304   }
3305 
3306   return Register();
3307 }
3308 
3309 // Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
3310 InstructionSelector::ComplexRendererFns
3311 AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
3312   Register PtrBase;
3313   int64_t ImmOffset;
3314 
3315   // Match the immediate offset first, which canonically is moved as low as
3316   // possible.
3317   std::tie(PtrBase, ImmOffset) = getPtrBaseWithConstantOffset(Root.getReg(),
3318                                                               *MRI);
3319 
3320   // TODO: Could split larger constant into VGPR offset.
3321   if (ImmOffset != 0 &&
3322       !TII.isLegalFLATOffset(ImmOffset, AMDGPUAS::GLOBAL_ADDRESS, true)) {
3323     PtrBase = Root.getReg();
3324     ImmOffset = 0;
3325   }
3326 
3327   // Match the variable offset.
3328   const MachineInstr *PtrBaseDef = getDefIgnoringCopies(PtrBase, *MRI);
3329   if (PtrBaseDef->getOpcode() != AMDGPU::G_PTR_ADD)
3330     return None;
3331 
3332   // Look through the SGPR->VGPR copy.
3333   Register PtrBaseSrc =
3334     getSrcRegIgnoringCopies(PtrBaseDef->getOperand(1).getReg(), *MRI);
3335   if (!PtrBaseSrc)
3336     return None;
3337 
3338   const RegisterBank *BaseRB = RBI.getRegBank(PtrBaseSrc, *MRI, TRI);
3339   if (BaseRB->getID() != AMDGPU::SGPRRegBankID)
3340     return None;
3341 
3342   Register SAddr = PtrBaseSrc;
3343   Register PtrBaseOffset = PtrBaseDef->getOperand(2).getReg();
3344 
3345   // It's possible voffset is an SGPR here, but the copy to VGPR will be
3346   // inserted later.
3347   Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset);
3348   if (!VOffset)
3349     return None;
3350 
3351   return {{[=](MachineInstrBuilder &MIB) { // saddr
3352              MIB.addReg(SAddr);
3353            },
3354            [=](MachineInstrBuilder &MIB) { // voffset
3355              MIB.addReg(VOffset);
3356            },
3357            [=](MachineInstrBuilder &MIB) { // offset
3358              MIB.addImm(ImmOffset);
3359            }}};
3360 }
3361 
3362 static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) {
3363   auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>();
3364   return PSV && PSV->isStack();
3365 }
3366 
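/// Select the rsrc, vaddr, soffset and offset operands for a private (scratch)
/// access using the MUBUF offen addressing mode, folding frame indexes and
/// constant offsets where possible.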
3367 InstructionSelector::ComplexRendererFns
3368 AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
3369   MachineInstr *MI = Root.getParent();
3370   MachineBasicBlock *MBB = MI->getParent();
3371   MachineFunction *MF = MBB->getParent();
3372   const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3373 
3374   int64_t Offset = 0;
3375   if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
3376       Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) {
3377     Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3378 
3379     // TODO: Should this be inside the render function? The iterator seems to
3380     // move.
3381     BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
3382             HighBits)
3383       .addImm(Offset & ~4095);
3384 
3385     return {{[=](MachineInstrBuilder &MIB) { // rsrc
3386                MIB.addReg(Info->getScratchRSrcReg());
3387              },
3388              [=](MachineInstrBuilder &MIB) { // vaddr
3389                MIB.addReg(HighBits);
3390              },
3391              [=](MachineInstrBuilder &MIB) { // soffset
3392                const MachineMemOperand *MMO = *MI->memoperands_begin();
3393                const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();
3394 
3395                if (isStackPtrRelative(PtrInfo))
3396                  MIB.addReg(Info->getStackPtrOffsetReg());
3397                else
3398                  MIB.addImm(0);
3399              },
3400              [=](MachineInstrBuilder &MIB) { // offset
3401                MIB.addImm(Offset & 4095);
3402              }}};
3403   }
3404 
3405   assert(Offset == 0 || Offset == -1);
3406 
3407   // Try to fold a frame index directly into the MUBUF vaddr field, and any
3408   // offsets.
3409   Optional<int> FI;
3410   Register VAddr = Root.getReg();
3411   if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) {
3412     if (isBaseWithConstantOffset(Root, *MRI)) {
3413       const MachineOperand &LHS = RootDef->getOperand(1);
3414       const MachineOperand &RHS = RootDef->getOperand(2);
3415       const MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg());
3416       const MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg());
3417       if (LHSDef && RHSDef) {
3418         int64_t PossibleOffset =
3419             RHSDef->getOperand(1).getCImm()->getSExtValue();
3420         if (SIInstrInfo::isLegalMUBUFImmOffset(PossibleOffset) &&
3421             (!STI.privateMemoryResourceIsRangeChecked() ||
3422              KnownBits->signBitIsZero(LHS.getReg()))) {
3423           if (LHSDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
3424             FI = LHSDef->getOperand(1).getIndex();
3425           else
3426             VAddr = LHS.getReg();
3427           Offset = PossibleOffset;
3428         }
3429       }
3430     } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
3431       FI = RootDef->getOperand(1).getIndex();
3432     }
3433   }
3434 
3435   return {{[=](MachineInstrBuilder &MIB) { // rsrc
3436              MIB.addReg(Info->getScratchRSrcReg());
3437            },
3438            [=](MachineInstrBuilder &MIB) { // vaddr
3439              if (FI.hasValue())
3440                MIB.addFrameIndex(FI.getValue());
3441              else
3442                MIB.addReg(VAddr);
3443            },
3444            [=](MachineInstrBuilder &MIB) { // soffset
3445              // If we don't know this private access is a local stack object, it
3446              // needs to be relative to the entry point's scratch wave offset.
3447              // TODO: Should split large offsets that don't fit like above.
3448              // TODO: Don't use scratch wave offset just because the offset
3449              // didn't fit.
3450              if (!Info->isEntryFunction() && FI.hasValue())
3451                MIB.addReg(Info->getStackPtrOffsetReg());
3452              else
3453                MIB.addImm(0);
3454            },
3455            [=](MachineInstrBuilder &MIB) { // offset
3456              MIB.addImm(Offset);
3457            }}};
3458 }
3459 
3460 bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
3461                                                 int64_t Offset,
3462                                                 unsigned OffsetBits) const {
3463   if ((OffsetBits == 16 && !isUInt<16>(Offset)) ||
3464       (OffsetBits == 8 && !isUInt<8>(Offset)))
3465     return false;
3466 
3467   if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
3468     return true;
3469 
3470   // On Southern Islands, instructions with a negative base value and an offset
3471   // don't seem to work.
3472   return KnownBits->signBitIsZero(Base);
3473 }
3474 
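/// Select a private (scratch) access whose entire address is a legal MUBUF
/// immediate offset, so no vaddr is needed.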
3475 InstructionSelector::ComplexRendererFns
3476 AMDGPUInstructionSelector::selectMUBUFScratchOffset(
3477     MachineOperand &Root) const {
3478   MachineInstr *MI = Root.getParent();
3479   MachineBasicBlock *MBB = MI->getParent();
3480 
3481   int64_t Offset = 0;
3482   if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
3483       !SIInstrInfo::isLegalMUBUFImmOffset(Offset))
3484     return {};
3485 
3486   const MachineFunction *MF = MBB->getParent();
3487   const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3488   const MachineMemOperand *MMO = *MI->memoperands_begin();
3489   const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();
3490 
3491   return {{
3492       [=](MachineInstrBuilder &MIB) { // rsrc
3493         MIB.addReg(Info->getScratchRSrcReg());
3494       },
3495       [=](MachineInstrBuilder &MIB) { // soffset
3496         if (isStackPtrRelative(PtrInfo))
3497           MIB.addReg(Info->getStackPtrOffsetReg());
3498         else
3499           MIB.addImm(0);
3500       },
3501       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
3502   }};
3503 }
3504 
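/// Fold a constant address offset into the DS instruction's 16-bit offset
/// field when it is legal to do so.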
3505 std::pair<Register, unsigned>
3506 AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
3507   const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
3508   if (!RootDef)
3509     return std::make_pair(Root.getReg(), 0);
3510 
3511   int64_t ConstAddr = 0;
3512 
3513   Register PtrBase;
3514   int64_t Offset;
3515   std::tie(PtrBase, Offset) =
3516     getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
3517 
3518   if (Offset) {
3519     if (isDSOffsetLegal(PtrBase, Offset, 16)) {
3520       // (add n0, c0)
3521       return std::make_pair(PtrBase, Offset);
3522     }
3523   } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
3524     // TODO
3525 
3526 
3527   } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
3528     // TODO
3529 
3530   }
3531 
3532   return std::make_pair(Root.getReg(), 0);
3533 }
3534 
3535 InstructionSelector::ComplexRendererFns
3536 AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
3537   Register Reg;
3538   unsigned Offset;
3539   std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
3540   return {{
3541       [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3542       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
3543     }};
3544 }
3545 
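/// Select the base register and the two dword offsets (offset0/offset1) used
/// by the 64-bit, 4-byte-aligned DS forms such as ds_read2_b32.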
3546 InstructionSelector::ComplexRendererFns
3547 AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
3548   Register Reg;
3549   unsigned Offset;
3550   std::tie(Reg, Offset) = selectDS64Bit4ByteAlignedImpl(Root);
3551   return {{
3552       [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3553       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
3554       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
3555     }};
3556 }
3557 
3558 std::pair<Register, unsigned>
3559 AMDGPUInstructionSelector::selectDS64Bit4ByteAlignedImpl(MachineOperand &Root) const {
3560   const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
3561   if (!RootDef)
3562     return std::make_pair(Root.getReg(), 0);
3563 
3564   int64_t ConstAddr = 0;
3565 
3566   Register PtrBase;
3567   int64_t Offset;
3568   std::tie(PtrBase, Offset) =
3569     getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
3570 
3571   if (Offset) {
3572     int64_t DWordOffset0 = Offset / 4;
3573     int64_t DWordOffset1 = DWordOffset0 + 1;
3574     if (isDSOffsetLegal(PtrBase, DWordOffset1, 8)) {
3575       // (add n0, c0)
3576       return std::make_pair(PtrBase, DWordOffset0);
3577     }
3578   } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
3579     // TODO
3580 
3581   } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
3582     // TODO
3583 
3584   }
3585 
3586   return std::make_pair(Root.getReg(), 0);
3587 }
3588 
3589 /// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
3590 /// the base value with the constant offset. There may be intervening copies
3591 /// between \p Root and the identified constant. Returns \p Root, 0 if this does
3592 /// not match the pattern.
3593 std::pair<Register, int64_t>
3594 AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
3595   Register Root, const MachineRegisterInfo &MRI) const {
3596   MachineInstr *RootI = getDefIgnoringCopies(Root, MRI);
3597   if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
3598     return {Root, 0};
3599 
3600   MachineOperand &RHS = RootI->getOperand(2);
3601   Optional<ValueAndVReg> MaybeOffset
3602     = getConstantVRegValWithLookThrough(RHS.getReg(), MRI, true);
3603   if (!MaybeOffset)
3604     return {Root, 0};
3605   return {RootI->getOperand(1).getReg(), MaybeOffset->Value};
3606 }
3607 
3608 static void addZeroImm(MachineInstrBuilder &MIB) {
3609   MIB.addImm(0);
3610 }
3611 
3612 /// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
3613 /// BasePtr is not valid, a null base pointer will be used.
3614 static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
3615                           uint32_t FormatLo, uint32_t FormatHi,
3616                           Register BasePtr) {
3617   Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
3618   Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
3619   Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3620   Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
3621 
3622   B.buildInstr(AMDGPU::S_MOV_B32)
3623     .addDef(RSrc2)
3624     .addImm(FormatLo);
3625   B.buildInstr(AMDGPU::S_MOV_B32)
3626     .addDef(RSrc3)
3627     .addImm(FormatHi);
3628 
3629   // Build the half of the subregister with the constants before building the
3630   // full 128-bit register. If we are building multiple resource descriptors,
3631   // this will allow CSEing of the 2-component register.
3632   B.buildInstr(AMDGPU::REG_SEQUENCE)
3633     .addDef(RSrcHi)
3634     .addReg(RSrc2)
3635     .addImm(AMDGPU::sub0)
3636     .addReg(RSrc3)
3637     .addImm(AMDGPU::sub1);
3638 
3639   Register RSrcLo = BasePtr;
3640   if (!BasePtr) {
3641     RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3642     B.buildInstr(AMDGPU::S_MOV_B64)
3643       .addDef(RSrcLo)
3644       .addImm(0);
3645   }
3646 
3647   B.buildInstr(AMDGPU::REG_SEQUENCE)
3648     .addDef(RSrc)
3649     .addReg(RSrcLo)
3650     .addImm(AMDGPU::sub0_sub1)
3651     .addReg(RSrcHi)
3652     .addImm(AMDGPU::sub2_sub3);
3653 
3654   return RSrc;
3655 }
3656 
3657 static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
3658                                 const SIInstrInfo &TII, Register BasePtr) {
3659   uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
3660 
3661   // FIXME: Why are half the "default" bits ignored based on the addressing
3662   // mode?
3663   return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
3664 }
3665 
3666 static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
3667                                const SIInstrInfo &TII, Register BasePtr) {
3668   uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
3669 
3670   // FIXME: Why are half the "default" bits ignored based on the addressing
3671   // mode?
3672   return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
3673 }
3674 
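/// Decompose a MUBUF address into its base (N0), optional ptr_add components
/// (N2, N3), and any constant offset that fits in 32 bits.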
3675 AMDGPUInstructionSelector::MUBUFAddressData
3676 AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
3677   MUBUFAddressData Data;
3678   Data.N0 = Src;
3679 
3680   Register PtrBase;
3681   int64_t Offset;
3682 
3683   std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI);
3684   if (isUInt<32>(Offset)) {
3685     Data.N0 = PtrBase;
3686     Data.Offset = Offset;
3687   }
3688 
3689   if (MachineInstr *InputAdd
3690       = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
3691     Data.N2 = InputAdd->getOperand(1).getReg();
3692     Data.N3 = InputAdd->getOperand(2).getReg();
3693 
3694     // FIXME: Need to fix extra SGPR->VGPR copies inserted
3695     // FIXME: Don't know that this was defined by operand 0
3696     //
3697     // TODO: Remove this when we have copy folding optimizations after
3698     // RegBankSelect.
3699     Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
3700     Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
3701   }
3702 
3703   return Data;
3704 }
3705 
3706 /// Return true if the addr64 MUBUF mode should be used for the given address.
3707 bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
3708   // (ptr_add N2, N3) -> addr64, or
3709   // (ptr_add (ptr_add N2, N3), C1) -> addr64
3710   if (Addr.N2)
3711     return true;
3712 
3713   const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
3714   return N0Bank->getID() == AMDGPU::VGPRRegBankID;
3715 }
3716 
3717 /// Split an immediate offset \p ImmOffset depending on whether it fits in the
3718 /// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
3719 /// component.
3720 void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
3721   MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
3722   if (SIInstrInfo::isLegalMUBUFImmOffset(ImmOffset))
3723     return;
3724 
3725   // Illegal offset, store it in soffset.
3726   SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3727   B.buildInstr(AMDGPU::S_MOV_B32)
3728     .addDef(SOffset)
3729     .addImm(ImmOffset);
3730   ImmOffset = 0;
3731 }
3732 
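/// Select the resource descriptor, 64-bit vaddr, soffset and immediate offset
/// for the MUBUF addr64 addressing mode.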
3733 bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
3734   MachineOperand &Root, Register &VAddr, Register &RSrcReg,
3735   Register &SOffset, int64_t &Offset) const {
3736   // FIXME: Predicates should stop this from reaching here.
3737   // The addr64 bit was removed for Volcanic Islands.
3738   if (!STI.hasAddr64() || STI.useFlatForGlobal())
3739     return false;
3740 
3741   MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
3742   if (!shouldUseAddr64(AddrData))
3743     return false;
3744 
3745   Register N0 = AddrData.N0;
3746   Register N2 = AddrData.N2;
3747   Register N3 = AddrData.N3;
3748   Offset = AddrData.Offset;
3749 
3750   // Base pointer for the SRD.
3751   Register SRDPtr;
3752 
3753   if (N2) {
3754     if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
3755       assert(N3);
3756       if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
3757         // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
3758         // addr64, and construct the default resource from a 0 address.
3759         VAddr = N0;
3760       } else {
3761         SRDPtr = N3;
3762         VAddr = N2;
3763       }
3764     } else {
3765       // N2 is not divergent.
3766       SRDPtr = N2;
3767       VAddr = N3;
3768     }
3769   } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
3770     // Use the default null pointer in the resource
3771     VAddr = N0;
3772   } else {
3773     // N0 -> offset, or
3774     // (N0 + C1) -> offset
3775     SRDPtr = N0;
3776   }
3777 
3778   MachineIRBuilder B(*Root.getParent());
3779   RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
3780   splitIllegalMUBUFOffset(B, SOffset, Offset);
3781   return true;
3782 }
3783 
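/// Select the offset-only MUBUF form, where the base pointer is placed in the
/// resource descriptor rather than in vaddr.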
3784 bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
3785   MachineOperand &Root, Register &RSrcReg, Register &SOffset,
3786   int64_t &Offset) const {
3787 
3788   // FIXME: Pattern should not reach here.
3789   if (STI.useFlatForGlobal())
3790     return false;
3791 
3792   MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
3793   if (shouldUseAddr64(AddrData))
3794     return false;
3795 
3796   // N0 -> offset, or
3797   // (N0 + C1) -> offset
3798   Register SRDPtr = AddrData.N0;
3799   Offset = AddrData.Offset;
3800 
3801   // TODO: Look through extensions for 32-bit soffset.
3802   MachineIRBuilder B(*Root.getParent());
3803 
3804   RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
3805   splitIllegalMUBUFOffset(B, SOffset, Offset);
3806   return true;
3807 }
3808 
3809 InstructionSelector::ComplexRendererFns
3810 AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
3811   Register VAddr;
3812   Register RSrcReg;
3813   Register SOffset;
3814   int64_t Offset = 0;
3815 
3816   if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
3817     return {};
3818 
3819   // FIXME: Use defaulted operands for trailing 0s and remove from the complex
3820   // pattern.
3821   return {{
3822       [=](MachineInstrBuilder &MIB) {  // rsrc
3823         MIB.addReg(RSrcReg);
3824       },
3825       [=](MachineInstrBuilder &MIB) { // vaddr
3826         MIB.addReg(VAddr);
3827       },
3828       [=](MachineInstrBuilder &MIB) { // soffset
3829         if (SOffset)
3830           MIB.addReg(SOffset);
3831         else
3832           MIB.addImm(0);
3833       },
3834       [=](MachineInstrBuilder &MIB) { // offset
3835         MIB.addImm(Offset);
3836       },
3837       addZeroImm, //  glc
3838       addZeroImm, //  slc
3839       addZeroImm, //  tfe
3840       addZeroImm, //  dlc
3841       addZeroImm  //  swz
3842     }};
3843 }
3844 
3845 InstructionSelector::ComplexRendererFns
3846 AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
3847   Register RSrcReg;
3848   Register SOffset;
3849   int64_t Offset = 0;
3850 
3851   if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
3852     return {};
3853 
3854   return {{
3855       [=](MachineInstrBuilder &MIB) {  // rsrc
3856         MIB.addReg(RSrcReg);
3857       },
3858       [=](MachineInstrBuilder &MIB) { // soffset
3859         if (SOffset)
3860           MIB.addReg(SOffset);
3861         else
3862           MIB.addImm(0);
3863       },
3864       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
3865       addZeroImm, //  glc
3866       addZeroImm, //  slc
3867       addZeroImm, //  tfe
3868       addZeroImm, //  dlc
3869       addZeroImm  //  swz
3870     }};
3871 }
3872 
3873 InstructionSelector::ComplexRendererFns
3874 AMDGPUInstructionSelector::selectMUBUFAddr64Atomic(MachineOperand &Root) const {
3875   Register VAddr;
3876   Register RSrcReg;
3877   Register SOffset;
3878   int64_t Offset = 0;
3879 
3880   if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
3881     return {};
3882 
3883   // FIXME: Use defaulted operands for trailing 0s and remove from the complex
3884   // pattern.
3885   return {{
3886       [=](MachineInstrBuilder &MIB) {  // rsrc
3887         MIB.addReg(RSrcReg);
3888       },
3889       [=](MachineInstrBuilder &MIB) { // vaddr
3890         MIB.addReg(VAddr);
3891       },
3892       [=](MachineInstrBuilder &MIB) { // soffset
3893         if (SOffset)
3894           MIB.addReg(SOffset);
3895         else
3896           MIB.addImm(0);
3897       },
3898       [=](MachineInstrBuilder &MIB) { // offset
3899         MIB.addImm(Offset);
3900       },
3901       addZeroImm //  slc
3902     }};
3903 }
3904 
3905 InstructionSelector::ComplexRendererFns
3906 AMDGPUInstructionSelector::selectMUBUFOffsetAtomic(MachineOperand &Root) const {
3907   Register RSrcReg;
3908   Register SOffset;
3909   int64_t Offset = 0;
3910 
3911   if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
3912     return {};
3913 
3914   return {{
3915       [=](MachineInstrBuilder &MIB) {  // rsrc
3916         MIB.addReg(RSrcReg);
3917       },
3918       [=](MachineInstrBuilder &MIB) { // soffset
3919         if (SOffset)
3920           MIB.addReg(SOffset);
3921         else
3922           MIB.addImm(0);
3923       },
3924       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
3925       addZeroImm //  slc
3926     }};
3927 }
3928 
3929 /// Get an immediate that must be 32 bits, and treated as zero extended.
3930 static Optional<uint64_t> getConstantZext32Val(Register Reg,
3931                                                const MachineRegisterInfo &MRI) {
3932   // getConstantVRegVal sexts any values, so see if that matters.
3933   Optional<int64_t> OffsetVal = getConstantVRegVal(Reg, MRI);
3934   if (!OffsetVal || !isInt<32>(*OffsetVal))
3935     return None;
3936   return Lo_32(*OffsetVal);
3937 }
3938 
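/// Match a constant buffer offset that fits the SMEM encoded immediate field
/// (used for s_buffer_load offsets).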
3939 InstructionSelector::ComplexRendererFns
3940 AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
3941   Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
3942   if (!OffsetVal)
3943     return {};
3944 
3945   Optional<int64_t> EncodedImm =
3946       AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
3947   if (!EncodedImm)
3948     return {};
3949 
3950   return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }  }};
3951 }
3952 
3953 InstructionSelector::ComplexRendererFns
3954 AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
3955   assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
3956 
3957   Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
3958   if (!OffsetVal)
3959     return {};
3960 
3961   Optional<int64_t> EncodedImm
3962     = AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
3963   if (!EncodedImm)
3964     return {};
3965 
3966   return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }  }};
3967 }
3968 
3969 void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
3970                                                  const MachineInstr &MI,
3971                                                  int OpIdx) const {
3972   assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
3973          "Expected G_CONSTANT");
3974   MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
3975 }
3976 
3977 void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
3978                                                 const MachineInstr &MI,
3979                                                 int OpIdx) const {
3980   assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
3981          "Expected G_CONSTANT");
3982   MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
3983 }
3984 
3985 void AMDGPUInstructionSelector::renderBitcastImm(MachineInstrBuilder &MIB,
3986                                                  const MachineInstr &MI,
3987                                                  int OpIdx) const {
3988   assert(OpIdx == -1);
3989 
3990   const MachineOperand &Op = MI.getOperand(1);
3991   if (MI.getOpcode() == TargetOpcode::G_FCONSTANT)
3992     MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
3993   else {
3994     assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
3995     MIB.addImm(Op.getCImm()->getSExtValue());
3996   }
3997 }
3998 
3999 void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
4000                                                 const MachineInstr &MI,
4001                                                 int OpIdx) const {
4002   assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
4003          "Expected G_CONSTANT");
4004   MIB.addImm(MI.getOperand(1).getCImm()->getValue().countPopulation());
4005 }
4006 
4007 /// This only really exists to satisfy DAG type checking machinery, so is a
4008 /// no-op here.
4009 void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
4010                                                 const MachineInstr &MI,
4011                                                 int OpIdx) const {
4012   MIB.addImm(MI.getOperand(OpIdx).getImm());
4013 }
4014 
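// The renderers below extract the individual cache policy bits that the
// intrinsic lowering packs into a single immediate operand: glc is bit 0,
// slc bit 1, dlc bit 2 and swz bit 3.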
4015 void AMDGPUInstructionSelector::renderExtractGLC(MachineInstrBuilder &MIB,
4016                                                  const MachineInstr &MI,
4017                                                  int OpIdx) const {
4018   assert(OpIdx >= 0 && "expected to match an immediate operand");
4019   MIB.addImm(MI.getOperand(OpIdx).getImm() & 1);
4020 }
4021 
4022 void AMDGPUInstructionSelector::renderExtractSLC(MachineInstrBuilder &MIB,
4023                                                  const MachineInstr &MI,
4024                                                  int OpIdx) const {
4025   assert(OpIdx >= 0 && "expected to match an immediate operand");
4026   MIB.addImm((MI.getOperand(OpIdx).getImm() >> 1) & 1);
4027 }
4028 
4029 void AMDGPUInstructionSelector::renderExtractDLC(MachineInstrBuilder &MIB,
4030                                                  const MachineInstr &MI,
4031                                                  int OpIdx) const {
4032   assert(OpIdx >= 0 && "expected to match an immediate operand");
4033   MIB.addImm((MI.getOperand(OpIdx).getImm() >> 2) & 1);
4034 }
4035 
4036 void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
4037                                                  const MachineInstr &MI,
4038                                                  int OpIdx) const {
4039   assert(OpIdx >= 0 && "expected to match an immediate operand");
4040   MIB.addImm((MI.getOperand(OpIdx).getImm() >> 3) & 1);
4041 }
4042 
4043 void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
4044                                                  const MachineInstr &MI,
4045                                                  int OpIdx) const {
4046   MIB.addFrameIndex((MI.getOperand(1).getIndex()));
4047 }
4048 
4049 bool AMDGPUInstructionSelector::isInlineImmediate16(int64_t Imm) const {
4050   return AMDGPU::isInlinableLiteral16(Imm, STI.hasInv2PiInlineImm());
4051 }
4052 
4053 bool AMDGPUInstructionSelector::isInlineImmediate32(int64_t Imm) const {
4054   return AMDGPU::isInlinableLiteral32(Imm, STI.hasInv2PiInlineImm());
4055 }
4056 
4057 bool AMDGPUInstructionSelector::isInlineImmediate64(int64_t Imm) const {
4058   return AMDGPU::isInlinableLiteral64(Imm, STI.hasInv2PiInlineImm());
4059 }
4060 
4061 bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
4062   return TII.isInlineConstant(Imm);
4063 }
4064