1 //===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the InstructionSelector class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPUInstructionSelector.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
17 #include "AMDGPURegisterBankInfo.h"
18 #include "AMDGPUSubtarget.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
21 #include "SIMachineFunctionInfo.h"
22 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
23 #include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
24 #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
25 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
26 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
27 #include "llvm/CodeGen/GlobalISel/Utils.h"
28 #include "llvm/CodeGen/MachineBasicBlock.h"
29 #include "llvm/CodeGen/MachineFunction.h"
30 #include "llvm/CodeGen/MachineInstr.h"
31 #include "llvm/CodeGen/MachineInstrBuilder.h"
32 #include "llvm/CodeGen/MachineRegisterInfo.h"
33 #include "llvm/IR/Type.h"
34 #include "llvm/Support/Debug.h"
35 #include "llvm/Support/raw_ostream.h"
36 
37 #define DEBUG_TYPE "amdgpu-isel"
38 
39 using namespace llvm;
40 using namespace MIPatternMatch;
41 
42 static cl::opt<bool> AllowRiskySelect(
43   "amdgpu-global-isel-risky-select",
44   cl::desc("Allow GlobalISel to select cases that are likely to not work yet"),
45   cl::init(false),
46   cl::ReallyHidden);
47 
48 #define GET_GLOBALISEL_IMPL
49 #define AMDGPUSubtarget GCNSubtarget
50 #include "AMDGPUGenGlobalISel.inc"
51 #undef GET_GLOBALISEL_IMPL
52 #undef AMDGPUSubtarget
53 
54 AMDGPUInstructionSelector::AMDGPUInstructionSelector(
55     const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
56     const AMDGPUTargetMachine &TM)
57     : InstructionSelector(), TII(*STI.getInstrInfo()),
58       TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
59       STI(STI),
60       EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG),
61 #define GET_GLOBALISEL_PREDICATES_INIT
62 #include "AMDGPUGenGlobalISel.inc"
63 #undef GET_GLOBALISEL_PREDICATES_INIT
64 #define GET_GLOBALISEL_TEMPORARIES_INIT
65 #include "AMDGPUGenGlobalISel.inc"
66 #undef GET_GLOBALISEL_TEMPORARIES_INIT
67 {
68 }
69 
70 const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
71 
72 void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits &KB,
73                                         CodeGenCoverage &CoverageInfo) {
74   MRI = &MF.getRegInfo();
75   InstructionSelector::setupMF(MF, KB, CoverageInfo);
76 }
77 
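// Return true if \p Reg holds a wave lane-mask value: the physical VCC
// register, a 1-bit virtual register constrained to the boolean register
// class, or a virtual register assigned to the VCC register bank.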
78 bool AMDGPUInstructionSelector::isVCC(Register Reg,
79                                       const MachineRegisterInfo &MRI) const {
80   if (Register::isPhysicalRegister(Reg))
81     return Reg == TRI.getVCC();
82 
83   auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
84   const TargetRegisterClass *RC =
85       RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
86   if (RC) {
87     const LLT Ty = MRI.getType(Reg);
88     return RC->hasSuperClassEq(TRI.getBoolRC()) &&
89            Ty.isValid() && Ty.getSizeInBits() == 1;
90   }
91 
92   const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
93   return RB->getID() == AMDGPU::VCCRegBankID;
94 }
95 
96 bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
97                                                         unsigned NewOpc) const {
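  // Rewrite the intrinsic into its copy-like pseudo: drop the intrinsic ID
  // operand and add an implicit use of exec.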
98   MI.setDesc(TII.get(NewOpc));
99   MI.RemoveOperand(1); // Remove intrinsic ID.
100   MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
101 
102   MachineOperand &Dst = MI.getOperand(0);
103   MachineOperand &Src = MI.getOperand(1);
104 
105   // TODO: This should be legalized to s32 if needed
106   if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
107     return false;
108 
109   const TargetRegisterClass *DstRC
110     = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
111   const TargetRegisterClass *SrcRC
112     = TRI.getConstrainedRegClassForOperand(Src, *MRI);
113   if (!DstRC || DstRC != SrcRC)
114     return false;
115 
116   return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
117          RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
118 }
119 
120 bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
121   const DebugLoc &DL = I.getDebugLoc();
122   MachineBasicBlock *BB = I.getParent();
123   I.setDesc(TII.get(TargetOpcode::COPY));
124 
125   const MachineOperand &Src = I.getOperand(1);
126   MachineOperand &Dst = I.getOperand(0);
127   Register DstReg = Dst.getReg();
128   Register SrcReg = Src.getReg();
129 
130   if (isVCC(DstReg, *MRI)) {
131     if (SrcReg == AMDGPU::SCC) {
132       const TargetRegisterClass *RC
133         = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
134       if (!RC)
135         return true;
136       return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
137     }
138 
139     if (!isVCC(SrcReg, *MRI)) {
140       // TODO: Should probably leave the copy and let copyPhysReg expand it.
141       if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
142         return false;
143 
144       const TargetRegisterClass *SrcRC
145         = TRI.getConstrainedRegClassForOperand(Src, *MRI);
146 
147       Register MaskedReg = MRI->createVirtualRegister(SrcRC);
148 
149       // We can't trust the high bits at this point, so clear them.
150 
151       // TODO: Skip masking high bits if def is known boolean.
152 
153       unsigned AndOpc = TRI.isSGPRClass(SrcRC) ?
154         AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
155       BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
156         .addImm(1)
157         .addReg(SrcReg);
158       BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
159         .addImm(0)
160         .addReg(MaskedReg);
161 
162       if (!MRI->getRegClassOrNull(SrcReg))
163         MRI->setRegClass(SrcReg, SrcRC);
164       I.eraseFromParent();
165       return true;
166     }
167 
168     const TargetRegisterClass *RC =
169       TRI.getConstrainedRegClassForOperand(Dst, *MRI);
170     if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
171       return false;
172 
173     return true;
174   }
175 
176   for (const MachineOperand &MO : I.operands()) {
177     if (Register::isPhysicalRegister(MO.getReg()))
178       continue;
179 
180     const TargetRegisterClass *RC =
181             TRI.getConstrainedRegClassForOperand(MO, *MRI);
182     if (!RC)
183       continue;
184     RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
185   }
186   return true;
187 }
188 
189 bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
190   const Register DefReg = I.getOperand(0).getReg();
191   const LLT DefTy = MRI->getType(DefReg);
192   if (DefTy == LLT::scalar(1)) {
193     if (!AllowRiskySelect) {
194       LLVM_DEBUG(dbgs() << "Skipping risky boolean phi\n");
195       return false;
196     }
197 
198     LLVM_DEBUG(dbgs() << "Selecting risky boolean phi\n");
199   }
200 
201   // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
202 
203   const RegClassOrRegBank &RegClassOrBank =
204     MRI->getRegClassOrRegBank(DefReg);
205 
206   const TargetRegisterClass *DefRC
207     = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
208   if (!DefRC) {
209     if (!DefTy.isValid()) {
210       LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
211       return false;
212     }
213 
214     const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
215     DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, *MRI);
216     if (!DefRC) {
217       LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
218       return false;
219     }
220   }
221 
222   // TODO: Verify that all registers have the same bank
223   I.setDesc(TII.get(TargetOpcode::PHI));
224   return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
225 }
226 
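// Return a machine operand for the requested 32-bit half (sub0 or sub1) of a
// 64-bit register or immediate operand, emitting a subregister copy for the
// register case.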
227 MachineOperand
228 AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
229                                            const TargetRegisterClass &SubRC,
230                                            unsigned SubIdx) const {
231 
232   MachineInstr *MI = MO.getParent();
233   MachineBasicBlock *BB = MO.getParent()->getParent();
234   Register DstReg = MRI->createVirtualRegister(&SubRC);
235 
236   if (MO.isReg()) {
237     unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
238     Register Reg = MO.getReg();
239     BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
240             .addReg(Reg, 0, ComposedSubIdx);
241 
242     return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
243                                      MO.isKill(), MO.isDead(), MO.isUndef(),
244                                      MO.isEarlyClobber(), 0, MO.isDebug(),
245                                      MO.isInternalRead());
246   }
247 
248   assert(MO.isImm());
249 
250   APInt Imm(64, MO.getImm());
251 
252   switch (SubIdx) {
253   default:
    llvm_unreachable("don't know how to split immediate with this sub index.");
255   case AMDGPU::sub0:
256     return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
257   case AMDGPU::sub1:
258     return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
259   }
260 }
261 
262 static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
263   switch (Opc) {
264   case AMDGPU::G_AND:
265     return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
266   case AMDGPU::G_OR:
267     return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
268   case AMDGPU::G_XOR:
269     return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
270   default:
271     llvm_unreachable("not a bit op");
272   }
273 }
274 
275 bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
276   Register DstReg = I.getOperand(0).getReg();
277   unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
278 
279   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
280   if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
281       DstRB->getID() != AMDGPU::VCCRegBankID)
282     return false;
283 
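  // A VCC result is a wave-wide lane mask, so it needs the 64-bit opcode on
  // wave64 regardless of the nominal type size.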
284   bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
285                             STI.isWave64());
286   I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));
287 
288   // Dead implicit-def of scc
289   I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
290                                          true, // isImp
291                                          false, // isKill
292                                          true)); // isDead
293   return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
294 }
295 
296 bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
297   MachineBasicBlock *BB = I.getParent();
298   MachineFunction *MF = BB->getParent();
299   Register DstReg = I.getOperand(0).getReg();
300   const DebugLoc &DL = I.getDebugLoc();
301   LLT Ty = MRI->getType(DstReg);
302   if (Ty.isVector())
303     return false;
304 
305   unsigned Size = Ty.getSizeInBits();
306   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
307   const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
308   const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;
309 
310   if (Size == 32) {
311     if (IsSALU) {
312       const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
313       MachineInstr *Add =
314         BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
315         .add(I.getOperand(1))
316         .add(I.getOperand(2));
317       I.eraseFromParent();
318       return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
319     }
320 
321     if (STI.hasAddNoCarry()) {
322       const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
323       I.setDesc(TII.get(Opc));
324       I.addOperand(*MF, MachineOperand::CreateImm(0));
325       I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
326       return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
327     }
328 
329     const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;
330 
331     Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
332     MachineInstr *Add
333       = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
334       .addDef(UnusedCarry, RegState::Dead)
335       .add(I.getOperand(1))
336       .add(I.getOperand(2))
337       .addImm(0);
338     I.eraseFromParent();
339     return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
340   }
341 
342   assert(!Sub && "illegal sub should not reach here");
343 
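  // Split the 64-bit add into a low half that produces a carry and a high half
  // that consumes it, then recombine the two halves with a REG_SEQUENCE.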
344   const TargetRegisterClass &RC
345     = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
346   const TargetRegisterClass &HalfRC
347     = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;
348 
349   MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
350   MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
351   MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
352   MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));
353 
354   Register DstLo = MRI->createVirtualRegister(&HalfRC);
355   Register DstHi = MRI->createVirtualRegister(&HalfRC);
356 
357   if (IsSALU) {
358     BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
359       .add(Lo1)
360       .add(Lo2);
361     BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
362       .add(Hi1)
363       .add(Hi2);
364   } else {
365     const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
366     Register CarryReg = MRI->createVirtualRegister(CarryRC);
367     BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
368       .addDef(CarryReg)
369       .add(Lo1)
370       .add(Lo2)
371       .addImm(0);
372     MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
373       .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
374       .add(Hi1)
375       .add(Hi2)
376       .addReg(CarryReg, RegState::Kill)
377       .addImm(0);
378 
379     if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
380       return false;
381   }
382 
383   BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
384     .addReg(DstLo)
385     .addImm(AMDGPU::sub0)
386     .addReg(DstHi)
387     .addImm(AMDGPU::sub1);
388 
390   if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
391     return false;
392 
393   I.eraseFromParent();
394   return true;
395 }
396 
397 bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
398   MachineInstr &I) const {
399   MachineBasicBlock *BB = I.getParent();
400   MachineFunction *MF = BB->getParent();
401   const DebugLoc &DL = I.getDebugLoc();
402   Register Dst0Reg = I.getOperand(0).getReg();
403   Register Dst1Reg = I.getOperand(1).getReg();
404   const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
405                      I.getOpcode() == AMDGPU::G_UADDE;
406   const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
407                           I.getOpcode() == AMDGPU::G_USUBE;
408 
409   if (isVCC(Dst1Reg, *MRI)) {
410     unsigned NoCarryOpc =
411         IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
412     unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
413     I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
414     I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
415     I.addOperand(*MF, MachineOperand::CreateImm(0));
416     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
417   }
418 
419   Register Src0Reg = I.getOperand(2).getReg();
420   Register Src1Reg = I.getOperand(3).getReg();
421 
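  // Scalar path: materialize any carry-in in SCC before the add/sub, and copy
  // the carry-out back out of SCC afterwards.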
422   if (HasCarryIn) {
423     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
424       .addReg(I.getOperand(4).getReg());
425   }
426 
427   unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
428   unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
429 
430   BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
431     .add(I.getOperand(2))
432     .add(I.getOperand(3));
433   BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
434     .addReg(AMDGPU::SCC);
435 
436   if (!MRI->getRegClassOrNull(Dst1Reg))
437     MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
438 
439   if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
440       !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
441       !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
442     return false;
443 
444   if (HasCarryIn &&
445       !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
446                                     AMDGPU::SReg_32RegClass, *MRI))
447     return false;
448 
449   I.eraseFromParent();
450   return true;
451 }
452 
453 // TODO: We should probably legalize these to only using 32-bit results.
454 bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
455   MachineBasicBlock *BB = I.getParent();
456   Register DstReg = I.getOperand(0).getReg();
457   Register SrcReg = I.getOperand(1).getReg();
458   LLT DstTy = MRI->getType(DstReg);
459   LLT SrcTy = MRI->getType(SrcReg);
460   const unsigned SrcSize = SrcTy.getSizeInBits();
461   unsigned DstSize = DstTy.getSizeInBits();
462 
463   // TODO: Should handle any multiple of 32 offset.
464   unsigned Offset = I.getOperand(2).getImm();
465   if (Offset % 32 != 0 || DstSize > 128)
466     return false;
467 
468   // 16-bit operations really use 32-bit registers.
469   // FIXME: Probably should not allow 16-bit G_EXTRACT results.
470   if (DstSize == 16)
471     DstSize = 32;
472 
473   const TargetRegisterClass *DstRC =
474     TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
475   if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
476     return false;
477 
478   const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
479   const TargetRegisterClass *SrcRC =
480     TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
481   if (!SrcRC)
482     return false;
483   unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
484                                                          DstSize / 32);
485   SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
486   if (!SrcRC)
487     return false;
488 
489   SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
490                                     *SrcRC, I.getOperand(1));
491   const DebugLoc &DL = I.getDebugLoc();
492   BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
493     .addReg(SrcReg, 0, SubReg);
494 
495   I.eraseFromParent();
496   return true;
497 }
498 
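// Pieces narrower than 32 bits are left to the imported TableGen patterns;
// otherwise the result is assembled with a REG_SEQUENCE.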
499 bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
500   MachineBasicBlock *BB = MI.getParent();
501   Register DstReg = MI.getOperand(0).getReg();
502   LLT DstTy = MRI->getType(DstReg);
503   LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());
504 
505   const unsigned SrcSize = SrcTy.getSizeInBits();
506   if (SrcSize < 32)
507     return selectImpl(MI, *CoverageInfo);
508 
509   const DebugLoc &DL = MI.getDebugLoc();
510   const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
511   const unsigned DstSize = DstTy.getSizeInBits();
512   const TargetRegisterClass *DstRC =
513     TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
514   if (!DstRC)
515     return false;
516 
517   ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
518   MachineInstrBuilder MIB =
519     BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
520   for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
521     MachineOperand &Src = MI.getOperand(I + 1);
522     MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
523     MIB.addImm(SubRegs[I]);
524 
525     const TargetRegisterClass *SrcRC
526       = TRI.getConstrainedRegClassForOperand(Src, *MRI);
527     if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
528       return false;
529   }
530 
531   if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
532     return false;
533 
534   MI.eraseFromParent();
535   return true;
536 }
537 
538 bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
539   MachineBasicBlock *BB = MI.getParent();
540   const int NumDst = MI.getNumOperands() - 1;
541 
542   MachineOperand &Src = MI.getOperand(NumDst);
543 
544   Register SrcReg = Src.getReg();
545   Register DstReg0 = MI.getOperand(0).getReg();
546   LLT DstTy = MRI->getType(DstReg0);
547   LLT SrcTy = MRI->getType(SrcReg);
548 
549   const unsigned DstSize = DstTy.getSizeInBits();
550   const unsigned SrcSize = SrcTy.getSizeInBits();
551   const DebugLoc &DL = MI.getDebugLoc();
552   const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
553 
554   const TargetRegisterClass *SrcRC =
555     TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
556   if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
557     return false;
558 
559   const unsigned SrcFlags = getUndefRegState(Src.isUndef());
560 
561   // Note we could have mixed SGPR and VGPR destination banks for an SGPR
562   // source, and this relies on the fact that the same subregister indices are
563   // used for both.
564   ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
565   for (int I = 0, E = NumDst; I != E; ++I) {
566     MachineOperand &Dst = MI.getOperand(I);
567     BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
568       .addReg(SrcReg, SrcFlags, SubRegs[I]);
569 
570     // Make sure the subregister index is valid for the source register.
571     SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
572     if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
573       return false;
574 
575     const TargetRegisterClass *DstRC =
576       TRI.getConstrainedRegClassForOperand(Dst, *MRI);
577     if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
578       return false;
579   }
580 
581   MI.eraseFromParent();
582   return true;
583 }
584 
585 bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC(
586   MachineInstr &MI) const {
587   if (selectImpl(MI, *CoverageInfo))
588     return true;
589 
590   const LLT S32 = LLT::scalar(32);
591   const LLT V2S16 = LLT::vector(2, 16);
592 
593   Register Dst = MI.getOperand(0).getReg();
594   if (MRI->getType(Dst) != V2S16)
595     return false;
596 
597   const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
598   if (DstBank->getID() != AMDGPU::SGPRRegBankID)
599     return false;
600 
601   Register Src0 = MI.getOperand(1).getReg();
602   Register Src1 = MI.getOperand(2).getReg();
603   if (MRI->getType(Src0) != S32)
604     return false;
605 
606   const DebugLoc &DL = MI.getDebugLoc();
607   MachineBasicBlock *BB = MI.getParent();
608 
609   auto ConstSrc1 = getConstantVRegValWithLookThrough(Src1, *MRI, true, true);
610   if (ConstSrc1) {
611     auto ConstSrc0 = getConstantVRegValWithLookThrough(Src0, *MRI, true, true);
612     if (ConstSrc0) {
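      // Both operands are constants, so fold the pack into a single s_mov_b32
      // of the combined 32-bit value.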
613       uint32_t Lo16 = static_cast<uint32_t>(ConstSrc0->Value) & 0xffff;
614       uint32_t Hi16 = static_cast<uint32_t>(ConstSrc1->Value) & 0xffff;
615 
616       BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst)
617         .addImm(Lo16 | (Hi16 << 16));
618       MI.eraseFromParent();
619       return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
620     }
621   }
622 
623   // TODO: This should probably be a combine somewhere
  // (build_vector_trunc $src0, undef) -> (copy $src0)
625   MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
626   if (Src1Def && Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
627     MI.setDesc(TII.get(AMDGPU::COPY));
628     MI.RemoveOperand(2);
629     return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI) &&
630            RBI.constrainGenericRegister(Src0, AMDGPU::SReg_32RegClass, *MRI);
631   }
632 
633   Register ShiftSrc0;
634   Register ShiftSrc1;
635   int64_t ShiftAmt;
636 
637   // With multiple uses of the shift, this will duplicate the shift and
638   // increase register pressure.
639   //
  // (build_vector_trunc (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16))
641   //  => (S_PACK_HH_B32_B16 $src0, $src1)
642   // (build_vector_trunc $src0, (lshr_oneuse SReg_32:$src1, 16))
643   //  => (S_PACK_LH_B32_B16 $src0, $src1)
644   // (build_vector_trunc $src0, $src1)
645   //  => (S_PACK_LL_B32_B16 $src0, $src1)
646 
647   // FIXME: This is an inconvenient way to check a specific value
648   bool Shift0 = mi_match(
649     Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_ICst(ShiftAmt)))) &&
650     ShiftAmt == 16;
651 
652   bool Shift1 = mi_match(
653     Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_ICst(ShiftAmt)))) &&
654     ShiftAmt == 16;
655 
656   unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
657   if (Shift0 && Shift1) {
658     Opc = AMDGPU::S_PACK_HH_B32_B16;
659     MI.getOperand(1).setReg(ShiftSrc0);
660     MI.getOperand(2).setReg(ShiftSrc1);
661   } else if (Shift1) {
662     Opc = AMDGPU::S_PACK_LH_B32_B16;
663     MI.getOperand(2).setReg(ShiftSrc1);
664   } else if (Shift0 && ConstSrc1 && ConstSrc1->Value == 0) {
665     // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
666     auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
667       .addReg(ShiftSrc0)
668       .addImm(16);
669 
670     MI.eraseFromParent();
671     return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
672   }
673 
674   MI.setDesc(TII.get(Opc));
675   return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
676 }
677 
678 bool AMDGPUInstructionSelector::selectG_PTR_ADD(MachineInstr &I) const {
679   return selectG_ADD_SUB(I);
680 }
681 
682 bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
683   const MachineOperand &MO = I.getOperand(0);
684 
685   // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
686   // regbank check here is to know why getConstrainedRegClassForOperand failed.
687   const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
688   if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
689       (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
690     I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
691     return true;
692   }
693 
694   return false;
695 }
696 
697 bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
698   MachineBasicBlock *BB = I.getParent();
699 
700   Register DstReg = I.getOperand(0).getReg();
701   Register Src0Reg = I.getOperand(1).getReg();
702   Register Src1Reg = I.getOperand(2).getReg();
703   LLT Src1Ty = MRI->getType(Src1Reg);
704 
705   unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
706   unsigned InsSize = Src1Ty.getSizeInBits();
707 
708   int64_t Offset = I.getOperand(3).getImm();
709 
710   // FIXME: These cases should have been illegal and unnecessary to check here.
711   if (Offset % 32 != 0 || InsSize % 32 != 0)
712     return false;
713 
714   // Currently not handled by getSubRegFromChannel.
715   if (InsSize > 128)
716     return false;
717 
718   unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
719   if (SubReg == AMDGPU::NoSubRegister)
720     return false;
721 
722   const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
723   const TargetRegisterClass *DstRC =
724     TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
725   if (!DstRC)
726     return false;
727 
728   const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
729   const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
730   const TargetRegisterClass *Src0RC =
731     TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank, *MRI);
732   const TargetRegisterClass *Src1RC =
733     TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank, *MRI);
734 
735   // Deal with weird cases where the class only partially supports the subreg
736   // index.
737   Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
738   if (!Src0RC || !Src1RC)
739     return false;
740 
741   if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
742       !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
743       !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
744     return false;
745 
746   const DebugLoc &DL = I.getDebugLoc();
747   BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
748     .addReg(Src0Reg)
749     .addReg(Src1Reg)
750     .addImm(SubReg);
751 
752   I.eraseFromParent();
753   return true;
754 }
755 
756 bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
757   if (STI.getLDSBankCount() != 16)
758     return selectImpl(MI, *CoverageInfo);
759 
760   Register Dst = MI.getOperand(0).getReg();
761   Register Src0 = MI.getOperand(2).getReg();
762   Register M0Val = MI.getOperand(6).getReg();
763   if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
764       !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
765       !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
766     return false;
767 
768   // This requires 2 instructions. It is possible to write a pattern to support
769   // this, but the generated isel emitter doesn't correctly deal with multiple
770   // output instructions using the same physical register input. The copy to m0
771   // is incorrectly placed before the second instruction.
772   //
773   // TODO: Match source modifiers.
774 
775   Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
776   const DebugLoc &DL = MI.getDebugLoc();
777   MachineBasicBlock *MBB = MI.getParent();
778 
779   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
780     .addReg(M0Val);
781   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
782     .addImm(2)
783     .addImm(MI.getOperand(4).getImm())  // $attr
784     .addImm(MI.getOperand(3).getImm()); // $attrchan
785 
786   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
787     .addImm(0)                          // $src0_modifiers
788     .addReg(Src0)                       // $src0
789     .addImm(MI.getOperand(4).getImm())  // $attr
790     .addImm(MI.getOperand(3).getImm())  // $attrchan
791     .addImm(0)                          // $src2_modifiers
792     .addReg(InterpMov)                  // $src2 - 2 f16 values selected by high
793     .addImm(MI.getOperand(5).getImm())  // $high
794     .addImm(0)                          // $clamp
795     .addImm(0);                         // $omod
796 
797   MI.eraseFromParent();
798   return true;
799 }
800 
801 // We need to handle this here because tablegen doesn't support matching
802 // instructions with multiple outputs.
803 bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
804   Register Dst0 = MI.getOperand(0).getReg();
805   Register Dst1 = MI.getOperand(1).getReg();
806 
807   LLT Ty = MRI->getType(Dst0);
808   unsigned Opc;
809   if (Ty == LLT::scalar(32))
810     Opc = AMDGPU::V_DIV_SCALE_F32;
811   else if (Ty == LLT::scalar(64))
812     Opc = AMDGPU::V_DIV_SCALE_F64;
813   else
814     return false;
815 
816   const DebugLoc &DL = MI.getDebugLoc();
817   MachineBasicBlock *MBB = MI.getParent();
818 
819   Register Numer = MI.getOperand(3).getReg();
820   Register Denom = MI.getOperand(4).getReg();
821   unsigned ChooseDenom = MI.getOperand(5).getImm();
822 
823   Register Src0 = ChooseDenom != 0 ? Numer : Denom;
824 
825   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
826     .addDef(Dst1)
827     .addUse(Src0)
828     .addUse(Denom)
829     .addUse(Numer);
830 
831   MI.eraseFromParent();
832   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
833 }
834 
835 bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
836   unsigned IntrinsicID = I.getIntrinsicID();
837   switch (IntrinsicID) {
838   case Intrinsic::amdgcn_if_break: {
839     MachineBasicBlock *BB = I.getParent();
840 
    // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
842     // SelectionDAG uses for wave32 vs wave64.
843     BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
844       .add(I.getOperand(0))
845       .add(I.getOperand(2))
846       .add(I.getOperand(3));
847 
848     Register DstReg = I.getOperand(0).getReg();
849     Register Src0Reg = I.getOperand(2).getReg();
850     Register Src1Reg = I.getOperand(3).getReg();
851 
852     I.eraseFromParent();
853 
854     for (Register Reg : { DstReg, Src0Reg, Src1Reg })
855       MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
856 
857     return true;
858   }
859   case Intrinsic::amdgcn_interp_p1_f16:
860     return selectInterpP1F16(I);
861   case Intrinsic::amdgcn_wqm:
862     return constrainCopyLikeIntrin(I, AMDGPU::WQM);
863   case Intrinsic::amdgcn_softwqm:
864     return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
865   case Intrinsic::amdgcn_wwm:
866     return constrainCopyLikeIntrin(I, AMDGPU::WWM);
867   case Intrinsic::amdgcn_div_scale:
868     return selectDivScale(I);
869   case Intrinsic::amdgcn_icmp:
870     return selectIntrinsicIcmp(I);
871   case Intrinsic::amdgcn_ballot:
872     return selectBallot(I);
873   case Intrinsic::amdgcn_reloc_constant:
874     return selectRelocConstant(I);
875   default:
876     return selectImpl(I, *CoverageInfo);
877   }
878 }
879 
880 static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size) {
881   if (Size != 32 && Size != 64)
882     return -1;
883   switch (P) {
884   default:
885     llvm_unreachable("Unknown condition code!");
886   case CmpInst::ICMP_NE:
887     return Size == 32 ? AMDGPU::V_CMP_NE_U32_e64 : AMDGPU::V_CMP_NE_U64_e64;
888   case CmpInst::ICMP_EQ:
889     return Size == 32 ? AMDGPU::V_CMP_EQ_U32_e64 : AMDGPU::V_CMP_EQ_U64_e64;
890   case CmpInst::ICMP_SGT:
891     return Size == 32 ? AMDGPU::V_CMP_GT_I32_e64 : AMDGPU::V_CMP_GT_I64_e64;
892   case CmpInst::ICMP_SGE:
893     return Size == 32 ? AMDGPU::V_CMP_GE_I32_e64 : AMDGPU::V_CMP_GE_I64_e64;
894   case CmpInst::ICMP_SLT:
895     return Size == 32 ? AMDGPU::V_CMP_LT_I32_e64 : AMDGPU::V_CMP_LT_I64_e64;
896   case CmpInst::ICMP_SLE:
897     return Size == 32 ? AMDGPU::V_CMP_LE_I32_e64 : AMDGPU::V_CMP_LE_I64_e64;
898   case CmpInst::ICMP_UGT:
899     return Size == 32 ? AMDGPU::V_CMP_GT_U32_e64 : AMDGPU::V_CMP_GT_U64_e64;
900   case CmpInst::ICMP_UGE:
901     return Size == 32 ? AMDGPU::V_CMP_GE_U32_e64 : AMDGPU::V_CMP_GE_U64_e64;
902   case CmpInst::ICMP_ULT:
903     return Size == 32 ? AMDGPU::V_CMP_LT_U32_e64 : AMDGPU::V_CMP_LT_U64_e64;
904   case CmpInst::ICMP_ULE:
905     return Size == 32 ? AMDGPU::V_CMP_LE_U32_e64 : AMDGPU::V_CMP_LE_U64_e64;
906   }
907 }
908 
909 int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
910                                               unsigned Size) const {
911   if (Size == 64) {
912     if (!STI.hasScalarCompareEq64())
913       return -1;
914 
915     switch (P) {
916     case CmpInst::ICMP_NE:
917       return AMDGPU::S_CMP_LG_U64;
918     case CmpInst::ICMP_EQ:
919       return AMDGPU::S_CMP_EQ_U64;
920     default:
921       return -1;
922     }
923   }
924 
925   if (Size != 32)
926     return -1;
927 
928   switch (P) {
929   case CmpInst::ICMP_NE:
930     return AMDGPU::S_CMP_LG_U32;
931   case CmpInst::ICMP_EQ:
932     return AMDGPU::S_CMP_EQ_U32;
933   case CmpInst::ICMP_SGT:
934     return AMDGPU::S_CMP_GT_I32;
935   case CmpInst::ICMP_SGE:
936     return AMDGPU::S_CMP_GE_I32;
937   case CmpInst::ICMP_SLT:
938     return AMDGPU::S_CMP_LT_I32;
939   case CmpInst::ICMP_SLE:
940     return AMDGPU::S_CMP_LE_I32;
941   case CmpInst::ICMP_UGT:
942     return AMDGPU::S_CMP_GT_U32;
943   case CmpInst::ICMP_UGE:
944     return AMDGPU::S_CMP_GE_U32;
945   case CmpInst::ICMP_ULT:
946     return AMDGPU::S_CMP_LT_U32;
947   case CmpInst::ICMP_ULE:
948     return AMDGPU::S_CMP_LE_U32;
949   default:
950     llvm_unreachable("Unknown condition code!");
951   }
952 }
953 
954 bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const {
955   MachineBasicBlock *BB = I.getParent();
956   const DebugLoc &DL = I.getDebugLoc();
957 
958   Register SrcReg = I.getOperand(2).getReg();
959   unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
960 
961   auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
962 
963   Register CCReg = I.getOperand(0).getReg();
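  // A non-VCC result means this is a uniform compare: use a scalar compare and
  // copy SCC into the destination SGPR. Otherwise emit a VALU compare that
  // writes the full wave lane mask.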
964   if (!isVCC(CCReg, *MRI)) {
965     int Opcode = getS_CMPOpcode(Pred, Size);
966     if (Opcode == -1)
967       return false;
968     MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
969             .add(I.getOperand(2))
970             .add(I.getOperand(3));
971     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
972       .addReg(AMDGPU::SCC);
973     bool Ret =
974         constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
975         RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
976     I.eraseFromParent();
977     return Ret;
978   }
979 
980   int Opcode = getV_CMPOpcode(Pred, Size);
981   if (Opcode == -1)
982     return false;
983 
984   MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode),
985             I.getOperand(0).getReg())
986             .add(I.getOperand(2))
987             .add(I.getOperand(3));
988   RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
989                                *TRI.getBoolRC(), *MRI);
990   bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
991   I.eraseFromParent();
992   return Ret;
993 }
994 
995 bool AMDGPUInstructionSelector::selectIntrinsicIcmp(MachineInstr &I) const {
996   Register Dst = I.getOperand(0).getReg();
997   if (isVCC(Dst, *MRI))
998     return false;
999 
1000   if (MRI->getType(Dst).getSizeInBits() != STI.getWavefrontSize())
1001     return false;
1002 
1003   MachineBasicBlock *BB = I.getParent();
1004   const DebugLoc &DL = I.getDebugLoc();
1005   Register SrcReg = I.getOperand(2).getReg();
1006   unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1007   auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
1008 
1009   int Opcode = getV_CMPOpcode(Pred, Size);
1010   if (Opcode == -1)
1011     return false;
1012 
1013   MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst)
1014                            .add(I.getOperand(2))
1015                            .add(I.getOperand(3));
1016   RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(), *TRI.getBoolRC(),
1017                                *MRI);
1018   bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1019   I.eraseFromParent();
1020   return Ret;
1021 }
1022 
1023 bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1024   MachineBasicBlock *BB = I.getParent();
1025   const DebugLoc &DL = I.getDebugLoc();
1026   Register DstReg = I.getOperand(0).getReg();
1027   const unsigned Size = MRI->getType(DstReg).getSizeInBits();
1028   const bool Is64 = Size == 64;
1029 
1030   if (Size != STI.getWavefrontSize())
1031     return false;
1032 
1033   Optional<ValueAndVReg> Arg =
1034       getConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI, true);
1035 
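  // Fold a ballot of a constant condition: zero gives an all-zero mask and -1
  // (true) gives a copy of exec; any other constant is rejected. A non-constant
  // source is assumed to already be a lane mask and is copied directly.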
1036   if (Arg.hasValue()) {
1037     const int64_t Value = Arg.getValue().Value;
1038     if (Value == 0) {
1039       unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1040       BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
1041     } else if (Value == -1) { // all ones
1042       Register SrcReg = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
1043       BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
1044     } else
1045       return false;
1046   } else {
1047     Register SrcReg = I.getOperand(2).getReg();
1048     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
1049   }
1050 
1051   I.eraseFromParent();
1052   return true;
1053 }
1054 
1055 bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1056   Register DstReg = I.getOperand(0).getReg();
1057   const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1058   const TargetRegisterClass *DstRC =
1059     TRI.getRegClassForSizeOnBank(32, *DstBank, *MRI);
1060   if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
1061     return false;
1062 
1063   const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1064 
1065   Module *M = MF->getFunction().getParent();
1066   const MDNode *Metadata = I.getOperand(2).getMetadata();
1067   auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
1068   auto RelocSymbol = cast<GlobalVariable>(
1069     M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
1070 
1071   MachineBasicBlock *BB = I.getParent();
1072   BuildMI(*BB, &I, I.getDebugLoc(),
1073           TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
1074     .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);
1075 
1076   I.eraseFromParent();
1077   return true;
1078 }
1079 
1080 bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
  // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1082   // SelectionDAG uses for wave32 vs wave64.
1083   MachineBasicBlock *BB = MI.getParent();
1084   BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1085       .add(MI.getOperand(1));
1086 
1087   Register Reg = MI.getOperand(1).getReg();
1088   MI.eraseFromParent();
1089 
1090   if (!MRI->getRegClassOrNull(Reg))
1091     MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1092   return true;
1093 }
1094 
1095 static unsigned getDSShaderTypeValue(const MachineFunction &MF) {
1096   switch (MF.getFunction().getCallingConv()) {
1097   case CallingConv::AMDGPU_PS:
1098     return 1;
1099   case CallingConv::AMDGPU_VS:
1100     return 2;
1101   case CallingConv::AMDGPU_GS:
1102     return 3;
1103   case CallingConv::AMDGPU_HS:
1104   case CallingConv::AMDGPU_LS:
1105   case CallingConv::AMDGPU_ES:
1106     report_fatal_error("ds_ordered_count unsupported for this calling conv");
1107   case CallingConv::AMDGPU_CS:
1108   case CallingConv::AMDGPU_KERNEL:
1109   case CallingConv::C:
1110   case CallingConv::Fast:
1111   default:
1112     // Assume other calling conventions are various compute callable functions
1113     return 0;
1114   }
1115 }
1116 
1117 bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1118   MachineInstr &MI, Intrinsic::ID IntrID) const {
1119   MachineBasicBlock *MBB = MI.getParent();
1120   MachineFunction *MF = MBB->getParent();
1121   const DebugLoc &DL = MI.getDebugLoc();
1122 
1123   unsigned IndexOperand = MI.getOperand(7).getImm();
1124   bool WaveRelease = MI.getOperand(8).getImm() != 0;
1125   bool WaveDone = MI.getOperand(9).getImm() != 0;
1126 
1127   if (WaveDone && !WaveRelease)
1128     report_fatal_error("ds_ordered_count: wave_done requires wave_release");
1129 
1130   unsigned OrderedCountIndex = IndexOperand & 0x3f;
1131   IndexOperand &= ~0x3f;
1132   unsigned CountDw = 0;
1133 
1134   if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
1135     CountDw = (IndexOperand >> 24) & 0xf;
1136     IndexOperand &= ~(0xf << 24);
1137 
1138     if (CountDw < 1 || CountDw > 4) {
1139       report_fatal_error(
1140         "ds_ordered_count: dword count must be between 1 and 4");
1141     }
1142   }
1143 
1144   if (IndexOperand)
1145     report_fatal_error("ds_ordered_count: bad index operand");
1146 
1147   unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1148   unsigned ShaderType = getDSShaderTypeValue(*MF);
1149 
1150   unsigned Offset0 = OrderedCountIndex << 2;
1151   unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) |
1152                      (Instruction << 4);
1153 
1154   if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
1155     Offset1 |= (CountDw - 1) << 6;
1156 
1157   unsigned Offset = Offset0 | (Offset1 << 8);
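  // For example (ignoring the GFX10 dword-count bits), an ordered add
  // (Instruction = 0) issued from a pixel shader (ShaderType = 1) with index 1,
  // wave_release set and wave_done clear packs to Offset0 = 4 and Offset1 = 5,
  // i.e. Offset = 0x504.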
1158 
1159   Register M0Val = MI.getOperand(2).getReg();
1160   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1161     .addReg(M0Val);
1162 
1163   Register DstReg = MI.getOperand(0).getReg();
1164   Register ValReg = MI.getOperand(3).getReg();
1165   MachineInstrBuilder DS =
1166     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
1167       .addReg(ValReg)
1168       .addImm(Offset)
1169       .cloneMemRefs(MI);
1170 
1171   if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
1172     return false;
1173 
1174   bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
1175   MI.eraseFromParent();
1176   return Ret;
1177 }
1178 
1179 static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1180   switch (IntrID) {
1181   case Intrinsic::amdgcn_ds_gws_init:
1182     return AMDGPU::DS_GWS_INIT;
1183   case Intrinsic::amdgcn_ds_gws_barrier:
1184     return AMDGPU::DS_GWS_BARRIER;
1185   case Intrinsic::amdgcn_ds_gws_sema_v:
1186     return AMDGPU::DS_GWS_SEMA_V;
1187   case Intrinsic::amdgcn_ds_gws_sema_br:
1188     return AMDGPU::DS_GWS_SEMA_BR;
1189   case Intrinsic::amdgcn_ds_gws_sema_p:
1190     return AMDGPU::DS_GWS_SEMA_P;
1191   case Intrinsic::amdgcn_ds_gws_sema_release_all:
1192     return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1193   default:
1194     llvm_unreachable("not a gws intrinsic");
1195   }
1196 }
1197 
1198 bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
1199                                                      Intrinsic::ID IID) const {
1200   if (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1201       !STI.hasGWSSemaReleaseAll())
1202     return false;
1203 
1204   // intrinsic ID, vsrc, offset
1205   const bool HasVSrc = MI.getNumOperands() == 3;
1206   assert(HasVSrc || MI.getNumOperands() == 2);
1207 
1208   Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
1209   const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
1210   if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
1211     return false;
1212 
1213   MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1214   assert(OffsetDef);
1215 
1216   unsigned ImmOffset;
1217 
1218   MachineBasicBlock *MBB = MI.getParent();
1219   const DebugLoc &DL = MI.getDebugLoc();
1220 
1221   MachineInstr *Readfirstlane = nullptr;
1222 
1223   // If we legalized the VGPR input, strip out the readfirstlane to analyze the
1224   // incoming offset, in case there's an add of a constant. We'll have to put it
1225   // back later.
1226   if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
1227     Readfirstlane = OffsetDef;
1228     BaseOffset = OffsetDef->getOperand(1).getReg();
1229     OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1230   }
1231 
1232   if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
1233     // If we have a constant offset, try to use the 0 in m0 as the base.
1234     // TODO: Look into changing the default m0 initialization value. If the
1235     // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
1236     // the immediate offset.
1237 
1238     ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
1239     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1240       .addImm(0);
1241   } else {
1242     std::tie(BaseOffset, ImmOffset, OffsetDef)
1243       = AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset);
1244 
1245     if (Readfirstlane) {
1246       // We have the constant offset now, so put the readfirstlane back on the
1247       // variable component.
1248       if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
1249         return false;
1250 
1251       Readfirstlane->getOperand(1).setReg(BaseOffset);
1252       BaseOffset = Readfirstlane->getOperand(0).getReg();
1253     } else {
1254       if (!RBI.constrainGenericRegister(BaseOffset,
1255                                         AMDGPU::SReg_32RegClass, *MRI))
1256         return false;
1257     }
1258 
1259     Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1260     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
1261       .addReg(BaseOffset)
1262       .addImm(16);
1263 
1264     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1265       .addReg(M0Base);
1266   }
1267 
1268   // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
1269   // offset field) % 64. Some versions of the programming guide omit the m0
1270   // part, or claim it's from offset 0.
1271   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));
1272 
1273   if (HasVSrc) {
1274     Register VSrc = MI.getOperand(1).getReg();
1275     MIB.addReg(VSrc);
1276     if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
1277       return false;
1278   }
1279 
1280   MIB.addImm(ImmOffset)
1281      .addImm(-1) // $gds
1282      .cloneMemRefs(MI);
1283 
1284   MI.eraseFromParent();
1285   return true;
1286 }
1287 
1288 bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
1289                                                       bool IsAppend) const {
1290   Register PtrBase = MI.getOperand(2).getReg();
1291   LLT PtrTy = MRI->getType(PtrBase);
1292   bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
1293 
1294   unsigned Offset;
1295   std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
1296 
1297   // TODO: Should this try to look through readfirstlane like GWS?
1298   if (!isDSOffsetLegal(PtrBase, Offset, 16)) {
1299     PtrBase = MI.getOperand(2).getReg();
1300     Offset = 0;
1301   }
1302 
1303   MachineBasicBlock *MBB = MI.getParent();
1304   const DebugLoc &DL = MI.getDebugLoc();
1305   const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
1306 
1307   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1308     .addReg(PtrBase);
1309   if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
1310     return false;
1311 
1312   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
1313     .addImm(Offset)
1314     .addImm(IsGDS ? -1 : 0)
1315     .cloneMemRefs(MI);
1316   MI.eraseFromParent();
1317   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1318 }
1319 
1320 static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
1321                          bool &IsTexFail) {
1322   if (TexFailCtrl)
1323     IsTexFail = true;
1324 
1325   TFE = (TexFailCtrl & 0x1) ? 1 : 0;
1326   TexFailCtrl &= ~(uint64_t)0x1;
1327   LWE = (TexFailCtrl & 0x2) ? 1 : 0;
1328   TexFailCtrl &= ~(uint64_t)0x2;
1329 
1330   return TexFailCtrl == 0;
1331 }
1332 
1333 static bool parseCachePolicy(uint64_t Value,
1334                              bool *GLC, bool *SLC, bool *DLC) {
1335   if (GLC) {
1336     *GLC = (Value & 0x1) ? 1 : 0;
1337     Value &= ~(uint64_t)0x1;
1338   }
1339   if (SLC) {
1340     *SLC = (Value & 0x2) ? 1 : 0;
1341     Value &= ~(uint64_t)0x2;
1342   }
1343   if (DLC) {
1344     *DLC = (Value & 0x4) ? 1 : 0;
1345     Value &= ~(uint64_t)0x4;
1346   }
1347 
1348   return Value == 0;
1349 }
1350 
1351 bool AMDGPUInstructionSelector::selectImageIntrinsic(
1352   MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
1353   MachineBasicBlock *MBB = MI.getParent();
1354   const DebugLoc &DL = MI.getDebugLoc();
1355 
1356   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
1357     AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1358 
1359   const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
1360   const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
1361       AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
1362   const AMDGPU::MIMGMIPMappingInfo *MIPMappingInfo =
1363       AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode);
1364   unsigned IntrOpcode = Intr->BaseOpcode;
1365   const bool IsGFX10 = STI.getGeneration() >= AMDGPUSubtarget::GFX10;
1366 
1367   const int VAddrIdx = getImageVAddrIdxBegin(BaseOpcode,
1368                                              MI.getNumExplicitDefs());
1369   int NumVAddr, NumGradients;
1370   std::tie(NumVAddr, NumGradients) = getImageNumVAddr(Intr, BaseOpcode);
1371 
1372   Register VDataIn, VDataOut;
1373   LLT VDataTy;
1374   int NumVDataDwords = -1;
1375   bool IsD16 = false;
1376 
1377   // XXX - Can we just get the second to last argument for ctrl?
1378   unsigned CtrlIdx; // Index of texfailctrl argument
1379   bool Unorm;
1380   if (!BaseOpcode->Sampler) {
1381     Unorm = true;
1382     CtrlIdx = VAddrIdx + NumVAddr + 1;
1383   } else {
1384     Unorm = MI.getOperand(VAddrIdx + NumVAddr + 2).getImm() != 0;
1385     CtrlIdx = VAddrIdx + NumVAddr + 3;
1386   }
1387 
1388   bool TFE;
1389   bool LWE;
1390   bool IsTexFail = false;
1391   if (!parseTexFail(MI.getOperand(CtrlIdx).getImm(), TFE, LWE, IsTexFail))
1392     return false;
1393 
1394   const int Flags = MI.getOperand(CtrlIdx + 2).getImm();
1395   const bool IsA16 = (Flags & 1) != 0;
1396   const bool IsG16 = (Flags & 2) != 0;
1397 
  // A16 implies 16-bit gradients
1399   if (IsA16 && !IsG16)
1400     return false;
1401 
1402   unsigned DMask = 0;
1403   unsigned DMaskLanes = 0;
1404 
1405   if (BaseOpcode->Atomic) {
1406     VDataOut = MI.getOperand(0).getReg();
1407     VDataIn = MI.getOperand(2).getReg();
1408     LLT Ty = MRI->getType(VDataIn);
1409 
1410     // Be careful to allow atomic swap on 16-bit element vectors.
1411     const bool Is64Bit = BaseOpcode->AtomicX2 ?
1412       Ty.getSizeInBits() == 128 :
1413       Ty.getSizeInBits() == 64;
1414 
1415     if (BaseOpcode->AtomicX2) {
1416       assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
1417 
1418       DMask = Is64Bit ? 0xf : 0x3;
1419       NumVDataDwords = Is64Bit ? 4 : 2;
1420     } else {
1421       DMask = Is64Bit ? 0x3 : 0x1;
1422       NumVDataDwords = Is64Bit ? 2 : 1;
1423     }
1424   } else {
1425     const int DMaskIdx = 2; // Input/output + intrinsic ID.
1426 
1427     DMask = MI.getOperand(DMaskIdx).getImm();
1428     DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);
1429 
1430     if (BaseOpcode->Store) {
1431       VDataIn = MI.getOperand(1).getReg();
1432       VDataTy = MRI->getType(VDataIn);
1433       NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
1434     } else {
1435       VDataOut = MI.getOperand(0).getReg();
1436       VDataTy = MRI->getType(VDataOut);
1437       NumVDataDwords = DMaskLanes;
1438 
1439       // One memoperand is mandatory, except for getresinfo.
1440       // FIXME: Check this in verifier.
1441       if (!MI.memoperands_empty()) {
1442         const MachineMemOperand *MMO = *MI.memoperands_begin();
1443 
1444         // Infer d16 from the memory size, as the register type will be mangled by
1445         // unpacked subtargets, or by TFE.
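        // For example, a four-lane load with an 8-byte memoperand gives
        // (8 * 8) / 4 = 16 bits per lane, so it is treated as d16.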
1446         IsD16 = ((8 * MMO->getSize()) / DMaskLanes) < 32;
1447 
1448         if (IsD16 && !STI.hasUnpackedD16VMem())
1449           NumVDataDwords = (DMaskLanes + 1) / 2;
1450       }
1451     }
1452   }
1453 
1454   // Optimize _L to _LZ when _L is zero
1455   if (LZMappingInfo) {
1456     // The legalizer replaced the register with an immediate 0 if we need to
1457     // change the opcode.
1458     const MachineOperand &Lod = MI.getOperand(VAddrIdx + NumVAddr - 1);
1459     if (Lod.isImm()) {
1460       assert(Lod.getImm() == 0);
1461       IntrOpcode = LZMappingInfo->LZ;  // set new opcode to _lz variant of _l
1462     }
1463   }
1464 
1465   // Optimize _mip away, when 'lod' is zero
1466   if (MIPMappingInfo) {
1467     const MachineOperand &Lod = MI.getOperand(VAddrIdx + NumVAddr - 1);
1468     if (Lod.isImm()) {
1469       assert(Lod.getImm() == 0);
1470       IntrOpcode = MIPMappingInfo->NONMIP;  // set new opcode to variant without _mip
1471     }
1472   }
1473 
1474   // Set G16 opcode
1475   if (IsG16 && !IsA16) {
1476     const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
1477         AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
1478     assert(G16MappingInfo);
1479     IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
1480   }
1481 
1482   // TODO: Check this in verifier.
1483   assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
1484 
1485   bool GLC = false;
1486   bool SLC = false;
1487   bool DLC = false;
1488   if (BaseOpcode->Atomic) {
1489     GLC = true; // TODO: no-return optimization
1490     if (!parseCachePolicy(MI.getOperand(CtrlIdx + 1).getImm(), nullptr, &SLC,
1491                           IsGFX10 ? &DLC : nullptr))
1492       return false;
1493   } else {
1494     if (!parseCachePolicy(MI.getOperand(CtrlIdx + 1).getImm(), &GLC, &SLC,
1495                           IsGFX10 ? &DLC : nullptr))
1496       return false;
1497   }
1498 
1499   int NumVAddrRegs = 0;
1500   int NumVAddrDwords = 0;
1501   for (int I = 0; I < NumVAddr; ++I) {
1502     // Skip the $noregs and 0s inserted during legalization.
1503     MachineOperand &AddrOp = MI.getOperand(VAddrIdx + I);
1504     if (!AddrOp.isReg())
1505       continue; // XXX - Break?
1506 
1507     Register Addr = AddrOp.getReg();
1508     if (!Addr)
1509       break;
1510 
1511     ++NumVAddrRegs;
1512     NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
1513   }
1514 
1515   // The legalizer preprocessed the intrinsic arguments. If we aren't using
1516   // NSA, these should have been packed into a single value in the first
1517   // address register.
1518   const bool UseNSA = NumVAddrRegs != 1 && NumVAddrDwords == NumVAddrRegs;
1519   if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
1520     LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
1521     return false;
1522   }
1523 
1524   if (IsTexFail)
1525     ++NumVDataDwords;
1526 
1527   int Opcode = -1;
1528   if (IsGFX10) {
1529     Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
1530                                    UseNSA ? AMDGPU::MIMGEncGfx10NSA
1531                                           : AMDGPU::MIMGEncGfx10Default,
1532                                    NumVDataDwords, NumVAddrDwords);
1533   } else {
1534     if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
1535       Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
1536                                      NumVDataDwords, NumVAddrDwords);
1537     if (Opcode == -1)
1538       Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
1539                                      NumVDataDwords, NumVAddrDwords);
1540   }
1541   assert(Opcode != -1);
1542 
1543   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
1544     .cloneMemRefs(MI);
1545 
1546   if (VDataOut) {
1547     if (BaseOpcode->AtomicX2) {
1548       const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
1549 
1550       Register TmpReg = MRI->createVirtualRegister(
1551         Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
1552       unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
1553 
1554       MIB.addDef(TmpReg);
1555       BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
1556         .addReg(TmpReg, RegState::Kill, SubReg);
1557 
1558     } else {
1559       MIB.addDef(VDataOut); // vdata output
1560     }
1561   }
1562 
1563   if (VDataIn)
1564     MIB.addReg(VDataIn); // vdata input
1565 
1566   for (int i = 0; i != NumVAddrRegs; ++i) {
1567     MachineOperand &SrcOp = MI.getOperand(VAddrIdx + i);
1568     if (SrcOp.isReg()) {
1569       assert(SrcOp.getReg() != 0);
1570       MIB.addReg(SrcOp.getReg());
1571     }
1572   }
1573 
1574   MIB.addReg(MI.getOperand(VAddrIdx + NumVAddr).getReg()); // rsrc
1575   if (BaseOpcode->Sampler)
1576     MIB.addReg(MI.getOperand(VAddrIdx + NumVAddr + 1).getReg()); // sampler
1577 
1578   MIB.addImm(DMask); // dmask
1579 
1580   if (IsGFX10)
1581     MIB.addImm(DimInfo->Encoding);
1582   MIB.addImm(Unorm);
1583   if (IsGFX10)
1584     MIB.addImm(DLC);
1585 
1586   MIB.addImm(GLC);
1587   MIB.addImm(SLC);
1588   MIB.addImm(IsA16 &&  // a16 or r128
1589              STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
1590   if (IsGFX10)
1591     MIB.addImm(IsA16 ? -1 : 0);
1592 
1593   MIB.addImm(TFE); // tfe
1594   MIB.addImm(LWE); // lwe
1595   if (!IsGFX10)
1596     MIB.addImm(DimInfo->DA ? -1 : 0);
1597   if (BaseOpcode->HasD16)
1598     MIB.addImm(IsD16 ? -1 : 0);
1599 
1600   MI.eraseFromParent();
1601   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1602 }
1603 
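// Select the side-effecting intrinsics that need manual handling; everything
// else is passed through to the TableGen'd selector.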
1604 bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
1605     MachineInstr &I) const {
1606   unsigned IntrinsicID = I.getIntrinsicID();
1607   switch (IntrinsicID) {
1608   case Intrinsic::amdgcn_end_cf:
1609     return selectEndCfIntrinsic(I);
1610   case Intrinsic::amdgcn_ds_ordered_add:
1611   case Intrinsic::amdgcn_ds_ordered_swap:
1612     return selectDSOrderedIntrinsic(I, IntrinsicID);
1613   case Intrinsic::amdgcn_ds_gws_init:
1614   case Intrinsic::amdgcn_ds_gws_barrier:
1615   case Intrinsic::amdgcn_ds_gws_sema_v:
1616   case Intrinsic::amdgcn_ds_gws_sema_br:
1617   case Intrinsic::amdgcn_ds_gws_sema_p:
1618   case Intrinsic::amdgcn_ds_gws_sema_release_all:
1619     return selectDSGWSIntrinsic(I, IntrinsicID);
1620   case Intrinsic::amdgcn_ds_append:
1621     return selectDSAppendConsume(I, true);
1622   case Intrinsic::amdgcn_ds_consume:
1623     return selectDSAppendConsume(I, false);
1624   default: {
1625     return selectImpl(I, *CoverageInfo);
1626   }
1627   }
1628 }
1629 
1630 bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
1631   if (selectImpl(I, *CoverageInfo))
1632     return true;
1633 
1634   MachineBasicBlock *BB = I.getParent();
1635   const DebugLoc &DL = I.getDebugLoc();
1636 
1637   Register DstReg = I.getOperand(0).getReg();
1638   unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
1639   assert(Size <= 32 || Size == 64);
1640   const MachineOperand &CCOp = I.getOperand(1);
1641   Register CCReg = CCOp.getReg();
1642   if (!isVCC(CCReg, *MRI)) {
1643     unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
1644                                          AMDGPU::S_CSELECT_B32;
1645     MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
1646             .addReg(CCReg);
1647 
1648     // The generic constrainSelectedInstRegOperands doesn't work for the scc
1649     // register bank, because it does not cover the register class we use to
1650     // represent scc. Manually set the register class here.
1651     if (!MRI->getRegClassOrNull(CCReg))
1652         MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
1653     MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
1654             .add(I.getOperand(2))
1655             .add(I.getOperand(3));
1656 
1657     bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI) |
1658                constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
1659     I.eraseFromParent();
1660     return Ret;
1661   }
1662 
1663   // Wide VGPR select should have been split in RegBankSelect.
1664   if (Size > 32)
1665     return false;
1666 
1667   MachineInstr *Select =
1668       BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1669               .addImm(0)
1670               .add(I.getOperand(3))
1671               .addImm(0)
1672               .add(I.getOperand(2))
1673               .add(I.getOperand(1));
1674 
1675   bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
1676   I.eraseFromParent();
1677   return Ret;
1678 }
1679 
1680 bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const {
1681   initM0(I);
1682   return selectImpl(I, *CoverageInfo);
1683 }
1684 
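// Map a value size in bits to a subregister index covering at least that many
// low bits, or -1 if the size is more than 256 bits.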
1685 static int sizeToSubRegIndex(unsigned Size) {
1686   switch (Size) {
1687   case 32:
1688     return AMDGPU::sub0;
1689   case 64:
1690     return AMDGPU::sub0_sub1;
1691   case 96:
1692     return AMDGPU::sub0_sub1_sub2;
1693   case 128:
1694     return AMDGPU::sub0_sub1_sub2_sub3;
1695   case 256:
1696     return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
1697   default:
1698     if (Size < 32)
1699       return AMDGPU::sub0;
1700     if (Size > 256)
1701       return -1;
1702     return sizeToSubRegIndex(PowerOf2Ceil(Size));
1703   }
1704 }
1705 
1706 bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
1707   Register DstReg = I.getOperand(0).getReg();
1708   Register SrcReg = I.getOperand(1).getReg();
1709   const LLT DstTy = MRI->getType(DstReg);
1710   const LLT SrcTy = MRI->getType(SrcReg);
1711   const LLT S1 = LLT::scalar(1);
1712 
1713   const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
1714   const RegisterBank *DstRB;
1715   if (DstTy == S1) {
1716     // This is a special case. We don't treat s1 for legalization artifacts as
1717     // vcc booleans.
1718     DstRB = SrcRB;
1719   } else {
1720     DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1721     if (SrcRB != DstRB)
1722       return false;
1723   }
1724 
1725   const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
1726 
1727   unsigned DstSize = DstTy.getSizeInBits();
1728   unsigned SrcSize = SrcTy.getSizeInBits();
1729 
1730   const TargetRegisterClass *SrcRC
1731     = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, *MRI);
1732   const TargetRegisterClass *DstRC
1733     = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, *MRI);
1734   if (!SrcRC || !DstRC)
1735     return false;
1736 
1737   if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
1738       !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
1739     LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
1740     return false;
1741   }
1742 
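  // Truncating <2 x s32> to <2 x s16>: pack the low 16 bits of each source
  // element into a single 32-bit register.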
1743   if (DstTy == LLT::vector(2, 16) && SrcTy == LLT::vector(2, 32)) {
1744     MachineBasicBlock *MBB = I.getParent();
1745     const DebugLoc &DL = I.getDebugLoc();
1746 
1747     Register LoReg = MRI->createVirtualRegister(DstRC);
1748     Register HiReg = MRI->createVirtualRegister(DstRC);
1749     BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
1750       .addReg(SrcReg, 0, AMDGPU::sub0);
1751     BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
1752       .addReg(SrcReg, 0, AMDGPU::sub1);
1753 
1754     if (IsVALU && STI.hasSDWA()) {
1755       // Write the low 16-bits of the high element into the high 16-bits of the
1756       // low element.
1757       MachineInstr *MovSDWA =
1758         BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
1759         .addImm(0)                             // $src0_modifiers
1760         .addReg(HiReg)                         // $src0
1761         .addImm(0)                             // $clamp
1762         .addImm(AMDGPU::SDWA::WORD_1)          // $dst_sel
1763         .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
1764         .addImm(AMDGPU::SDWA::WORD_0)          // $src0_sel
1765         .addReg(LoReg, RegState::Implicit);
1766       MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
1767     } else {
1768       Register TmpReg0 = MRI->createVirtualRegister(DstRC);
1769       Register TmpReg1 = MRI->createVirtualRegister(DstRC);
1770       Register ImmReg = MRI->createVirtualRegister(DstRC);
1771       if (IsVALU) {
1772         BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
1773           .addImm(16)
1774           .addReg(HiReg);
1775       } else {
1776         BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
1777           .addReg(HiReg)
1778           .addImm(16);
1779       }
1780 
1781       unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
1782       unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
1783       unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
1784 
1785       BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
1786         .addImm(0xffff);
1787       BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
1788         .addReg(LoReg)
1789         .addReg(ImmReg);
1790       BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
1791         .addReg(TmpReg0)
1792         .addReg(TmpReg1);
1793     }
1794 
1795     I.eraseFromParent();
1796     return true;
1797   }
1798 
1799   if (!DstTy.isScalar())
1800     return false;
1801 
1802   if (SrcSize > 32) {
1803     int SubRegIdx = sizeToSubRegIndex(DstSize);
1804     if (SubRegIdx == -1)
1805       return false;
1806 
1807     // Deal with weird cases where the class only partially supports the subreg
1808     // index.
1809     const TargetRegisterClass *SrcWithSubRC
1810       = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
1811     if (!SrcWithSubRC)
1812       return false;
1813 
1814     if (SrcWithSubRC != SrcRC) {
1815       if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
1816         return false;
1817     }
1818 
1819     I.getOperand(1).setSubReg(SubRegIdx);
1820   }
1821 
1822   I.setDesc(TII.get(TargetOpcode::COPY));
1823   return true;
1824 }
1825 
1826 /// \returns true if a bitmask for \p Size bits will be an inline immediate.
1827 static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
1828   Mask = maskTrailingOnes<unsigned>(Size);
1829   int SignedMask = static_cast<int>(Mask);
1830   return SignedMask >= -16 && SignedMask <= 64;
1831 }
1832 
1833 // Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
1834 const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
1835   Register Reg, const MachineRegisterInfo &MRI,
1836   const TargetRegisterInfo &TRI) const {
1837   const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
1838   if (auto *RB = RegClassOrBank.dyn_cast<const RegisterBank *>())
1839     return RB;
1840 
1841   // Ignore the type, since we don't use vcc in artifacts.
1842   if (auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>())
1843     return &RBI.getRegBankFromRegClass(*RC, LLT());
1844   return nullptr;
1845 }
1846 
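// Select G_SEXT, G_ZEXT, G_ANYEXT and G_SEXT_INREG.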
1847 bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
1848   bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
1849   bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
1850   const DebugLoc &DL = I.getDebugLoc();
1851   MachineBasicBlock &MBB = *I.getParent();
1852   const Register DstReg = I.getOperand(0).getReg();
1853   const Register SrcReg = I.getOperand(1).getReg();
1854 
1855   const LLT DstTy = MRI->getType(DstReg);
1856   const LLT SrcTy = MRI->getType(SrcReg);
1857   const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
1858     I.getOperand(2).getImm() : SrcTy.getSizeInBits();
1859   const unsigned DstSize = DstTy.getSizeInBits();
1860   if (!DstTy.isScalar())
1861     return false;
1862 
1863   // Artifact casts should never use vcc.
1864   const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
1865 
1866   // FIXME: This should probably be illegal and split earlier.
1867   if (I.getOpcode() == AMDGPU::G_ANYEXT) {
1868     if (DstSize <= 32)
1869       return selectCOPY(I);
1870 
1871     const TargetRegisterClass *SrcRC =
1872         TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank, *MRI);
1873     const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1874     const TargetRegisterClass *DstRC =
1875         TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
1876 
1877     Register UndefReg = MRI->createVirtualRegister(SrcRC);
1878     BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
1879     BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
1880       .addReg(SrcReg)
1881       .addImm(AMDGPU::sub0)
1882       .addReg(UndefReg)
1883       .addImm(AMDGPU::sub1);
1884     I.eraseFromParent();
1885 
1886     return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
1887            RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
1888   }
1889 
1890   if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
1891     // 64-bit should have been split up in RegBankSelect
1892 
1893     // Try to use an and with a mask if it will save code size.
1894     unsigned Mask;
1895     if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
1896       MachineInstr *ExtI =
1897       BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
1898         .addImm(Mask)
1899         .addReg(SrcReg);
1900       I.eraseFromParent();
1901       return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
1902     }
1903 
1904     const unsigned BFE = Signed ? AMDGPU::V_BFE_I32 : AMDGPU::V_BFE_U32;
1905     MachineInstr *ExtI =
1906       BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
1907       .addReg(SrcReg)
1908       .addImm(0) // Offset
1909       .addImm(SrcSize); // Width
1910     I.eraseFromParent();
1911     return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
1912   }
1913 
1914   if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
1915     const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
1916       AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
1917     if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
1918       return false;
1919 
1920     if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
1921       const unsigned SextOpc = SrcSize == 8 ?
1922         AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
1923       BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
1924         .addReg(SrcReg);
1925       I.eraseFromParent();
1926       return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
1927     }
1928 
1929     const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
1930     const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
1931 
1932     // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16] = width.
1933     if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
1934       // We need a 64-bit register source, but the high bits don't matter.
1935       Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
1936       Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1937       unsigned SubReg = InReg ? AMDGPU::sub0 : 0;
1938 
1939       BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
1940       BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
1941         .addReg(SrcReg, 0, SubReg)
1942         .addImm(AMDGPU::sub0)
1943         .addReg(UndefReg)
1944         .addImm(AMDGPU::sub1);
1945 
1946       BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
1947         .addReg(ExtReg)
1948         .addImm(SrcSize << 16);
1949 
1950       I.eraseFromParent();
1951       return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
1952     }
1953 
1954     unsigned Mask;
1955     if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
1956       BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
1957         .addReg(SrcReg)
1958         .addImm(Mask);
1959     } else {
1960       BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
1961         .addReg(SrcReg)
1962         .addImm(SrcSize << 16);
1963     }
1964 
1965     I.eraseFromParent();
1966     return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
1967   }
1968 
1969   return false;
1970 }
1971 
1972 bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
1973   MachineBasicBlock *BB = I.getParent();
1974   MachineOperand &ImmOp = I.getOperand(1);
1975 
1976   // The AMDGPU backend only supports Imm operands and not CImm or FPImm.
1977   if (ImmOp.isFPImm()) {
1978     const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt();
1979     ImmOp.ChangeToImmediate(Imm.getZExtValue());
1980   } else if (ImmOp.isCImm()) {
1981     ImmOp.ChangeToImmediate(ImmOp.getCImm()->getSExtValue());
1982   }
1983 
1984   Register DstReg = I.getOperand(0).getReg();
1985   unsigned Size;
1986   bool IsSgpr;
1987   const RegisterBank *RB = MRI->getRegBankOrNull(I.getOperand(0).getReg());
1988   if (RB) {
1989     IsSgpr = RB->getID() == AMDGPU::SGPRRegBankID;
1990     Size = MRI->getType(DstReg).getSizeInBits();
1991   } else {
1992     const TargetRegisterClass *RC = TRI.getRegClassForReg(*MRI, DstReg);
1993     IsSgpr = TRI.isSGPRClass(RC);
1994     Size = TRI.getRegSizeInBits(*RC);
1995   }
1996 
1997   if (Size != 32 && Size != 64)
1998     return false;
1999 
2000   unsigned Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
2001   if (Size == 32) {
2002     I.setDesc(TII.get(Opcode));
2003     I.addImplicitDefUseOperands(*MF);
2004     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2005   }
2006 
2007   const DebugLoc &DL = I.getDebugLoc();
2008 
2009   APInt Imm(Size, I.getOperand(1).getImm());
2010 
2011   MachineInstr *ResInst;
2012   if (IsSgpr && TII.isInlineConstant(Imm)) {
2013     ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
2014       .addImm(I.getOperand(1).getImm());
2015   } else {
2016     const TargetRegisterClass *RC = IsSgpr ?
2017       &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass;
2018     Register LoReg = MRI->createVirtualRegister(RC);
2019     Register HiReg = MRI->createVirtualRegister(RC);
2020 
2021     BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg)
2022       .addImm(Imm.trunc(32).getZExtValue());
2023 
2024     BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg)
2025       .addImm(Imm.ashr(32).getZExtValue());
2026 
2027     ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2028       .addReg(LoReg)
2029       .addImm(AMDGPU::sub0)
2030       .addReg(HiReg)
2031       .addImm(AMDGPU::sub1);
2032   }
2033 
2034   // We can't call constrainSelectedInstRegOperands here, because it doesn't
2035   // work for target-independent opcodes.
2036   I.eraseFromParent();
2037   const TargetRegisterClass *DstRC =
2038     TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI);
2039   if (!DstRC)
2040     return true;
2041   return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI);
2042 }
2043 
2044 bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2045   // Only manually handle the f64 SGPR case.
2046   //
2047   // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2048   // the bit ops theoretically have a second result due to the implicit def of
2049   // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2050   // that is easy by disabling the check. The result works, but uses a
2051   // nonsensical sreg32orlds_and_sreg_1 regclass.
2052   //
2053   // The DAG emitter is more problematic, and incorrectly adds both results of
2054   // the S_XOR_B32 to the variadic REG_SEQUENCE operands.
2055 
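  // Expand by splitting the source into 32-bit halves, flipping (or setting,
  // when a fabs is folded in) the sign bit of the high half, and recombining
  // with a REG_SEQUENCE.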
2056   Register Dst = MI.getOperand(0).getReg();
2057   const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2058   if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2059       MRI->getType(Dst) != LLT::scalar(64))
2060     return false;
2061 
2062   Register Src = MI.getOperand(1).getReg();
2063   MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
2064   if (Fabs)
2065     Src = Fabs->getOperand(1).getReg();
2066 
2067   if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2068       !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2069     return false;
2070 
2071   MachineBasicBlock *BB = MI.getParent();
2072   const DebugLoc &DL = MI.getDebugLoc();
2073   Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2074   Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2075   Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2076   Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2077 
2078   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2079     .addReg(Src, 0, AMDGPU::sub0);
2080   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2081     .addReg(Src, 0, AMDGPU::sub1);
2082   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2083     .addImm(0x80000000);
2084 
2085   // Set or toggle sign bit.
2086   unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2087   BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
2088     .addReg(HiReg)
2089     .addReg(ConstReg);
2090   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2091     .addReg(LoReg)
2092     .addImm(AMDGPU::sub0)
2093     .addReg(OpReg)
2094     .addImm(AMDGPU::sub1);
2095   MI.eraseFromParent();
2096   return true;
2097 }
2098 
2099 // FIXME: This is a workaround for the same tablegen problems as G_FNEG
2100 bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2101   Register Dst = MI.getOperand(0).getReg();
2102   const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2103   if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2104       MRI->getType(Dst) != LLT::scalar(64))
2105     return false;
2106 
2107   Register Src = MI.getOperand(1).getReg();
2108   MachineBasicBlock *BB = MI.getParent();
2109   const DebugLoc &DL = MI.getDebugLoc();
2110   Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2111   Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2112   Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2113   Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2114 
2115   if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2116       !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2117     return false;
2118 
2119   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2120     .addReg(Src, 0, AMDGPU::sub0);
2121   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2122     .addReg(Src, 0, AMDGPU::sub1);
2123   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2124     .addImm(0x7fffffff);
2125 
2126   // Clear sign bit.
2127   // TODO: Should this use S_BITSET0_*?
2128   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
2129     .addReg(HiReg)
2130     .addReg(ConstReg);
2131   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2132     .addReg(LoReg)
2133     .addImm(AMDGPU::sub0)
2134     .addReg(OpReg)
2135     .addImm(AMDGPU::sub1);
2136 
2137   MI.eraseFromParent();
2138   return true;
2139 }
2140 
2141 static bool isConstant(const MachineInstr &MI) {
2142   return MI.getOpcode() == TargetOpcode::G_CONSTANT;
2143 }
2144 
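// Walk the chain of G_PTR_ADDs feeding the load's pointer operand, recording
// the SGPR parts, VGPR parts and constant offset at each step.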
2145 void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
2146     const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
2147 
2148   const MachineInstr *PtrMI = MRI.getUniqueVRegDef(Load.getOperand(1).getReg());
2149 
2150   assert(PtrMI);
2151 
2152   if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
2153     return;
2154 
2155   GEPInfo GEPInfo(*PtrMI);
2156 
2157   for (unsigned i = 1; i != 3; ++i) {
2158     const MachineOperand &GEPOp = PtrMI->getOperand(i);
2159     const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
2160     assert(OpDef);
2161     if (i == 2 && isConstant(*OpDef)) {
2162       // TODO: Could handle constant base + variable offset, but a combine
2163       // probably should have commuted it.
2164       assert(GEPInfo.Imm == 0);
2165       GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
2166       continue;
2167     }
2168     const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
2169     if (OpBank->getID() == AMDGPU::SGPRRegBankID)
2170       GEPInfo.SgprParts.push_back(GEPOp.getReg());
2171     else
2172       GEPInfo.VgprParts.push_back(GEPOp.getReg());
2173   }
2174 
2175   AddrInfo.push_back(GEPInfo);
2176   getAddrModeInfo(*PtrMI, MRI, AddrInfo);
2177 }
2178 
2179 bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
2180   if (!MI.hasOneMemOperand())
2181     return false;
2182 
2183   const MachineMemOperand *MMO = *MI.memoperands_begin();
2184   const Value *Ptr = MMO->getValue();
2185 
2186   // UndefValue means this is a load of a kernel input.  These are uniform.
2187   // Sometimes LDS instructions have constant pointers.
2188   // If Ptr is null, then that means this mem operand contains a
2189   // PseudoSourceValue like GOT.
2190   if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
2191       isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
2192     return true;
2193 
2194   if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
2195     return true;
2196 
2197   const Instruction *I = dyn_cast<Instruction>(Ptr);
2198   return I && I->getMetadata("amdgpu.uniform");
2199 }
2200 
2201 bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
2202   for (const GEPInfo &GEPInfo : AddrInfo) {
2203     if (!GEPInfo.VgprParts.empty())
2204       return true;
2205   }
2206   return false;
2207 }
2208 
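// If the instruction accesses LDS or GDS on a subtarget that requires M0 to be
// initialized, emit the M0 setup before it.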
2209 void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
2210   MachineBasicBlock *BB = I.getParent();
2211 
2212   const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
2213   unsigned AS = PtrTy.getAddressSpace();
2214   if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
2215       STI.ldsRequiresM0Init()) {
2216     // If DS instructions require M0 initialization, insert it before selecting.
2217     BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2218       .addImm(-1);
2219   }
2220 }
2221 
2222 bool AMDGPUInstructionSelector::selectG_LOAD_ATOMICRMW(MachineInstr &I) const {
2223   initM0(I);
2224   return selectImpl(I, *CoverageInfo);
2225 }
2226 
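// Manually select BUFFER_ATOMIC_CMPSWAP for the MUBUF offset and addr64
// addressing modes; flat (or flat-for-global) accesses go through the imported
// TableGen patterns.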
2227 // TODO: No rtn optimization.
2228 bool AMDGPUInstructionSelector::selectG_AMDGPU_ATOMIC_CMPXCHG(
2229   MachineInstr &MI) const {
2230   Register PtrReg = MI.getOperand(1).getReg();
2231   const LLT PtrTy = MRI->getType(PtrReg);
2232   if (PtrTy.getAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
2233       STI.useFlatForGlobal())
2234     return selectImpl(MI, *CoverageInfo);
2235 
2236   Register DstReg = MI.getOperand(0).getReg();
2237   const LLT Ty = MRI->getType(DstReg);
2238   const bool Is64 = Ty.getSizeInBits() == 64;
2239   const unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
2240   Register TmpReg = MRI->createVirtualRegister(
2241     Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
2242 
2243   const DebugLoc &DL = MI.getDebugLoc();
2244   MachineBasicBlock *BB = MI.getParent();
2245 
2246   Register VAddr, RSrcReg, SOffset;
2247   int64_t Offset = 0;
2248 
2249   unsigned Opcode;
2250   if (selectMUBUFOffsetImpl(MI.getOperand(1), RSrcReg, SOffset, Offset)) {
2251     Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN :
2252                              AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN;
2253   } else if (selectMUBUFAddr64Impl(MI.getOperand(1), VAddr,
2254                                    RSrcReg, SOffset, Offset)) {
2255     Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN :
2256                     AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN;
2257   } else
2258     return selectImpl(MI, *CoverageInfo);
2259 
2260   auto MIB = BuildMI(*BB, &MI, DL, TII.get(Opcode), TmpReg)
2261     .addReg(MI.getOperand(2).getReg());
2262 
2263   if (VAddr)
2264     MIB.addReg(VAddr);
2265 
2266   MIB.addReg(RSrcReg);
2267   if (SOffset)
2268     MIB.addReg(SOffset);
2269   else
2270     MIB.addImm(0);
2271 
2272   MIB.addImm(Offset);
2273   MIB.addImm(0); // slc
2274   MIB.cloneMemRefs(MI);
2275 
2276   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), DstReg)
2277     .addReg(TmpReg, RegState::Kill, SubReg);
2278 
2279   MI.eraseFromParent();
2280 
2281   MRI->setRegClass(
2282     DstReg, Is64 ? &AMDGPU::VReg_64RegClass : &AMDGPU::VGPR_32RegClass);
2283   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2284 }
2285 
2286 bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
2287   MachineBasicBlock *BB = I.getParent();
2288   MachineOperand &CondOp = I.getOperand(0);
2289   Register CondReg = CondOp.getReg();
2290   const DebugLoc &DL = I.getDebugLoc();
2291 
2292   unsigned BrOpcode;
2293   Register CondPhysReg;
2294   const TargetRegisterClass *ConstrainRC;
2295 
2296   // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
2297   // whether the branch is uniform when selecting the instruction. In
2298   // GlobalISel, we should push that decision into RegBankSelect. Assume for now
2299   // RegBankSelect knows what it's doing if the branch condition is scc, even
2300   // though it currently does not.
2301   if (!isVCC(CondReg, *MRI)) {
2302     if (MRI->getType(CondReg) != LLT::scalar(32))
2303       return false;
2304 
2305     CondPhysReg = AMDGPU::SCC;
2306     BrOpcode = AMDGPU::S_CBRANCH_SCC1;
2307     ConstrainRC = &AMDGPU::SReg_32RegClass;
2308   } else {
2309     // FIXME: Do we have to insert an and with exec here, like in SelectionDAG?
2310     // Based on the register bank, we sort of know that a VCC producer ands
2311     // inactive lanes with 0. What if there was a logical operation with vcc
2312     // producers in different blocks/with different exec masks?
2313     // FIXME: Should scc->vcc copies and with exec?
2314     CondPhysReg = TRI.getVCC();
2315     BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
2316     ConstrainRC = TRI.getBoolRC();
2317   }
2318 
2319   if (!MRI->getRegClassOrNull(CondReg))
2320     MRI->setRegClass(CondReg, ConstrainRC);
2321 
2322   BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
2323     .addReg(CondReg);
2324   BuildMI(*BB, &I, DL, TII.get(BrOpcode))
2325     .addMBB(I.getOperand(1).getMBB());
2326 
2327   I.eraseFromParent();
2328   return true;
2329 }
2330 
2331 bool AMDGPUInstructionSelector::selectG_FRAME_INDEX_GLOBAL_VALUE(
2332   MachineInstr &I) const {
2333   Register DstReg = I.getOperand(0).getReg();
2334   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2335   const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2336   I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
2337   if (IsVGPR)
2338     I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
2339 
2340   return RBI.constrainGenericRegister(
2341     DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
2342 }
2343 
2344 bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
2345   Register DstReg = I.getOperand(0).getReg();
2346   Register SrcReg = I.getOperand(1).getReg();
2347   Register MaskReg = I.getOperand(2).getReg();
2348   LLT Ty = MRI->getType(DstReg);
2349   LLT MaskTy = MRI->getType(MaskReg);
2350 
2351   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2352   const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2353   const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
2354   const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2355   if (DstRB != SrcRB) // Should only happen for hand written MIR.
2356     return false;
2357 
2358   unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2359   const TargetRegisterClass &RegRC
2360     = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
2361 
2362   const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB,
2363                                                                   *MRI);
2364   const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB,
2365                                                                   *MRI);
2366   const TargetRegisterClass *MaskRC =
2367       TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB, *MRI);
2368 
2369   if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
2370       !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2371       !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
2372     return false;
2373 
2374   MachineBasicBlock *BB = I.getParent();
2375   const DebugLoc &DL = I.getDebugLoc();
2376   if (Ty.getSizeInBits() == 32) {
2377     assert(MaskTy.getSizeInBits() == 32 &&
2378            "ptrmask should have been narrowed during legalize");
2379 
2380     BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
2381       .addReg(SrcReg)
2382       .addReg(MaskReg);
2383     I.eraseFromParent();
2384     return true;
2385   }
2386 
2387   Register HiReg = MRI->createVirtualRegister(&RegRC);
2388   Register LoReg = MRI->createVirtualRegister(&RegRC);
2389 
2390   // Extract the subregisters from the source pointer.
2391   BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
2392     .addReg(SrcReg, 0, AMDGPU::sub0);
2393   BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
2394     .addReg(SrcReg, 0, AMDGPU::sub1);
2395 
2396   Register MaskedLo, MaskedHi;
2397 
2398   // Try to avoid emitting a bit operation when we only need to touch half of
2399   // the 64-bit pointer.
2400   APInt MaskOnes = KnownBits->getKnownOnes(MaskReg).zextOrSelf(64);
2401 
2402   const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
2403   const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
2404   if ((MaskOnes & MaskLo32) == MaskLo32) {
2405     // If all the bits in the low half are 1, we only need a copy for it.
2406     MaskedLo = LoReg;
2407   } else {
2408     // Extract the mask subregister and apply the and.
2409     Register MaskLo = MRI->createVirtualRegister(&RegRC);
2410     MaskedLo = MRI->createVirtualRegister(&RegRC);
2411 
2412     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
2413       .addReg(MaskReg, 0, AMDGPU::sub0);
2414     BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
2415       .addReg(LoReg)
2416       .addReg(MaskLo);
2417   }
2418 
2419   if ((MaskOnes & MaskHi32) == MaskHi32) {
2420     // If all the bits in the high half are 1, we only need a copy for it.
2421     MaskedHi = HiReg;
2422   } else {
2423     Register MaskHi = MRI->createVirtualRegister(&RegRC);
2424     MaskedHi = MRI->createVirtualRegister(&RegRC);
2425 
2426     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
2427       .addReg(MaskReg, 0, AMDGPU::sub1);
2428     BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
2429       .addReg(HiReg)
2430       .addReg(MaskHi);
2431   }
2432 
2433   BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2434     .addReg(MaskedLo)
2435     .addImm(AMDGPU::sub0)
2436     .addReg(MaskedHi)
2437     .addImm(AMDGPU::sub1);
2438   I.eraseFromParent();
2439   return true;
2440 }
2441 
2442 /// Return the register to use for the index value, and the subregister to use
2443 /// for the indirectly accessed register.
2444 static std::pair<Register, unsigned>
2445 computeIndirectRegIndex(MachineRegisterInfo &MRI,
2446                         const SIRegisterInfo &TRI,
2447                         const TargetRegisterClass *SuperRC,
2448                         Register IdxReg,
2449                         unsigned EltSize) {
2450   Register IdxBaseReg;
2451   int Offset;
2452   MachineInstr *Unused;
2453 
2454   std::tie(IdxBaseReg, Offset, Unused)
2455     = AMDGPU::getBaseWithConstantOffset(MRI, IdxReg);
2456   if (IdxBaseReg == AMDGPU::NoRegister) {
2457     // This will happen if the index is a known constant. This should ordinarily
2458     // be legalized out, but handle it as a register just in case.
2459     assert(Offset == 0);
2460     IdxBaseReg = IdxReg;
2461   }
2462 
2463   ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
2464 
2465   // Skip out of bounds offsets, or else we would end up using an undefined
2466   // register.
2467   if (static_cast<unsigned>(Offset) >= SubRegs.size())
2468     return std::make_pair(IdxReg, SubRegs[0]);
2469   return std::make_pair(IdxBaseReg, SubRegs[Offset]);
2470 }
2471 
2472 bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
2473   MachineInstr &MI) const {
2474   Register DstReg = MI.getOperand(0).getReg();
2475   Register SrcReg = MI.getOperand(1).getReg();
2476   Register IdxReg = MI.getOperand(2).getReg();
2477 
2478   LLT DstTy = MRI->getType(DstReg);
2479   LLT SrcTy = MRI->getType(SrcReg);
2480 
2481   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2482   const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2483   const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
2484 
2485   // The index must be scalar. If it wasn't, RegBankSelect should have moved
2486   // this into a waterfall loop.
2487   if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
2488     return false;
2489 
2490   const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB,
2491                                                                   *MRI);
2492   const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(DstTy, *DstRB,
2493                                                                   *MRI);
2494   if (!SrcRC || !DstRC)
2495     return false;
2496   if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2497       !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
2498       !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
2499     return false;
2500 
2501   MachineBasicBlock *BB = MI.getParent();
2502   const DebugLoc &DL = MI.getDebugLoc();
2503   const bool Is64 = DstTy.getSizeInBits() == 64;
2504 
2505   unsigned SubReg;
2506   std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, SrcRC, IdxReg,
2507                                                      DstTy.getSizeInBits() / 8);
2508 
2509   if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
2510     if (DstTy.getSizeInBits() != 32 && !Is64)
2511       return false;
2512 
2513     BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2514       .addReg(IdxReg);
2515 
2516     unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
2517     BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
2518       .addReg(SrcReg, 0, SubReg)
2519       .addReg(SrcReg, RegState::Implicit);
2520     MI.eraseFromParent();
2521     return true;
2522   }
2523 
2524   if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
2525     return false;
2526 
2527   if (!STI.useVGPRIndexMode()) {
2528     BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2529       .addReg(IdxReg);
2530     BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
2531       .addReg(SrcReg, 0, SubReg)
2532       .addReg(SrcReg, RegState::Implicit);
2533     MI.eraseFromParent();
2534     return true;
2535   }
2536 
2537   BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_ON))
2538     .addReg(IdxReg)
2539     .addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE);
2540   BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), DstReg)
2541     .addReg(SrcReg, 0, SubReg)
2542     .addReg(SrcReg, RegState::Implicit)
2543     .addReg(AMDGPU::M0, RegState::Implicit);
2544   BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_OFF));
2545 
2546   MI.eraseFromParent();
2547   return true;
2548 }
2549 
2550 // TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
2551 bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
2552   MachineInstr &MI) const {
2553   Register DstReg = MI.getOperand(0).getReg();
2554   Register VecReg = MI.getOperand(1).getReg();
2555   Register ValReg = MI.getOperand(2).getReg();
2556   Register IdxReg = MI.getOperand(3).getReg();
2557 
2558   LLT VecTy = MRI->getType(DstReg);
2559   LLT ValTy = MRI->getType(ValReg);
2560   unsigned VecSize = VecTy.getSizeInBits();
2561   unsigned ValSize = ValTy.getSizeInBits();
2562 
2563   const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
2564   const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
2565   const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
2566 
2567   assert(VecTy.getElementType() == ValTy);
2568 
2569   // The index must be scalar. If it wasn't, RegBankSelect should have moved
2570   // this into a waterfall loop.
2571   if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
2572     return false;
2573 
2574   const TargetRegisterClass *VecRC = TRI.getRegClassForTypeOnBank(VecTy, *VecRB,
2575                                                                   *MRI);
2576   const TargetRegisterClass *ValRC = TRI.getRegClassForTypeOnBank(ValTy, *ValRB,
2577                                                                   *MRI);
2578 
2579   if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
2580       !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
2581       !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
2582       !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
2583     return false;
2584 
2585   if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
2586     return false;
2587 
2588   unsigned SubReg;
2589   std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg,
2590                                                      ValSize / 8);
2591 
2592   const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
2593                          STI.useVGPRIndexMode();
2594 
2595   MachineBasicBlock *BB = MI.getParent();
2596   const DebugLoc &DL = MI.getDebugLoc();
2597 
2598   if (IndexMode) {
2599     BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_ON))
2600       .addReg(IdxReg)
2601       .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE);
2602   } else {
2603     BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2604       .addReg(IdxReg);
2605   }
2606 
2607   const MCInstrDesc &RegWriteOp
2608     = TII.getIndirectRegWritePseudo(VecSize, ValSize,
2609                                     VecRB->getID() == AMDGPU::SGPRRegBankID);
2610   BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
2611     .addReg(VecReg)
2612     .addReg(ValReg)
2613     .addImm(SubReg);
2614 
2615   if (IndexMode)
2616     BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_OFF));
2617 
2618   MI.eraseFromParent();
2619   return true;
2620 }
2621 
2622 static bool isZeroOrUndef(int X) {
2623   return X == 0 || X == -1;
2624 }
2625 
2626 static bool isOneOrUndef(int X) {
2627   return X == 1 || X == -1;
2628 }
2629 
2630 static bool isZeroOrOneOrUndef(int X) {
2631   return X == 0 || X == 1 || X == -1;
2632 }
2633 
2634 // Normalize a VOP3P shuffle mask to refer to the low/high half of a single
2635 // 32-bit register.
2636 static Register normalizeVOP3PMask(int NewMask[2], Register Src0, Register Src1,
2637                                    ArrayRef<int> Mask) {
2638   NewMask[0] = Mask[0];
2639   NewMask[1] = Mask[1];
2640   if (isZeroOrOneOrUndef(Mask[0]) && isZeroOrOneOrUndef(Mask[1]))
2641     return Src0;
2642 
2643   assert(NewMask[0] == 2 || NewMask[0] == 3 || NewMask[0] == -1);
2644   assert(NewMask[1] == 2 || NewMask[1] == 3 || NewMask[1] == -1);
2645 
2646   // Shift the mask inputs to be 0/1, now referring to elements of Src1.
2647   NewMask[0] = NewMask[0] == -1 ? -1 : NewMask[0] - 2;
2648   NewMask[1] = NewMask[1] == -1 ? -1 : NewMask[1] - 2;
2649   return Src1;
2650 }
2651 
2652 // This is only legal with VOP3P instructions as an aid to op_sel matching.
2653 bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR(
2654   MachineInstr &MI) const {
2655   Register DstReg = MI.getOperand(0).getReg();
2656   Register Src0Reg = MI.getOperand(1).getReg();
2657   Register Src1Reg = MI.getOperand(2).getReg();
2658   ArrayRef<int> ShufMask = MI.getOperand(3).getShuffleMask();
2659 
2660   const LLT V2S16 = LLT::vector(2, 16);
2661   if (MRI->getType(DstReg) != V2S16 || MRI->getType(Src0Reg) != V2S16)
2662     return false;
2663 
2664   if (!AMDGPU::isLegalVOP3PShuffleMask(ShufMask))
2665     return false;
2666 
2667   assert(ShufMask.size() == 2);
2668   assert(STI.hasSDWA() && "no target has VOP3P but not SDWA");
2669 
2670   MachineBasicBlock *MBB = MI.getParent();
2671   const DebugLoc &DL = MI.getDebugLoc();
2672 
2673   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2674   const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2675   const TargetRegisterClass &RC = IsVALU ?
2676     AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
2677 
2678   // Handle the degenerate case which should have been folded out.
2679   if (ShufMask[0] == -1 && ShufMask[1] == -1) {
2680     BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), DstReg);
2681 
2682     MI.eraseFromParent();
2683     return RBI.constrainGenericRegister(DstReg, RC, *MRI);
2684   }
2685 
2686   // A legal VOP3P mask only reads one of the sources.
2687   int Mask[2];
2688   Register SrcVec = normalizeVOP3PMask(Mask, Src0Reg, Src1Reg, ShufMask);
2689 
2690   if (!RBI.constrainGenericRegister(DstReg, RC, *MRI) ||
2691       !RBI.constrainGenericRegister(SrcVec, RC, *MRI))
2692     return false;
2693 
2694   // TODO: This also should have been folded out
2695   if (isZeroOrUndef(Mask[0]) && isOneOrUndef(Mask[1])) {
2696     BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), DstReg)
2697       .addReg(SrcVec);
2698 
2699     MI.eraseFromParent();
2700     return true;
2701   }
2702 
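  // The remaining legal masks are handled case by case: <1,u> is a 16-bit
  // right shift, <u,0> a 16-bit left shift, <0,0> and <1,1> broadcast one
  // half, and <1,0> swaps the two halves.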
2703   if (Mask[0] == 1 && Mask[1] == -1) {
2704     if (IsVALU) {
2705       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
2706         .addImm(16)
2707         .addReg(SrcVec);
2708     } else {
2709       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
2710         .addReg(SrcVec)
2711         .addImm(16);
2712     }
2713   } else if (Mask[0] == -1 && Mask[1] == 0) {
2714     if (IsVALU) {
2715       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), DstReg)
2716         .addImm(16)
2717         .addReg(SrcVec);
2718     } else {
2719       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHL_B32), DstReg)
2720         .addReg(SrcVec)
2721         .addImm(16);
2722     }
2723   } else if (Mask[0] == 0 && Mask[1] == 0) {
2724     if (IsVALU) {
2725       // Write low half of the register into the high half.
2726       MachineInstr *MovSDWA =
2727         BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2728         .addImm(0)                             // $src0_modifiers
2729         .addReg(SrcVec)                        // $src0
2730         .addImm(0)                             // $clamp
2731         .addImm(AMDGPU::SDWA::WORD_1)          // $dst_sel
2732         .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2733         .addImm(AMDGPU::SDWA::WORD_0)          // $src0_sel
2734         .addReg(SrcVec, RegState::Implicit);
2735       MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2736     } else {
2737       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg)
2738         .addReg(SrcVec)
2739         .addReg(SrcVec);
2740     }
2741   } else if (Mask[0] == 1 && Mask[1] == 1) {
2742     if (IsVALU) {
2743       // Write high half of the register into the low half.
2744       MachineInstr *MovSDWA =
2745         BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2746         .addImm(0)                             // $src0_modifiers
2747         .addReg(SrcVec)                        // $src0
2748         .addImm(0)                             // $clamp
2749         .addImm(AMDGPU::SDWA::WORD_0)          // $dst_sel
2750         .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2751         .addImm(AMDGPU::SDWA::WORD_1)          // $src0_sel
2752         .addReg(SrcVec, RegState::Implicit);
2753       MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2754     } else {
2755       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_HH_B32_B16), DstReg)
2756         .addReg(SrcVec)
2757         .addReg(SrcVec);
2758     }
2759   } else if (Mask[0] == 1 && Mask[1] == 0) {
2760     if (IsVALU) {
2761       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_ALIGNBIT_B32), DstReg)
2762         .addReg(SrcVec)
2763         .addReg(SrcVec)
2764         .addImm(16);
2765     } else {
2766       Register TmpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2767       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg)
2768         .addReg(SrcVec)
2769         .addImm(16);
2770       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg)
2771         .addReg(TmpReg)
2772         .addReg(SrcVec);
2773     }
2774   } else
2775     llvm_unreachable("all shuffle masks should be handled");
2776 
2777   MI.eraseFromParent();
2778   return true;
2779 }
2780 
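// Entry point for instruction selection: dispatch on the generic opcode and
// either select the instruction manually or defer to the TableGen'd
// selectImpl.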
2781 bool AMDGPUInstructionSelector::select(MachineInstr &I) {
2782   if (I.isPHI())
2783     return selectPHI(I);
2784 
2785   if (!I.isPreISelOpcode()) {
2786     if (I.isCopy())
2787       return selectCOPY(I);
2788     return true;
2789   }
2790 
2791   switch (I.getOpcode()) {
2792   case TargetOpcode::G_AND:
2793   case TargetOpcode::G_OR:
2794   case TargetOpcode::G_XOR:
2795     if (selectImpl(I, *CoverageInfo))
2796       return true;
2797     return selectG_AND_OR_XOR(I);
2798   case TargetOpcode::G_ADD:
2799   case TargetOpcode::G_SUB:
2800     if (selectImpl(I, *CoverageInfo))
2801       return true;
2802     return selectG_ADD_SUB(I);
2803   case TargetOpcode::G_UADDO:
2804   case TargetOpcode::G_USUBO:
2805   case TargetOpcode::G_UADDE:
2806   case TargetOpcode::G_USUBE:
2807     return selectG_UADDO_USUBO_UADDE_USUBE(I);
2808   case TargetOpcode::G_INTTOPTR:
2809   case TargetOpcode::G_BITCAST:
2810   case TargetOpcode::G_PTRTOINT:
2811     return selectCOPY(I);
2812   case TargetOpcode::G_CONSTANT:
2813   case TargetOpcode::G_FCONSTANT:
2814     return selectG_CONSTANT(I);
2815   case TargetOpcode::G_FNEG:
2816     if (selectImpl(I, *CoverageInfo))
2817       return true;
2818     return selectG_FNEG(I);
2819   case TargetOpcode::G_FABS:
2820     if (selectImpl(I, *CoverageInfo))
2821       return true;
2822     return selectG_FABS(I);
2823   case TargetOpcode::G_EXTRACT:
2824     return selectG_EXTRACT(I);
2825   case TargetOpcode::G_MERGE_VALUES:
2826   case TargetOpcode::G_BUILD_VECTOR:
2827   case TargetOpcode::G_CONCAT_VECTORS:
2828     return selectG_MERGE_VALUES(I);
2829   case TargetOpcode::G_UNMERGE_VALUES:
2830     return selectG_UNMERGE_VALUES(I);
2831   case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2832     return selectG_BUILD_VECTOR_TRUNC(I);
2833   case TargetOpcode::G_PTR_ADD:
2834     return selectG_PTR_ADD(I);
2835   case TargetOpcode::G_IMPLICIT_DEF:
2836     return selectG_IMPLICIT_DEF(I);
2837   case TargetOpcode::G_FREEZE:
2838     return selectCOPY(I);
2839   case TargetOpcode::G_INSERT:
2840     return selectG_INSERT(I);
2841   case TargetOpcode::G_INTRINSIC:
2842     return selectG_INTRINSIC(I);
2843   case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
2844     return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
2845   case TargetOpcode::G_ICMP:
2846     if (selectG_ICMP(I))
2847       return true;
2848     return selectImpl(I, *CoverageInfo);
2849   case TargetOpcode::G_LOAD:
2850   case TargetOpcode::G_ATOMIC_CMPXCHG:
2851   case TargetOpcode::G_ATOMICRMW_XCHG:
2852   case TargetOpcode::G_ATOMICRMW_ADD:
2853   case TargetOpcode::G_ATOMICRMW_SUB:
2854   case TargetOpcode::G_ATOMICRMW_AND:
2855   case TargetOpcode::G_ATOMICRMW_OR:
2856   case TargetOpcode::G_ATOMICRMW_XOR:
2857   case TargetOpcode::G_ATOMICRMW_MIN:
2858   case TargetOpcode::G_ATOMICRMW_MAX:
2859   case TargetOpcode::G_ATOMICRMW_UMIN:
2860   case TargetOpcode::G_ATOMICRMW_UMAX:
2861   case TargetOpcode::G_ATOMICRMW_FADD:
2862   case AMDGPU::G_AMDGPU_ATOMIC_INC:
2863   case AMDGPU::G_AMDGPU_ATOMIC_DEC:
2864     return selectG_LOAD_ATOMICRMW(I);
2865   case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
2866     return selectG_AMDGPU_ATOMIC_CMPXCHG(I);
2867   case TargetOpcode::G_SELECT:
2868     return selectG_SELECT(I);
2869   case TargetOpcode::G_STORE:
2870     return selectG_STORE(I);
2871   case TargetOpcode::G_TRUNC:
2872     return selectG_TRUNC(I);
2873   case TargetOpcode::G_SEXT:
2874   case TargetOpcode::G_ZEXT:
2875   case TargetOpcode::G_ANYEXT:
2876   case TargetOpcode::G_SEXT_INREG:
2877     if (selectImpl(I, *CoverageInfo))
2878       return true;
2879     return selectG_SZA_EXT(I);
2880   case TargetOpcode::G_BRCOND:
2881     return selectG_BRCOND(I);
2882   case TargetOpcode::G_FRAME_INDEX:
2883   case TargetOpcode::G_GLOBAL_VALUE:
2884     return selectG_FRAME_INDEX_GLOBAL_VALUE(I);
2885   case TargetOpcode::G_PTRMASK:
2886     return selectG_PTRMASK(I);
2887   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2888     return selectG_EXTRACT_VECTOR_ELT(I);
2889   case TargetOpcode::G_INSERT_VECTOR_ELT:
2890     return selectG_INSERT_VECTOR_ELT(I);
2891   case TargetOpcode::G_SHUFFLE_VECTOR:
2892     return selectG_SHUFFLE_VECTOR(I);
2893   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
2894   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
2895     const AMDGPU::ImageDimIntrinsicInfo *Intr
2896       = AMDGPU::getImageDimIntrinsicInfo(I.getIntrinsicID());
2897     assert(Intr && "not an image intrinsic with image pseudo");
2898     return selectImageIntrinsic(I, Intr);
2899   }
2900   default:
2901     return selectImpl(I, *CoverageInfo);
2902   }
2903   return false;
2904 }
2905 
2906 InstructionSelector::ComplexRendererFns
2907 AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
2908   return {{
2909       [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
2910   }};
2912 }
2913 
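// Strip fneg/fabs from the source feeding Root and translate them into VOP3
// source modifier bits, returning the underlying register and the modifiers.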
2914 std::pair<Register, unsigned>
2915 AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root) const {
2916   Register Src = Root.getReg();
2917   Register OrigSrc = Src;
2918   unsigned Mods = 0;
2919   MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
2920 
2921   if (MI && MI->getOpcode() == AMDGPU::G_FNEG) {
2922     Src = MI->getOperand(1).getReg();
2923     Mods |= SISrcMods::NEG;
2924     MI = getDefIgnoringCopies(Src, *MRI);
2925   }
2926 
2927   if (MI && MI->getOpcode() == AMDGPU::G_FABS) {
2928     Src = MI->getOperand(1).getReg();
2929     Mods |= SISrcMods::ABS;
2930   }
2931 
2932   if (Mods != 0 &&
2933       RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
2934     MachineInstr *UseMI = Root.getParent();
2935 
2936     // If we looked through copies to find source modifiers on an SGPR operand,
2937     // we now have an SGPR register source. To avoid potentially violating the
2938     // constant bus restriction, we need to insert a copy to a VGPR.
2939     Register VGPRSrc = MRI->cloneVirtualRegister(OrigSrc);
2940     BuildMI(*UseMI->getParent(), UseMI, UseMI->getDebugLoc(),
2941             TII.get(AMDGPU::COPY), VGPRSrc)
2942       .addReg(Src);
2943     Src = VGPRSrc;
2944   }
2945 
2946   return std::make_pair(Src, Mods);
2947 }
2948 
2950 /// This will select either an SGPR or VGPR operand and will save us from
2951 /// having to write an extra tablegen pattern.
2952 InstructionSelector::ComplexRendererFns
2953 AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
2954   return {{
2955       [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
2956   }};
2957 }
2958 
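/// Select a VOP3 source with neg/abs modifiers plus the default clamp and
/// omod operands (both zero).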
2959 InstructionSelector::ComplexRendererFns
2960 AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
2961   Register Src;
2962   unsigned Mods;
2963   std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
2964 
2965   return {{
2966       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
2967       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
2968       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },    // clamp
2969       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }     // omod
2970   }};
2971 }
2972 
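/// Render the root operand followed by the default clamp and omod immediates
/// (both zero).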
2973 InstructionSelector::ComplexRendererFns
2974 AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
2975   return {{
2976       [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
2977       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
2978       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // omod
2979   }};
2980 }
2981 
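/// Select a VOP3 source, folding neg/abs modifiers from G_FNEG/G_FABS.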
2982 InstructionSelector::ComplexRendererFns
2983 AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
2984   Register Src;
2985   unsigned Mods;
2986   std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
2987 
2988   return {{
2989       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
2990       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }  // src_mods
2991   }};
2992 }
2993 
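/// Select a source only if it has no source modifiers; fails if the value is
/// defined by G_FNEG or G_FABS.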
2994 InstructionSelector::ComplexRendererFns
2995 AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
2996   Register Reg = Root.getReg();
2997   const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
2998   if (Def && (Def->getOpcode() == AMDGPU::G_FNEG ||
2999               Def->getOpcode() == AMDGPU::G_FABS))
3000     return {};
3001   return {{
3002       [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3003   }};
3004 }
3005 
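/// Compute packed (VOP3P) source modifiers. Only a v2f16 G_FNEG is folded;
/// packed instructions have no abs modifier, and OP_SEL_1 is always set.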
3006 std::pair<Register, unsigned>
3007 AMDGPUInstructionSelector::selectVOP3PModsImpl(
3008   Register Src, const MachineRegisterInfo &MRI) const {
3009   unsigned Mods = 0;
3010   MachineInstr *MI = MRI.getVRegDef(Src);
3011 
3012   if (MI && MI->getOpcode() == AMDGPU::G_FNEG &&
3013       // It's possible to see an f32 fneg here, but unlikely.
3014       // TODO: Treat f32 fneg as only high bit.
3015       MRI.getType(Src) == LLT::vector(2, 16)) {
3016     Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
3017     Src = MI->getOperand(1).getReg();
3018     MI = MRI.getVRegDef(Src);
3019   }
3020 
3021   // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
3022 
3023   // Packed instructions do not have abs modifiers.
3024   Mods |= SISrcMods::OP_SEL_1;
3025 
3026   return std::make_pair(Src, Mods);
3027 }
3028 
3029 InstructionSelector::ComplexRendererFns
3030 AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
3031   MachineRegisterInfo &MRI
3032     = Root.getParent()->getParent()->getParent()->getRegInfo();
3033 
3034   Register Src;
3035   unsigned Mods;
3036   std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI);
3037 
3038   return {{
3039       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3040       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }  // src_mods
3041   }};
3042 }
3043 
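/// Like selectVOP3Mods, but only matches if the source is known never to be
/// a NaN (or NoNaNsFPMath is enabled).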
3044 InstructionSelector::ComplexRendererFns
3045 AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const {
3046   Register Src;
3047   unsigned Mods;
3048   std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3049   if (!TM.Options.NoNaNsFPMath && !isKnownNeverNaN(Src, *MRI))
3050     return None;
3051 
3052   return {{
3053       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3054       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }  // src_mods
3055   }};
3056 }
3057 
3058 InstructionSelector::ComplexRendererFns
3059 AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
3060   // FIXME: Handle op_sel
3061   return {{
3062       [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
3063       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // src_mods
3064   }};
3065 }
3066 
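/// Match an SMRD address as a base SGPR plus an immediate offset that can be
/// encoded directly in the instruction.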
3067 InstructionSelector::ComplexRendererFns
3068 AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
3069   SmallVector<GEPInfo, 4> AddrInfo;
3070   getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
3071 
3072   if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3073     return None;
3074 
3075   const GEPInfo &GEPInfo = AddrInfo[0];
3076   Optional<int64_t> EncodedImm =
3077       AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm, false);
3078   if (!EncodedImm)
3079     return None;
3080 
3081   unsigned PtrReg = GEPInfo.SgprParts[0];
3082   return {{
3083     [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3084     [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
3085   }};
3086 }
3087 
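/// Match an SMRD address as a base SGPR plus an offset encoded as a 32-bit
/// literal.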
3088 InstructionSelector::ComplexRendererFns
3089 AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
3090   SmallVector<GEPInfo, 4> AddrInfo;
3091   getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
3092 
3093   if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3094     return None;
3095 
3096   const GEPInfo &GEPInfo = AddrInfo[0];
3097   Register PtrReg = GEPInfo.SgprParts[0];
3098   Optional<int64_t> EncodedImm =
3099       AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
3100   if (!EncodedImm)
3101     return None;
3102 
3103   return {{
3104     [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3105     [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
3106   }};
3107 }
3108 
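/// Match an SMRD address as a base SGPR plus an offset that is materialized
/// into a separate SGPR with S_MOV_B32.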
3109 InstructionSelector::ComplexRendererFns
3110 AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
3111   MachineInstr *MI = Root.getParent();
3112   MachineBasicBlock *MBB = MI->getParent();
3113 
3114   SmallVector<GEPInfo, 4> AddrInfo;
3115   getAddrModeInfo(*MI, *MRI, AddrInfo);
3116 
3117   // FIXME: We should shrink the GEP if the offset is known to fit in 32 bits,
3118   // then we can select all ptr + 32-bit offsets, not just immediate offsets.
3119   if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3120     return None;
3121 
3122   const GEPInfo &GEPInfo = AddrInfo[0];
3123   // SGPR offset is unsigned.
3124   if (!GEPInfo.Imm || GEPInfo.Imm < 0 || !isUInt<32>(GEPInfo.Imm))
3125     return None;
3126 
3127   // If we make it this far we have a load with a 32-bit immediate offset.
3128   // It is OK to select this using an SGPR offset, because we have already
3129   // failed trying to select this load into one of the _IMM variants since
3130   // the _IMM patterns are considered before the _SGPR patterns.
3131   Register PtrReg = GEPInfo.SgprParts[0];
3132   Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3133   BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg)
3134           .addImm(GEPInfo.Imm);
3135   return {{
3136     [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3137     [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); }
3138   }};
3139 }
3140 
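/// Match a flat address as (base + immediate offset) when the subtarget
/// supports FLAT instruction offsets and the offset is legal for the address
/// space; otherwise fall back to the plain address with a zero offset.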
3141 template <bool Signed>
3142 InstructionSelector::ComplexRendererFns
3143 AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const {
3144   MachineInstr *MI = Root.getParent();
3145 
3146   InstructionSelector::ComplexRendererFns Default = {{
3147       [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
3148       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },  // offset
3149       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // slc
3150     }};
3151 
3152   if (!STI.hasFlatInstOffsets())
3153     return Default;
3154 
3155   const MachineInstr *OpDef = MRI->getVRegDef(Root.getReg());
3156   if (!OpDef || OpDef->getOpcode() != AMDGPU::G_PTR_ADD)
3157     return Default;
3158 
3159   Optional<int64_t> Offset =
3160     getConstantVRegVal(OpDef->getOperand(2).getReg(), *MRI);
3161   if (!Offset.hasValue())
3162     return Default;
3163 
3164   unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
3165   if (!TII.isLegalFLATOffset(Offset.getValue(), AddrSpace, Signed))
3166     return Default;
3167 
3168   Register BasePtr = OpDef->getOperand(1).getReg();
3169 
3170   return {{
3171       [=](MachineInstrBuilder &MIB) { MIB.addReg(BasePtr); },
3172       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset.getValue()); },
3173       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // slc
3174     }};
3175 }
3176 
3177 InstructionSelector::ComplexRendererFns
3178 AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
3179   return selectFlatOffsetImpl<false>(Root);
3180 }
3181 
3182 InstructionSelector::ComplexRendererFns
3183 AMDGPUInstructionSelector::selectFlatOffsetSigned(MachineOperand &Root) const {
3184   return selectFlatOffsetImpl<true>(Root);
3185 }
3186 
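/// Return true if the access is relative to the stack pointer, i.e. its
/// pointer info refers to a stack pseudo source value.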
3187 static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) {
3188   auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>();
3189   return PSV && PSV->isStack();
3190 }
3191 
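/// Select the rsrc, vaddr, soffset and offset operands for a MUBUF scratch
/// access in OFFEN mode, folding a constant address or frame index into the
/// operands where possible.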
3192 InstructionSelector::ComplexRendererFns
3193 AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
3194   MachineInstr *MI = Root.getParent();
3195   MachineBasicBlock *MBB = MI->getParent();
3196   MachineFunction *MF = MBB->getParent();
3197   const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3198 
3199   int64_t Offset = 0;
3200   if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
3201       Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) {
3202     Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3203 
3204     // TODO: Should this be inside the render function? The iterator seems to
3205     // move.
3206     BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
3207             HighBits)
3208       .addImm(Offset & ~4095);
3209 
3210     return {{[=](MachineInstrBuilder &MIB) { // rsrc
3211                MIB.addReg(Info->getScratchRSrcReg());
3212              },
3213              [=](MachineInstrBuilder &MIB) { // vaddr
3214                MIB.addReg(HighBits);
3215              },
3216              [=](MachineInstrBuilder &MIB) { // soffset
3217                const MachineMemOperand *MMO = *MI->memoperands_begin();
3218                const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();
3219 
3220                if (isStackPtrRelative(PtrInfo))
3221                  MIB.addReg(Info->getStackPtrOffsetReg());
3222                else
3223                  MIB.addImm(0);
3224              },
3225              [=](MachineInstrBuilder &MIB) { // offset
3226                MIB.addImm(Offset & 4095);
3227              }}};
3228   }
3229 
3230   assert(Offset == 0 || Offset == -1);
3231 
3232   // Try to fold a frame index directly into the MUBUF vaddr field, and any
3233   // offsets.
3234   Optional<int> FI;
3235   Register VAddr = Root.getReg();
3236   if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) {
3237     if (isBaseWithConstantOffset(Root, *MRI)) {
3238       const MachineOperand &LHS = RootDef->getOperand(1);
3239       const MachineOperand &RHS = RootDef->getOperand(2);
3240       const MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg());
3241       const MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg());
3242       if (LHSDef && RHSDef) {
3243         int64_t PossibleOffset =
3244             RHSDef->getOperand(1).getCImm()->getSExtValue();
3245         if (SIInstrInfo::isLegalMUBUFImmOffset(PossibleOffset) &&
3246             (!STI.privateMemoryResourceIsRangeChecked() ||
3247              KnownBits->signBitIsZero(LHS.getReg()))) {
3248           if (LHSDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
3249             FI = LHSDef->getOperand(1).getIndex();
3250           else
3251             VAddr = LHS.getReg();
3252           Offset = PossibleOffset;
3253         }
3254       }
3255     } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
3256       FI = RootDef->getOperand(1).getIndex();
3257     }
3258   }
3259 
3260   return {{[=](MachineInstrBuilder &MIB) { // rsrc
3261              MIB.addReg(Info->getScratchRSrcReg());
3262            },
3263            [=](MachineInstrBuilder &MIB) { // vaddr
3264              if (FI.hasValue())
3265                MIB.addFrameIndex(FI.getValue());
3266              else
3267                MIB.addReg(VAddr);
3268            },
3269            [=](MachineInstrBuilder &MIB) { // soffset
3270              // If we don't know this private access is a local stack object, it
3271              // needs to be relative to the entry point's scratch wave offset.
3272              // TODO: Should split large offsets that don't fit like above.
3273              // TODO: Don't use scratch wave offset just because the offset
3274              // didn't fit.
3275              if (!Info->isEntryFunction() && FI.hasValue())
3276                MIB.addReg(Info->getStackPtrOffsetReg());
3277              else
3278                MIB.addImm(0);
3279            },
3280            [=](MachineInstrBuilder &MIB) { // offset
3281              MIB.addImm(Offset);
3282            }}};
3283 }
3284 
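/// Return true if \p Offset fits in an unsigned DS offset field of
/// \p OffsetBits bits and, where required, the base is known to be
/// non-negative.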
3285 bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
3286                                                 int64_t Offset,
3287                                                 unsigned OffsetBits) const {
3288   if ((OffsetBits == 16 && !isUInt<16>(Offset)) ||
3289       (OffsetBits == 8 && !isUInt<8>(Offset)))
3290     return false;
3291 
3292   if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
3293     return true;
3294 
3295   // On Southern Islands, instructions with a negative base value and an
3296   // offset don't seem to work.
3297   return KnownBits->signBitIsZero(Base);
3298 }
3299 
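/// Select a MUBUF scratch access whose address is a legal constant offset,
/// using only the rsrc, soffset and offset operands (no vaddr).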
3300 InstructionSelector::ComplexRendererFns
3301 AMDGPUInstructionSelector::selectMUBUFScratchOffset(
3302     MachineOperand &Root) const {
3303   MachineInstr *MI = Root.getParent();
3304   MachineBasicBlock *MBB = MI->getParent();
3305 
3306   int64_t Offset = 0;
3307   if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
3308       !SIInstrInfo::isLegalMUBUFImmOffset(Offset))
3309     return {};
3310 
3311   const MachineFunction *MF = MBB->getParent();
3312   const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3313   const MachineMemOperand *MMO = *MI->memoperands_begin();
3314   const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();
3315 
3316   return {{
3317       [=](MachineInstrBuilder &MIB) { // rsrc
3318         MIB.addReg(Info->getScratchRSrcReg());
3319       },
3320       [=](MachineInstrBuilder &MIB) { // soffset
3321         if (isStackPtrRelative(PtrInfo))
3322           MIB.addReg(Info->getStackPtrOffsetReg());
3323         else
3324           MIB.addImm(0);
3325       },
3326       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
3327   }};
3328 }
3329 
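/// Compute the base register and 16-bit unsigned offset for a single-address
/// DS operation.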
3330 std::pair<Register, unsigned>
3331 AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
3332   const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
3333   if (!RootDef)
3334     return std::make_pair(Root.getReg(), 0);
3335 
3336   int64_t ConstAddr = 0;
3337 
3338   Register PtrBase;
3339   int64_t Offset;
3340   std::tie(PtrBase, Offset) =
3341     getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
3342 
3343   if (Offset) {
3344     if (isDSOffsetLegal(PtrBase, Offset, 16)) {
3345       // (add n0, c0)
3346       return std::make_pair(PtrBase, Offset);
3347     }
3348   } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
3349     // TODO
3350 
3352   } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
3353     // TODO
3354 
3355   }
3356 
3357   return std::make_pair(Root.getReg(), 0);
3358 }
3359 
3360 InstructionSelector::ComplexRendererFns
3361 AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
3362   Register Reg;
3363   unsigned Offset;
3364   std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
3365   return {{
3366       [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3367       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
3368     }};
3369 }
3370 
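/// Select the base register and the two dword offsets (offset0, offset1)
/// used by 2-dword DS operations on a 4-byte-aligned address.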
3371 InstructionSelector::ComplexRendererFns
3372 AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
3373   Register Reg;
3374   unsigned Offset;
3375   std::tie(Reg, Offset) = selectDS64Bit4ByteAlignedImpl(Root);
3376   return {{
3377       [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3378       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
3379       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
3380     }};
3381 }
3382 
3383 std::pair<Register, unsigned>
3384 AMDGPUInstructionSelector::selectDS64Bit4ByteAlignedImpl(MachineOperand &Root) const {
3385   const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
3386   if (!RootDef)
3387     return std::make_pair(Root.getReg(), 0);
3388 
3389   int64_t ConstAddr = 0;
3390 
3391   Register PtrBase;
3392   int64_t Offset;
3393   std::tie(PtrBase, Offset) =
3394     getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
3395 
3396   if (Offset) {
3397     int64_t DWordOffset0 = Offset / 4;
3398     int64_t DWordOffset1 = DWordOffset0 + 1;
3399     if (isDSOffsetLegal(PtrBase, DWordOffset1, 8)) {
3400       // (add n0, c0)
3401       return std::make_pair(PtrBase, DWordOffset0);
3402     }
3403   } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
3404     // TODO
3405 
3406   } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
3407     // TODO
3408 
3409   }
3410 
3411   return std::make_pair(Root.getReg(), 0);
3412 }
3413 
3414 /// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
3415 /// the base register and the constant offset. There may be intervening copies
3416 /// between \p Root and the identified constant. Returns (\p Root, 0) if this
3417 /// does not match the pattern.
3418 std::pair<Register, int64_t>
3419 AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
3420   Register Root, const MachineRegisterInfo &MRI) const {
3421   MachineInstr *RootI = MRI.getVRegDef(Root);
3422   if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
3423     return {Root, 0};
3424 
3425   MachineOperand &RHS = RootI->getOperand(2);
3426   Optional<ValueAndVReg> MaybeOffset
3427     = getConstantVRegValWithLookThrough(RHS.getReg(), MRI, true);
3428   if (!MaybeOffset)
3429     return {Root, 0};
3430   return {RootI->getOperand(1).getReg(), MaybeOffset->Value};
3431 }
3432 
3433 static void addZeroImm(MachineInstrBuilder &MIB) {
3434   MIB.addImm(0);
3435 }
3436 
3437 /// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
3438 /// BasePtr is not valid, a null base pointer will be used.
3439 static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
3440                           uint32_t FormatLo, uint32_t FormatHi,
3441                           Register BasePtr) {
3442   Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
3443   Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
3444   Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3445   Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
3446 
3447   B.buildInstr(AMDGPU::S_MOV_B32)
3448     .addDef(RSrc2)
3449     .addImm(FormatLo);
3450   B.buildInstr(AMDGPU::S_MOV_B32)
3451     .addDef(RSrc3)
3452     .addImm(FormatHi);
3453 
3454   // Build the 64-bit half holding the format constants before building the
3455   // full 128-bit register. If we are building multiple resource descriptors,
3456   // this allows the two-component register to be CSE'd.
3457   B.buildInstr(AMDGPU::REG_SEQUENCE)
3458     .addDef(RSrcHi)
3459     .addReg(RSrc2)
3460     .addImm(AMDGPU::sub0)
3461     .addReg(RSrc3)
3462     .addImm(AMDGPU::sub1);
3463 
3464   Register RSrcLo = BasePtr;
3465   if (!BasePtr) {
3466     RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3467     B.buildInstr(AMDGPU::S_MOV_B64)
3468       .addDef(RSrcLo)
3469       .addImm(0);
3470   }
3471 
3472   B.buildInstr(AMDGPU::REG_SEQUENCE)
3473     .addDef(RSrc)
3474     .addReg(RSrcLo)
3475     .addImm(AMDGPU::sub0_sub1)
3476     .addReg(RSrcHi)
3477     .addImm(AMDGPU::sub2_sub3);
3478 
3479   return RSrc;
3480 }
3481 
3482 static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
3483                                 const SIInstrInfo &TII, Register BasePtr) {
3484   uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
3485 
3486   // FIXME: Why are half the "default" bits ignored based on the addressing
3487   // mode?
3488   return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
3489 }
3490 
3491 static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
3492                                const SIInstrInfo &TII, Register BasePtr) {
3493   uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
3494 
3495   // FIXME: Why are half the "default" bits ignored based on the addressing
3496   // mode?
3497   return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
3498 }
3499 
3500 AMDGPUInstructionSelector::MUBUFAddressData
3501 AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
3502   MUBUFAddressData Data;
3503   Data.N0 = Src;
3504 
3505   Register PtrBase;
3506   int64_t Offset;
3507 
3508   std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI);
3509   if (isUInt<32>(Offset)) {
3510     Data.N0 = PtrBase;
3511     Data.Offset = Offset;
3512   }
3513 
3514   if (MachineInstr *InputAdd
3515       = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
3516     Data.N2 = InputAdd->getOperand(1).getReg();
3517     Data.N3 = InputAdd->getOperand(2).getReg();
3518 
3519     // FIXME: Need to fix extra SGPR->VGPR copies inserted
3520     // FIXME: We don't know that this was defined by operand 0
3521     //
3522     // TODO: Remove this when we have copy folding optimizations after
3523     // RegBankSelect.
3524     Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
3525     Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
3526   }
3527 
3528   return Data;
3529 }
3530 
3531 /// Return whether the MUBUF addr64 mode should be used for the given address.
3532 bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
3533   // (ptr_add N2, N3) -> addr64, or
3534   // (ptr_add (ptr_add N2, N3), C1) -> addr64
3535   if (Addr.N2)
3536     return true;
3537 
3538   const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
3539   return N0Bank->getID() == AMDGPU::VGPRRegBankID;
3540 }
3541 
3542 /// Split an immediate offset \p ImmOffset depending on whether it fits in the
3543 /// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
3544 /// component.
3545 void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
3546   MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
3547   if (SIInstrInfo::isLegalMUBUFImmOffset(ImmOffset))
3548     return;
3549 
3550   // Illegal offset, store it in soffset.
3551   SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3552   B.buildInstr(AMDGPU::S_MOV_B32)
3553     .addDef(SOffset)
3554     .addImm(ImmOffset);
3555   ImmOffset = 0;
3556 }
3557 
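/// Compute the vaddr, rsrc, soffset and immediate offset operands for the
/// MUBUF ADDR64 addressing mode. Fails if the subtarget has no addr64 mode or
/// the address does not require it.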
3558 bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
3559   MachineOperand &Root, Register &VAddr, Register &RSrcReg,
3560   Register &SOffset, int64_t &Offset) const {
3561   // FIXME: Predicates should stop this from reaching here.
3562   // The addr64 bit was removed for Volcanic Islands.
3563   if (!STI.hasAddr64() || STI.useFlatForGlobal())
3564     return false;
3565 
3566   MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
3567   if (!shouldUseAddr64(AddrData))
3568     return false;
3569 
3570   Register N0 = AddrData.N0;
3571   Register N2 = AddrData.N2;
3572   Register N3 = AddrData.N3;
3573   Offset = AddrData.Offset;
3574 
3575   // Base pointer for the SRD.
3576   Register SRDPtr;
3577 
3578   if (N2) {
3579     if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
3580       assert(N3);
3581       if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
3582         // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
3583         // addr64, and construct the default resource from a 0 address.
3584         VAddr = N0;
3585       } else {
3586         SRDPtr = N3;
3587         VAddr = N2;
3588       }
3589     } else {
3590       // N2 is not divergent.
3591       SRDPtr = N2;
3592       VAddr = N3;
3593     }
3594   } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
3595     // Use the default null pointer in the resource
3596     VAddr = N0;
3597   } else {
3598     // N0 -> offset, or
3599     // (N0 + C1) -> offset
3600     SRDPtr = N0;
3601   }
3602 
3603   MachineIRBuilder B(*Root.getParent());
3604   RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
3605   splitIllegalMUBUFOffset(B, SOffset, Offset);
3606   return true;
3607 }
3608 
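/// Compute the rsrc, soffset and immediate offset operands for the MUBUF
/// offset (non-addr64) addressing mode.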
3609 bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
3610   MachineOperand &Root, Register &RSrcReg, Register &SOffset,
3611   int64_t &Offset) const {
3612   MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
3613   if (shouldUseAddr64(AddrData))
3614     return false;
3615 
3616   // N0 -> offset, or
3617   // (N0 + C1) -> offset
3618   Register SRDPtr = AddrData.N0;
3619   Offset = AddrData.Offset;
3620 
3621   // TODO: Look through extensions for 32-bit soffset.
3622   MachineIRBuilder B(*Root.getParent());
3623 
3624   RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
3625   splitIllegalMUBUFOffset(B, SOffset, Offset);
3626   return true;
3627 }
3628 
3629 InstructionSelector::ComplexRendererFns
3630 AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
3631   Register VAddr;
3632   Register RSrcReg;
3633   Register SOffset;
3634   int64_t Offset = 0;
3635 
3636   if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
3637     return {};
3638 
3639   // FIXME: Use defaulted operands for trailing 0s and remove from the complex
3640   // pattern.
3641   return {{
3642       [=](MachineInstrBuilder &MIB) {  // rsrc
3643         MIB.addReg(RSrcReg);
3644       },
3645       [=](MachineInstrBuilder &MIB) { // vaddr
3646         MIB.addReg(VAddr);
3647       },
3648       [=](MachineInstrBuilder &MIB) { // soffset
3649         if (SOffset)
3650           MIB.addReg(SOffset);
3651         else
3652           MIB.addImm(0);
3653       },
3654       [=](MachineInstrBuilder &MIB) { // offset
3655         MIB.addImm(Offset);
3656       },
3657       addZeroImm, //  glc
3658       addZeroImm, //  slc
3659       addZeroImm, //  tfe
3660       addZeroImm, //  dlc
3661       addZeroImm  //  swz
3662     }};
3663 }
3664 
3665 InstructionSelector::ComplexRendererFns
3666 AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
3667   Register RSrcReg;
3668   Register SOffset;
3669   int64_t Offset = 0;
3670 
3671   if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
3672     return {};
3673 
3674   return {{
3675       [=](MachineInstrBuilder &MIB) {  // rsrc
3676         MIB.addReg(RSrcReg);
3677       },
3678       [=](MachineInstrBuilder &MIB) { // soffset
3679         if (SOffset)
3680           MIB.addReg(SOffset);
3681         else
3682           MIB.addImm(0);
3683       },
3684       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
3685       addZeroImm, //  glc
3686       addZeroImm, //  slc
3687       addZeroImm, //  tfe
3688       addZeroImm, //  dlc
3689       addZeroImm  //  swz
3690     }};
3691 }
3692 
3693 InstructionSelector::ComplexRendererFns
3694 AMDGPUInstructionSelector::selectMUBUFAddr64Atomic(MachineOperand &Root) const {
3695   Register VAddr;
3696   Register RSrcReg;
3697   Register SOffset;
3698   int64_t Offset = 0;
3699 
3700   if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
3701     return {};
3702 
3703   // FIXME: Use defaulted operands for trailing 0s and remove from the complex
3704   // pattern.
3705   return {{
3706       [=](MachineInstrBuilder &MIB) {  // rsrc
3707         MIB.addReg(RSrcReg);
3708       },
3709       [=](MachineInstrBuilder &MIB) { // vaddr
3710         MIB.addReg(VAddr);
3711       },
3712       [=](MachineInstrBuilder &MIB) { // soffset
3713         if (SOffset)
3714           MIB.addReg(SOffset);
3715         else
3716           MIB.addImm(0);
3717       },
3718       [=](MachineInstrBuilder &MIB) { // offset
3719         MIB.addImm(Offset);
3720       },
3721       addZeroImm //  slc
3722     }};
3723 }
3724 
3725 InstructionSelector::ComplexRendererFns
3726 AMDGPUInstructionSelector::selectMUBUFOffsetAtomic(MachineOperand &Root) const {
3727   Register RSrcReg;
3728   Register SOffset;
3729   int64_t Offset = 0;
3730 
3731   if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
3732     return {};
3733 
3734   return {{
3735       [=](MachineInstrBuilder &MIB) {  // rsrc
3736         MIB.addReg(RSrcReg);
3737       },
3738       [=](MachineInstrBuilder &MIB) { // soffset
3739         if (SOffset)
3740           MIB.addReg(SOffset);
3741         else
3742           MIB.addImm(0);
3743       },
3744       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
3745       addZeroImm //  slc
3746     }};
3747 }
3748 
3749 /// Get an immediate that must fit in 32 bits, and is treated as zero extended.
3750 static Optional<uint64_t> getConstantZext32Val(Register Reg,
3751                                                const MachineRegisterInfo &MRI) {
3752   // getConstantVRegVal sexts any values, so see if that matters.
3753   Optional<int64_t> OffsetVal = getConstantVRegVal(Reg, MRI);
3754   if (!OffsetVal || !isInt<32>(*OffsetVal))
3755     return None;
3756   return Lo_32(*OffsetVal);
3757 }
3758 
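/// Match a constant SMRD buffer offset that can be encoded directly in the
/// instruction.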
3759 InstructionSelector::ComplexRendererFns
3760 AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
3761   Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
3762   if (!OffsetVal)
3763     return {};
3764 
3765   Optional<int64_t> EncodedImm =
3766       AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
3767   if (!EncodedImm)
3768     return {};
3769 
3770   return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }  }};
3771 }
3772 
3773 InstructionSelector::ComplexRendererFns
3774 AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
3775   assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
3776 
3777   Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
3778   if (!OffsetVal)
3779     return {};
3780 
3781   Optional<int64_t> EncodedImm
3782     = AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
3783   if (!EncodedImm)
3784     return {};
3785 
3786   return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }  }};
3787 }
3788 
3789 void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
3790                                                  const MachineInstr &MI,
3791                                                  int OpIdx) const {
3792   assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
3793          "Expected G_CONSTANT");
3794   MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
3795 }
3796 
3797 void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
3798                                                 const MachineInstr &MI,
3799                                                 int OpIdx) const {
3800   assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
3801          "Expected G_CONSTANT");
3802   MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
3803 }
3804 
3805 void AMDGPUInstructionSelector::renderBitcastImm(MachineInstrBuilder &MIB,
3806                                                  const MachineInstr &MI,
3807                                                  int OpIdx) const {
3808   assert(OpIdx == -1);
3809 
3810   const MachineOperand &Op = MI.getOperand(1);
3811   if (MI.getOpcode() == TargetOpcode::G_FCONSTANT)
3812     MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
3813   else {
3814     assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
3815     MIB.addImm(Op.getCImm()->getSExtValue());
3816   }
3817 }
3818 
3819 void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
3820                                                 const MachineInstr &MI,
3821                                                 int OpIdx) const {
3822   assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
3823          "Expected G_CONSTANT");
3824   MIB.addImm(MI.getOperand(1).getCImm()->getValue().countPopulation());
3825 }
3826 
3827 /// This only really exists to satisfy the DAG type checking machinery, so no
3828 /// actual truncation is performed here.
3829 void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
3830                                                 const MachineInstr &MI,
3831                                                 int OpIdx) const {
3832   MIB.addImm(MI.getOperand(OpIdx).getImm());
3833 }
3834 
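// The renderExtract* helpers below unpack individual bits from a combined
// immediate operand: bit 0 is glc, bit 1 is slc, bit 2 is dlc and bit 3 is
// swz.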
3835 void AMDGPUInstructionSelector::renderExtractGLC(MachineInstrBuilder &MIB,
3836                                                  const MachineInstr &MI,
3837                                                  int OpIdx) const {
3838   assert(OpIdx >= 0 && "expected to match an immediate operand");
3839   MIB.addImm(MI.getOperand(OpIdx).getImm() & 1);
3840 }
3841 
3842 void AMDGPUInstructionSelector::renderExtractSLC(MachineInstrBuilder &MIB,
3843                                                  const MachineInstr &MI,
3844                                                  int OpIdx) const {
3845   assert(OpIdx >= 0 && "expected to match an immediate operand");
3846   MIB.addImm((MI.getOperand(OpIdx).getImm() >> 1) & 1);
3847 }
3848 
3849 void AMDGPUInstructionSelector::renderExtractDLC(MachineInstrBuilder &MIB,
3850                                                  const MachineInstr &MI,
3851                                                  int OpIdx) const {
3852   assert(OpIdx >= 0 && "expected to match an immediate operand");
3853   MIB.addImm((MI.getOperand(OpIdx).getImm() >> 2) & 1);
3854 }
3855 
3856 void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
3857                                                  const MachineInstr &MI,
3858                                                  int OpIdx) const {
3859   assert(OpIdx >= 0 && "expected to match an immediate operand");
3860   MIB.addImm((MI.getOperand(OpIdx).getImm() >> 3) & 1);
3861 }
3862 
3863 bool AMDGPUInstructionSelector::isInlineImmediate16(int64_t Imm) const {
3864   return AMDGPU::isInlinableLiteral16(Imm, STI.hasInv2PiInlineImm());
3865 }
3866 
3867 bool AMDGPUInstructionSelector::isInlineImmediate32(int64_t Imm) const {
3868   return AMDGPU::isInlinableLiteral32(Imm, STI.hasInv2PiInlineImm());
3869 }
3870 
3871 bool AMDGPUInstructionSelector::isInlineImmediate64(int64_t Imm) const {
3872   return AMDGPU::isInlinableLiteral64(Imm, STI.hasInv2PiInlineImm());
3873 }
3874 
3875 bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
3876   return TII.isInlineConstant(Imm);
3877 }
3878