1 //===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the InstructionSelector class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPUInstructionSelector.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
17 #include "AMDGPURegisterBankInfo.h"
18 #include "AMDGPUSubtarget.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
21 #include "SIMachineFunctionInfo.h"
22 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
23 #include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
24 #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
25 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
26 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
27 #include "llvm/CodeGen/GlobalISel/Utils.h"
28 #include "llvm/CodeGen/MachineBasicBlock.h"
29 #include "llvm/CodeGen/MachineFunction.h"
30 #include "llvm/CodeGen/MachineInstr.h"
31 #include "llvm/CodeGen/MachineInstrBuilder.h"
32 #include "llvm/CodeGen/MachineRegisterInfo.h"
33 #include "llvm/IR/Type.h"
34 #include "llvm/Support/Debug.h"
35 #include "llvm/Support/raw_ostream.h"
36 
37 #define DEBUG_TYPE "amdgpu-isel"
38 
39 using namespace llvm;
40 using namespace MIPatternMatch;
41 
42 static cl::opt<bool> AllowRiskySelect(
43   "amdgpu-global-isel-risky-select",
44   cl::desc("Allow GlobalISel to select cases that are likely to not work yet"),
45   cl::init(false),
46   cl::ReallyHidden);
47 
48 #define GET_GLOBALISEL_IMPL
49 #define AMDGPUSubtarget GCNSubtarget
50 #include "AMDGPUGenGlobalISel.inc"
51 #undef GET_GLOBALISEL_IMPL
52 #undef AMDGPUSubtarget
53 
54 AMDGPUInstructionSelector::AMDGPUInstructionSelector(
55     const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
56     const AMDGPUTargetMachine &TM)
57     : InstructionSelector(), TII(*STI.getInstrInfo()),
58       TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
59       STI(STI),
60       EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG),
61 #define GET_GLOBALISEL_PREDICATES_INIT
62 #include "AMDGPUGenGlobalISel.inc"
63 #undef GET_GLOBALISEL_PREDICATES_INIT
64 #define GET_GLOBALISEL_TEMPORARIES_INIT
65 #include "AMDGPUGenGlobalISel.inc"
66 #undef GET_GLOBALISEL_TEMPORARIES_INIT
67 {
68 }
69 
70 const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
71 
72 void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits &KB,
73                                         CodeGenCoverage &CoverageInfo) {
74   MRI = &MF.getRegInfo();
75   InstructionSelector::setupMF(MF, KB, CoverageInfo);
76 }
77 
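// Return true if Reg holds a wave-wide condition value: either the physical
// VCC register, an s1 virtual register constrained to a wave-mask (bool)
// register class, or a virtual register assigned to the VCC register bank.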
78 bool AMDGPUInstructionSelector::isVCC(Register Reg,
79                                       const MachineRegisterInfo &MRI) const {
80   if (Register::isPhysicalRegister(Reg))
81     return Reg == TRI.getVCC();
82 
83   auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
84   const TargetRegisterClass *RC =
85       RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
86   if (RC) {
87     const LLT Ty = MRI.getType(Reg);
88     return RC->hasSuperClassEq(TRI.getBoolRC()) &&
89            Ty.isValid() && Ty.getSizeInBits() == 1;
90   }
91 
92   const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
93   return RB->getID() == AMDGPU::VCCRegBankID;
94 }
95 
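// Lower a copy-like intrinsic (wqm, softwqm, wwm) to the pseudo NewOpc: drop
// the intrinsic ID operand, add an implicit exec use, and constrain the
// source and destination to a common register class.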
96 bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
97                                                         unsigned NewOpc) const {
98   MI.setDesc(TII.get(NewOpc));
99   MI.RemoveOperand(1); // Remove intrinsic ID.
100   MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
101 
102   MachineOperand &Dst = MI.getOperand(0);
103   MachineOperand &Src = MI.getOperand(1);
104 
105   // TODO: This should be legalized to s32 if needed
106   if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
107     return false;
108 
109   const TargetRegisterClass *DstRC
110     = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
111   const TargetRegisterClass *SrcRC
112     = TRI.getConstrainedRegClassForOperand(Src, *MRI);
113   if (!DstRC || DstRC != SrcRC)
114     return false;
115 
116   return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
117          RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
118 }
119 
120 bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
121   const DebugLoc &DL = I.getDebugLoc();
122   MachineBasicBlock *BB = I.getParent();
123   I.setDesc(TII.get(TargetOpcode::COPY));
124 
125   const MachineOperand &Src = I.getOperand(1);
126   MachineOperand &Dst = I.getOperand(0);
127   Register DstReg = Dst.getReg();
128   Register SrcReg = Src.getReg();
129 
130   if (isVCC(DstReg, *MRI)) {
131     if (SrcReg == AMDGPU::SCC) {
132       const TargetRegisterClass *RC
133         = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
134       if (!RC)
135         return true;
136       return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
137     }
138 
139     if (!isVCC(SrcReg, *MRI)) {
140       // TODO: Should probably leave the copy and let copyPhysReg expand it.
141       if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
142         return false;
143 
144       const TargetRegisterClass *SrcRC
145         = TRI.getConstrainedRegClassForOperand(Src, *MRI);
146 
147       Register MaskedReg = MRI->createVirtualRegister(SrcRC);
148 
149       // We can't trust the high bits at this point, so clear them.
150 
151       // TODO: Skip masking high bits if def is known boolean.
152 
153       unsigned AndOpc = TRI.isSGPRClass(SrcRC) ?
154         AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
155       BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
156         .addImm(1)
157         .addReg(SrcReg);
158       BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
159         .addImm(0)
160         .addReg(MaskedReg);
161 
162       if (!MRI->getRegClassOrNull(SrcReg))
163         MRI->setRegClass(SrcReg, SrcRC);
164       I.eraseFromParent();
165       return true;
166     }
167 
168     const TargetRegisterClass *RC =
169       TRI.getConstrainedRegClassForOperand(Dst, *MRI);
170     if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
171       return false;
172 
173     return true;
174   }
175 
176   for (const MachineOperand &MO : I.operands()) {
177     if (Register::isPhysicalRegister(MO.getReg()))
178       continue;
179 
180     const TargetRegisterClass *RC =
181             TRI.getConstrainedRegClassForOperand(MO, *MRI);
182     if (!RC)
183       continue;
184     RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
185   }
186   return true;
187 }
188 
189 bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
190   const Register DefReg = I.getOperand(0).getReg();
191   const LLT DefTy = MRI->getType(DefReg);
192   if (DefTy == LLT::scalar(1)) {
193     if (!AllowRiskySelect) {
194       LLVM_DEBUG(dbgs() << "Skipping risky boolean phi\n");
195       return false;
196     }
197 
198     LLVM_DEBUG(dbgs() << "Selecting risky boolean phi\n");
199   }
200 
201   // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
202 
203   const RegClassOrRegBank &RegClassOrBank =
204     MRI->getRegClassOrRegBank(DefReg);
205 
206   const TargetRegisterClass *DefRC
207     = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
208   if (!DefRC) {
209     if (!DefTy.isValid()) {
210       LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
211       return false;
212     }
213 
214     const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
215     DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, *MRI);
216     if (!DefRC) {
217       LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
218       return false;
219     }
220   }
221 
222   // TODO: Verify that all registers have the same bank
223   I.setDesc(TII.get(TargetOpcode::PHI));
224   return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
225 }
226 
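// Return the 32-bit half selected by SubIdx of the 64-bit operand MO, either
// by emitting a subregister COPY for a register operand or by splitting an
// immediate.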
227 MachineOperand
228 AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
229                                            const TargetRegisterClass &SubRC,
230                                            unsigned SubIdx) const {
231 
232   MachineInstr *MI = MO.getParent();
233   MachineBasicBlock *BB = MO.getParent()->getParent();
234   Register DstReg = MRI->createVirtualRegister(&SubRC);
235 
236   if (MO.isReg()) {
237     unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
238     Register Reg = MO.getReg();
239     BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
240             .addReg(Reg, 0, ComposedSubIdx);
241 
242     return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
243                                      MO.isKill(), MO.isDead(), MO.isUndef(),
244                                      MO.isEarlyClobber(), 0, MO.isDebug(),
245                                      MO.isInternalRead());
246   }
247 
248   assert(MO.isImm());
249 
250   APInt Imm(64, MO.getImm());
251 
252   switch (SubIdx) {
253   default:
    llvm_unreachable("do not know how to split immediate with this sub index.");
255   case AMDGPU::sub0:
256     return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
257   case AMDGPU::sub1:
258     return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
259   }
260 }
261 
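// Map a generic bitwise opcode to the corresponding 32- or 64-bit scalar
// instruction.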
262 static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
263   switch (Opc) {
264   case AMDGPU::G_AND:
265     return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
266   case AMDGPU::G_OR:
267     return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
268   case AMDGPU::G_XOR:
269     return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
270   default:
271     llvm_unreachable("not a bit op");
272   }
273 }
274 
275 bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
276   Register DstReg = I.getOperand(0).getReg();
277   unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
278 
279   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
280   if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
281       DstRB->getID() != AMDGPU::VCCRegBankID)
282     return false;
283 
284   bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
285                             STI.isWave64());
286   I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));
287 
288   // Dead implicit-def of scc
289   I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
290                                          true, // isImp
291                                          false, // isKill
292                                          true)); // isDead
293   return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
294 }
295 
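// Select 32-bit add/sub directly to the SALU/VALU instructions. 64-bit values
// are split into 32-bit halves chained through carry and recombined with a
// REG_SEQUENCE.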
296 bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
297   MachineBasicBlock *BB = I.getParent();
298   MachineFunction *MF = BB->getParent();
299   Register DstReg = I.getOperand(0).getReg();
300   const DebugLoc &DL = I.getDebugLoc();
301   LLT Ty = MRI->getType(DstReg);
302   if (Ty.isVector())
303     return false;
304 
305   unsigned Size = Ty.getSizeInBits();
306   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
307   const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
308   const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;
309 
310   if (Size == 32) {
311     if (IsSALU) {
312       const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
313       MachineInstr *Add =
314         BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
315         .add(I.getOperand(1))
316         .add(I.getOperand(2));
317       I.eraseFromParent();
318       return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
319     }
320 
321     if (STI.hasAddNoCarry()) {
322       const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
323       I.setDesc(TII.get(Opc));
324       I.addOperand(*MF, MachineOperand::CreateImm(0));
325       I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
326       return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
327     }
328 
329     const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;
330 
331     Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
332     MachineInstr *Add
333       = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
334       .addDef(UnusedCarry, RegState::Dead)
335       .add(I.getOperand(1))
336       .add(I.getOperand(2))
337       .addImm(0);
338     I.eraseFromParent();
339     return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
340   }
341 
342   assert(!Sub && "illegal sub should not reach here");
343 
344   const TargetRegisterClass &RC
345     = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
346   const TargetRegisterClass &HalfRC
347     = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;
348 
349   MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
350   MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
351   MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
352   MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));
353 
354   Register DstLo = MRI->createVirtualRegister(&HalfRC);
355   Register DstHi = MRI->createVirtualRegister(&HalfRC);
356 
357   if (IsSALU) {
358     BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
359       .add(Lo1)
360       .add(Lo2);
361     BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
362       .add(Hi1)
363       .add(Hi2);
364   } else {
365     const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
366     Register CarryReg = MRI->createVirtualRegister(CarryRC);
367     BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
368       .addDef(CarryReg)
369       .add(Lo1)
370       .add(Lo2)
371       .addImm(0);
372     MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
373       .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
374       .add(Hi1)
375       .add(Hi2)
376       .addReg(CarryReg, RegState::Kill)
377       .addImm(0);
378 
379     if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
380       return false;
381   }
382 
383   BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
384     .addReg(DstLo)
385     .addImm(AMDGPU::sub0)
386     .addReg(DstHi)
387     .addImm(AMDGPU::sub1);
388 
390   if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
391     return false;
392 
393   I.eraseFromParent();
394   return true;
395 }
396 
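// Select carry/borrow add and sub. When the carry-out is a VCC value this
// maps onto the VALU carry instructions; otherwise it is selected to SALU
// add/sub with explicit copies to and from SCC.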
397 bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
398   MachineInstr &I) const {
399   MachineBasicBlock *BB = I.getParent();
400   MachineFunction *MF = BB->getParent();
401   const DebugLoc &DL = I.getDebugLoc();
402   Register Dst0Reg = I.getOperand(0).getReg();
403   Register Dst1Reg = I.getOperand(1).getReg();
404   const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
405                      I.getOpcode() == AMDGPU::G_UADDE;
406   const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
407                           I.getOpcode() == AMDGPU::G_USUBE;
408 
409   if (isVCC(Dst1Reg, *MRI)) {
410     unsigned NoCarryOpc =
411         IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
412     unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
413     I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
414     I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
415     I.addOperand(*MF, MachineOperand::CreateImm(0));
416     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
417   }
418 
419   Register Src0Reg = I.getOperand(2).getReg();
420   Register Src1Reg = I.getOperand(3).getReg();
421 
422   if (HasCarryIn) {
423     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
424       .addReg(I.getOperand(4).getReg());
425   }
426 
427   unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
428   unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
429 
430   BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
431     .add(I.getOperand(2))
432     .add(I.getOperand(3));
433   BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
434     .addReg(AMDGPU::SCC);
435 
436   if (!MRI->getRegClassOrNull(Dst1Reg))
437     MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
438 
439   if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
440       !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
441       !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
442     return false;
443 
444   if (HasCarryIn &&
445       !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
446                                     AMDGPU::SReg_32RegClass, *MRI))
447     return false;
448 
449   I.eraseFromParent();
450   return true;
451 }
452 
// TODO: We should probably legalize these to use only 32-bit results.
454 bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
455   MachineBasicBlock *BB = I.getParent();
456   Register DstReg = I.getOperand(0).getReg();
457   Register SrcReg = I.getOperand(1).getReg();
458   LLT DstTy = MRI->getType(DstReg);
459   LLT SrcTy = MRI->getType(SrcReg);
460   const unsigned SrcSize = SrcTy.getSizeInBits();
461   unsigned DstSize = DstTy.getSizeInBits();
462 
463   // TODO: Should handle any multiple of 32 offset.
464   unsigned Offset = I.getOperand(2).getImm();
465   if (Offset % 32 != 0 || DstSize > 128)
466     return false;
467 
468   // 16-bit operations really use 32-bit registers.
469   // FIXME: Probably should not allow 16-bit G_EXTRACT results.
470   if (DstSize == 16)
471     DstSize = 32;
472 
473   const TargetRegisterClass *DstRC =
474     TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
475   if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
476     return false;
477 
478   const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
479   const TargetRegisterClass *SrcRC =
480     TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
481   if (!SrcRC)
482     return false;
483   unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
484                                                          DstSize / 32);
485   SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
486   if (!SrcRC)
487     return false;
488 
489   SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
490                                     *SrcRC, I.getOperand(1));
491   const DebugLoc &DL = I.getDebugLoc();
492   BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
493     .addReg(SrcReg, 0, SubReg);
494 
495   I.eraseFromParent();
496   return true;
497 }
498 
499 bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
500   MachineBasicBlock *BB = MI.getParent();
501   Register DstReg = MI.getOperand(0).getReg();
502   LLT DstTy = MRI->getType(DstReg);
503   LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());
504 
505   const unsigned SrcSize = SrcTy.getSizeInBits();
506   if (SrcSize < 32)
507     return selectImpl(MI, *CoverageInfo);
508 
509   const DebugLoc &DL = MI.getDebugLoc();
510   const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
511   const unsigned DstSize = DstTy.getSizeInBits();
512   const TargetRegisterClass *DstRC =
513     TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
514   if (!DstRC)
515     return false;
516 
517   ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
518   MachineInstrBuilder MIB =
519     BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
520   for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
521     MachineOperand &Src = MI.getOperand(I + 1);
522     MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
523     MIB.addImm(SubRegs[I]);
524 
525     const TargetRegisterClass *SrcRC
526       = TRI.getConstrainedRegClassForOperand(Src, *MRI);
527     if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
528       return false;
529   }
530 
531   if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
532     return false;
533 
534   MI.eraseFromParent();
535   return true;
536 }
537 
538 bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
539   MachineBasicBlock *BB = MI.getParent();
540   const int NumDst = MI.getNumOperands() - 1;
541 
542   MachineOperand &Src = MI.getOperand(NumDst);
543 
544   Register SrcReg = Src.getReg();
545   Register DstReg0 = MI.getOperand(0).getReg();
546   LLT DstTy = MRI->getType(DstReg0);
547   LLT SrcTy = MRI->getType(SrcReg);
548 
549   const unsigned DstSize = DstTy.getSizeInBits();
550   const unsigned SrcSize = SrcTy.getSizeInBits();
551   const DebugLoc &DL = MI.getDebugLoc();
552   const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
553 
554   const TargetRegisterClass *SrcRC =
555     TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
556   if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
557     return false;
558 
559   const unsigned SrcFlags = getUndefRegState(Src.isUndef());
560 
561   // Note we could have mixed SGPR and VGPR destination banks for an SGPR
562   // source, and this relies on the fact that the same subregister indices are
563   // used for both.
564   ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
565   for (int I = 0, E = NumDst; I != E; ++I) {
566     MachineOperand &Dst = MI.getOperand(I);
567     BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
568       .addReg(SrcReg, SrcFlags, SubRegs[I]);
569 
570     const TargetRegisterClass *DstRC =
571       TRI.getConstrainedRegClassForOperand(Dst, *MRI);
572     if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
573       return false;
574   }
575 
576   MI.eraseFromParent();
577   return true;
578 }
579 
580 bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC(
581   MachineInstr &MI) const {
582   if (selectImpl(MI, *CoverageInfo))
583     return true;
584 
585   const LLT S32 = LLT::scalar(32);
586   const LLT V2S16 = LLT::vector(2, 16);
587 
588   Register Dst = MI.getOperand(0).getReg();
589   if (MRI->getType(Dst) != V2S16)
590     return false;
591 
592   const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
593   if (DstBank->getID() != AMDGPU::SGPRRegBankID)
594     return false;
595 
596   Register Src0 = MI.getOperand(1).getReg();
597   Register Src1 = MI.getOperand(2).getReg();
598   if (MRI->getType(Src0) != S32)
599     return false;
600 
601   const DebugLoc &DL = MI.getDebugLoc();
602   MachineBasicBlock *BB = MI.getParent();
603 
604   auto ConstSrc1 = getConstantVRegValWithLookThrough(Src1, *MRI, true, true);
605   if (ConstSrc1) {
606     auto ConstSrc0 = getConstantVRegValWithLookThrough(Src0, *MRI, true, true);
607     if (ConstSrc0) {
608       uint32_t Lo16 = static_cast<uint32_t>(ConstSrc0->Value) & 0xffff;
609       uint32_t Hi16 = static_cast<uint32_t>(ConstSrc1->Value) & 0xffff;
610 
611       BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst)
612         .addImm(Lo16 | (Hi16 << 16));
613       MI.eraseFromParent();
614       return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
615     }
616   }
617 
618   // TODO: This should probably be a combine somewhere
  // (build_vector_trunc $src0, undef) -> copy $src0
620   MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
621   if (Src1Def && Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
622     MI.setDesc(TII.get(AMDGPU::COPY));
623     MI.RemoveOperand(2);
624     return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI) &&
625            RBI.constrainGenericRegister(Src0, AMDGPU::SReg_32RegClass, *MRI);
626   }
627 
628   Register ShiftSrc0;
629   Register ShiftSrc1;
630   int64_t ShiftAmt;
631 
632   // With multiple uses of the shift, this will duplicate the shift and
633   // increase register pressure.
634   //
  // (build_vector_trunc (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16))
636   //  => (S_PACK_HH_B32_B16 $src0, $src1)
637   // (build_vector_trunc $src0, (lshr_oneuse SReg_32:$src1, 16))
638   //  => (S_PACK_LH_B32_B16 $src0, $src1)
639   // (build_vector_trunc $src0, $src1)
640   //  => (S_PACK_LL_B32_B16 $src0, $src1)
641 
642   // FIXME: This is an inconvenient way to check a specific value
643   bool Shift0 = mi_match(
644     Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_ICst(ShiftAmt)))) &&
645     ShiftAmt == 16;
646 
647   bool Shift1 = mi_match(
648     Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_ICst(ShiftAmt)))) &&
649     ShiftAmt == 16;
650 
651   unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
652   if (Shift0 && Shift1) {
653     Opc = AMDGPU::S_PACK_HH_B32_B16;
654     MI.getOperand(1).setReg(ShiftSrc0);
655     MI.getOperand(2).setReg(ShiftSrc1);
656   } else if (Shift1) {
657     Opc = AMDGPU::S_PACK_LH_B32_B16;
658     MI.getOperand(2).setReg(ShiftSrc1);
659   } else if (Shift0 && ConstSrc1 && ConstSrc1->Value == 0) {
660     // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
661     auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
662       .addReg(ShiftSrc0)
663       .addImm(16);
664 
665     MI.eraseFromParent();
666     return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
667   }
668 
669   MI.setDesc(TII.get(Opc));
670   return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
671 }
672 
673 bool AMDGPUInstructionSelector::selectG_PTR_ADD(MachineInstr &I) const {
674   return selectG_ADD_SUB(I);
675 }
676 
677 bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
678   const MachineOperand &MO = I.getOperand(0);
679 
  // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
  // regbank check here only serves to distinguish why
  // getConstrainedRegClassForOperand failed.
682   const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
683   if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
684       (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
685     I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
686     return true;
687   }
688 
689   return false;
690 }
691 
692 bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
693   MachineBasicBlock *BB = I.getParent();
694 
695   Register DstReg = I.getOperand(0).getReg();
696   Register Src0Reg = I.getOperand(1).getReg();
697   Register Src1Reg = I.getOperand(2).getReg();
698   LLT Src1Ty = MRI->getType(Src1Reg);
699 
700   unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
701   unsigned InsSize = Src1Ty.getSizeInBits();
702 
703   int64_t Offset = I.getOperand(3).getImm();
704 
705   // FIXME: These cases should have been illegal and unnecessary to check here.
706   if (Offset % 32 != 0 || InsSize % 32 != 0)
707     return false;
708 
709   // Currently not handled by getSubRegFromChannel.
710   if (InsSize > 128)
711     return false;
712 
713   unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
714   if (SubReg == AMDGPU::NoSubRegister)
715     return false;
716 
717   const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
718   const TargetRegisterClass *DstRC =
719     TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
720   if (!DstRC)
721     return false;
722 
723   const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
724   const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
725   const TargetRegisterClass *Src0RC =
726     TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank, *MRI);
727   const TargetRegisterClass *Src1RC =
728     TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank, *MRI);
729 
730   // Deal with weird cases where the class only partially supports the subreg
731   // index.
732   Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
733   if (!Src0RC || !Src1RC)
734     return false;
735 
736   if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
737       !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
738       !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
739     return false;
740 
741   const DebugLoc &DL = I.getDebugLoc();
742   BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
743     .addReg(Src0Reg)
744     .addReg(Src1Reg)
745     .addImm(SubReg);
746 
747   I.eraseFromParent();
748   return true;
749 }
750 
751 bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
752   if (STI.getLDSBankCount() != 16)
753     return selectImpl(MI, *CoverageInfo);
754 
755   Register Dst = MI.getOperand(0).getReg();
756   Register Src0 = MI.getOperand(2).getReg();
757   Register M0Val = MI.getOperand(6).getReg();
758   if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
759       !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
760       !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
761     return false;
762 
763   // This requires 2 instructions. It is possible to write a pattern to support
764   // this, but the generated isel emitter doesn't correctly deal with multiple
765   // output instructions using the same physical register input. The copy to m0
766   // is incorrectly placed before the second instruction.
767   //
768   // TODO: Match source modifiers.
769 
770   Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
771   const DebugLoc &DL = MI.getDebugLoc();
772   MachineBasicBlock *MBB = MI.getParent();
773 
774   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
775     .addReg(M0Val);
776   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
777     .addImm(2)
778     .addImm(MI.getOperand(4).getImm())  // $attr
779     .addImm(MI.getOperand(3).getImm()); // $attrchan
780 
781   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
782     .addImm(0)                          // $src0_modifiers
783     .addReg(Src0)                       // $src0
784     .addImm(MI.getOperand(4).getImm())  // $attr
785     .addImm(MI.getOperand(3).getImm())  // $attrchan
786     .addImm(0)                          // $src2_modifiers
787     .addReg(InterpMov)                  // $src2 - 2 f16 values selected by high
788     .addImm(MI.getOperand(5).getImm())  // $high
789     .addImm(0)                          // $clamp
790     .addImm(0);                         // $omod
791 
792   MI.eraseFromParent();
793   return true;
794 }
795 
796 // We need to handle this here because tablegen doesn't support matching
797 // instructions with multiple outputs.
798 bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
799   Register Dst0 = MI.getOperand(0).getReg();
800   Register Dst1 = MI.getOperand(1).getReg();
801 
802   LLT Ty = MRI->getType(Dst0);
803   unsigned Opc;
804   if (Ty == LLT::scalar(32))
805     Opc = AMDGPU::V_DIV_SCALE_F32;
806   else if (Ty == LLT::scalar(64))
807     Opc = AMDGPU::V_DIV_SCALE_F64;
808   else
809     return false;
810 
811   const DebugLoc &DL = MI.getDebugLoc();
812   MachineBasicBlock *MBB = MI.getParent();
813 
814   Register Numer = MI.getOperand(3).getReg();
815   Register Denom = MI.getOperand(4).getReg();
816   unsigned ChooseDenom = MI.getOperand(5).getImm();
817 
818   Register Src0 = ChooseDenom != 0 ? Numer : Denom;
819 
820   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
821     .addDef(Dst1)
822     .addUse(Src0)
823     .addUse(Denom)
824     .addUse(Numer);
825 
826   MI.eraseFromParent();
827   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
828 }
829 
830 bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
831   unsigned IntrinsicID = I.getIntrinsicID();
832   switch (IntrinsicID) {
833   case Intrinsic::amdgcn_if_break: {
834     MachineBasicBlock *BB = I.getParent();
835 
    // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
837     // SelectionDAG uses for wave32 vs wave64.
838     BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
839       .add(I.getOperand(0))
840       .add(I.getOperand(2))
841       .add(I.getOperand(3));
842 
843     Register DstReg = I.getOperand(0).getReg();
844     Register Src0Reg = I.getOperand(2).getReg();
845     Register Src1Reg = I.getOperand(3).getReg();
846 
847     I.eraseFromParent();
848 
849     for (Register Reg : { DstReg, Src0Reg, Src1Reg })
850       MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
851 
852     return true;
853   }
854   case Intrinsic::amdgcn_interp_p1_f16:
855     return selectInterpP1F16(I);
856   case Intrinsic::amdgcn_wqm:
857     return constrainCopyLikeIntrin(I, AMDGPU::WQM);
858   case Intrinsic::amdgcn_softwqm:
859     return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
860   case Intrinsic::amdgcn_wwm:
861     return constrainCopyLikeIntrin(I, AMDGPU::WWM);
862   case Intrinsic::amdgcn_div_scale:
863     return selectDivScale(I);
864   case Intrinsic::amdgcn_icmp:
865     return selectIntrinsicIcmp(I);
866   case Intrinsic::amdgcn_ballot:
867     return selectBallot(I);
868   case Intrinsic::amdgcn_reloc_constant:
869     return selectRelocConstant(I);
870   default:
871     return selectImpl(I, *CoverageInfo);
872   }
873 }
874 
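// Return the e64 VALU compare opcode for predicate P at the given operand
// size (32 or 64 bits), or -1 if the size is not supported.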
875 static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size) {
876   if (Size != 32 && Size != 64)
877     return -1;
878   switch (P) {
879   default:
880     llvm_unreachable("Unknown condition code!");
881   case CmpInst::ICMP_NE:
882     return Size == 32 ? AMDGPU::V_CMP_NE_U32_e64 : AMDGPU::V_CMP_NE_U64_e64;
883   case CmpInst::ICMP_EQ:
884     return Size == 32 ? AMDGPU::V_CMP_EQ_U32_e64 : AMDGPU::V_CMP_EQ_U64_e64;
885   case CmpInst::ICMP_SGT:
886     return Size == 32 ? AMDGPU::V_CMP_GT_I32_e64 : AMDGPU::V_CMP_GT_I64_e64;
887   case CmpInst::ICMP_SGE:
888     return Size == 32 ? AMDGPU::V_CMP_GE_I32_e64 : AMDGPU::V_CMP_GE_I64_e64;
889   case CmpInst::ICMP_SLT:
890     return Size == 32 ? AMDGPU::V_CMP_LT_I32_e64 : AMDGPU::V_CMP_LT_I64_e64;
891   case CmpInst::ICMP_SLE:
892     return Size == 32 ? AMDGPU::V_CMP_LE_I32_e64 : AMDGPU::V_CMP_LE_I64_e64;
893   case CmpInst::ICMP_UGT:
894     return Size == 32 ? AMDGPU::V_CMP_GT_U32_e64 : AMDGPU::V_CMP_GT_U64_e64;
895   case CmpInst::ICMP_UGE:
896     return Size == 32 ? AMDGPU::V_CMP_GE_U32_e64 : AMDGPU::V_CMP_GE_U64_e64;
897   case CmpInst::ICMP_ULT:
898     return Size == 32 ? AMDGPU::V_CMP_LT_U32_e64 : AMDGPU::V_CMP_LT_U64_e64;
899   case CmpInst::ICMP_ULE:
900     return Size == 32 ? AMDGPU::V_CMP_LE_U32_e64 : AMDGPU::V_CMP_LE_U64_e64;
901   }
902 }
903 
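// Return the SALU compare opcode for predicate P, or -1 if the subtarget has
// no suitable scalar compare (e.g. 64-bit compares other than eq/ne).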
904 int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
905                                               unsigned Size) const {
906   if (Size == 64) {
907     if (!STI.hasScalarCompareEq64())
908       return -1;
909 
910     switch (P) {
911     case CmpInst::ICMP_NE:
912       return AMDGPU::S_CMP_LG_U64;
913     case CmpInst::ICMP_EQ:
914       return AMDGPU::S_CMP_EQ_U64;
915     default:
916       return -1;
917     }
918   }
919 
920   if (Size != 32)
921     return -1;
922 
923   switch (P) {
924   case CmpInst::ICMP_NE:
925     return AMDGPU::S_CMP_LG_U32;
926   case CmpInst::ICMP_EQ:
927     return AMDGPU::S_CMP_EQ_U32;
928   case CmpInst::ICMP_SGT:
929     return AMDGPU::S_CMP_GT_I32;
930   case CmpInst::ICMP_SGE:
931     return AMDGPU::S_CMP_GE_I32;
932   case CmpInst::ICMP_SLT:
933     return AMDGPU::S_CMP_LT_I32;
934   case CmpInst::ICMP_SLE:
935     return AMDGPU::S_CMP_LE_I32;
936   case CmpInst::ICMP_UGT:
937     return AMDGPU::S_CMP_GT_U32;
938   case CmpInst::ICMP_UGE:
939     return AMDGPU::S_CMP_GE_U32;
940   case CmpInst::ICMP_ULT:
941     return AMDGPU::S_CMP_LT_U32;
942   case CmpInst::ICMP_ULE:
943     return AMDGPU::S_CMP_LE_U32;
944   default:
945     llvm_unreachable("Unknown condition code!");
946   }
947 }
948 
949 bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const {
950   MachineBasicBlock *BB = I.getParent();
951   const DebugLoc &DL = I.getDebugLoc();
952 
953   Register SrcReg = I.getOperand(2).getReg();
954   unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
955 
956   auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
957 
958   Register CCReg = I.getOperand(0).getReg();
959   if (!isVCC(CCReg, *MRI)) {
960     int Opcode = getS_CMPOpcode(Pred, Size);
961     if (Opcode == -1)
962       return false;
963     MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
964             .add(I.getOperand(2))
965             .add(I.getOperand(3));
966     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
967       .addReg(AMDGPU::SCC);
968     bool Ret =
969         constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
970         RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
971     I.eraseFromParent();
972     return Ret;
973   }
974 
975   int Opcode = getV_CMPOpcode(Pred, Size);
976   if (Opcode == -1)
977     return false;
978 
979   MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode),
980             I.getOperand(0).getReg())
981             .add(I.getOperand(2))
982             .add(I.getOperand(3));
983   RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
984                                *TRI.getBoolRC(), *MRI);
985   bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
986   I.eraseFromParent();
987   return Ret;
988 }
989 
990 bool AMDGPUInstructionSelector::selectIntrinsicIcmp(MachineInstr &I) const {
991   Register Dst = I.getOperand(0).getReg();
992   if (isVCC(Dst, *MRI))
993     return false;
994 
995   if (MRI->getType(Dst).getSizeInBits() != STI.getWavefrontSize())
996     return false;
997 
998   MachineBasicBlock *BB = I.getParent();
999   const DebugLoc &DL = I.getDebugLoc();
1000   Register SrcReg = I.getOperand(2).getReg();
1001   unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1002   auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
1003 
1004   int Opcode = getV_CMPOpcode(Pred, Size);
1005   if (Opcode == -1)
1006     return false;
1007 
1008   MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst)
1009                            .add(I.getOperand(2))
1010                            .add(I.getOperand(3));
1011   RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(), *TRI.getBoolRC(),
1012                                *MRI);
1013   bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1014   I.eraseFromParent();
1015   return Ret;
1016 }
1017 
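// Select llvm.amdgcn.ballot. A constant zero argument becomes a zero mask, a
// constant all-ones argument copies exec, and a non-constant argument is
// lowered to a copy of the already-computed lane mask.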
1018 bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1019   MachineBasicBlock *BB = I.getParent();
1020   const DebugLoc &DL = I.getDebugLoc();
1021   Register DstReg = I.getOperand(0).getReg();
1022   const unsigned Size = MRI->getType(DstReg).getSizeInBits();
1023   const bool Is64 = Size == 64;
1024 
1025   if (Size != STI.getWavefrontSize())
1026     return false;
1027 
1028   Optional<ValueAndVReg> Arg =
1029       getConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI, true);
1030 
1031   if (Arg.hasValue()) {
1032     const int64_t Value = Arg.getValue().Value;
1033     if (Value == 0) {
1034       unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1035       BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
1036     } else if (Value == -1) { // all ones
1037       Register SrcReg = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
1038       BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
1039     } else
1040       return false;
1041   } else {
1042     Register SrcReg = I.getOperand(2).getReg();
1043     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
1044   }
1045 
1046   I.eraseFromParent();
1047   return true;
1048 }
1049 
1050 bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1051   Register DstReg = I.getOperand(0).getReg();
1052   const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1053   const TargetRegisterClass *DstRC =
1054     TRI.getRegClassForSizeOnBank(32, *DstBank, *MRI);
1055   if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
1056     return false;
1057 
1058   const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1059 
1060   Module *M = MF->getFunction().getParent();
1061   const MDNode *Metadata = I.getOperand(2).getMetadata();
1062   auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
1063   auto RelocSymbol = cast<GlobalVariable>(
1064     M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
1065 
1066   MachineBasicBlock *BB = I.getParent();
1067   BuildMI(*BB, &I, I.getDebugLoc(),
1068           TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
1069     .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);
1070 
1071   I.eraseFromParent();
1072   return true;
1073 }
1074 
1075 bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
  // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1077   // SelectionDAG uses for wave32 vs wave64.
1078   MachineBasicBlock *BB = MI.getParent();
1079   BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1080       .add(MI.getOperand(1));
1081 
1082   Register Reg = MI.getOperand(1).getReg();
1083   MI.eraseFromParent();
1084 
1085   if (!MRI->getRegClassOrNull(Reg))
1086     MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1087   return true;
1088 }
1089 
1090 static unsigned getDSShaderTypeValue(const MachineFunction &MF) {
1091   switch (MF.getFunction().getCallingConv()) {
1092   case CallingConv::AMDGPU_PS:
1093     return 1;
1094   case CallingConv::AMDGPU_VS:
1095     return 2;
1096   case CallingConv::AMDGPU_GS:
1097     return 3;
1098   case CallingConv::AMDGPU_HS:
1099   case CallingConv::AMDGPU_LS:
1100   case CallingConv::AMDGPU_ES:
1101     report_fatal_error("ds_ordered_count unsupported for this calling conv");
1102   case CallingConv::AMDGPU_CS:
1103   case CallingConv::AMDGPU_KERNEL:
1104   case CallingConv::C:
1105   case CallingConv::Fast:
1106   default:
1107     // Assume other calling conventions are various compute callable functions
1108     return 0;
1109   }
1110 }
1111 
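// Lower llvm.amdgcn.ds.ordered.add/swap to DS_ORDERED_COUNT, validating the
// index operand and packing the control bits into the offset field.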
1112 bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1113   MachineInstr &MI, Intrinsic::ID IntrID) const {
1114   MachineBasicBlock *MBB = MI.getParent();
1115   MachineFunction *MF = MBB->getParent();
1116   const DebugLoc &DL = MI.getDebugLoc();
1117 
1118   unsigned IndexOperand = MI.getOperand(7).getImm();
1119   bool WaveRelease = MI.getOperand(8).getImm() != 0;
1120   bool WaveDone = MI.getOperand(9).getImm() != 0;
1121 
1122   if (WaveDone && !WaveRelease)
1123     report_fatal_error("ds_ordered_count: wave_done requires wave_release");
1124 
1125   unsigned OrderedCountIndex = IndexOperand & 0x3f;
1126   IndexOperand &= ~0x3f;
1127   unsigned CountDw = 0;
1128 
1129   if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
1130     CountDw = (IndexOperand >> 24) & 0xf;
1131     IndexOperand &= ~(0xf << 24);
1132 
1133     if (CountDw < 1 || CountDw > 4) {
1134       report_fatal_error(
1135         "ds_ordered_count: dword count must be between 1 and 4");
1136     }
1137   }
1138 
1139   if (IndexOperand)
1140     report_fatal_error("ds_ordered_count: bad index operand");
1141 
1142   unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1143   unsigned ShaderType = getDSShaderTypeValue(*MF);
1144 
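  // Pack the DS_ORDERED_COUNT offset: offset0 holds the ordered-count index
  // scaled to bytes; offset1 holds wave_release, wave_done, the shader type
  // and the add/swap selector, plus the dword count on gfx10+.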
1145   unsigned Offset0 = OrderedCountIndex << 2;
1146   unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) |
1147                      (Instruction << 4);
1148 
1149   if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
1150     Offset1 |= (CountDw - 1) << 6;
1151 
1152   unsigned Offset = Offset0 | (Offset1 << 8);
1153 
1154   Register M0Val = MI.getOperand(2).getReg();
1155   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1156     .addReg(M0Val);
1157 
1158   Register DstReg = MI.getOperand(0).getReg();
1159   Register ValReg = MI.getOperand(3).getReg();
1160   MachineInstrBuilder DS =
1161     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
1162       .addReg(ValReg)
1163       .addImm(Offset)
1164       .cloneMemRefs(MI);
1165 
1166   if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
1167     return false;
1168 
1169   bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
1170   MI.eraseFromParent();
1171   return Ret;
1172 }
1173 
1174 static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1175   switch (IntrID) {
1176   case Intrinsic::amdgcn_ds_gws_init:
1177     return AMDGPU::DS_GWS_INIT;
1178   case Intrinsic::amdgcn_ds_gws_barrier:
1179     return AMDGPU::DS_GWS_BARRIER;
1180   case Intrinsic::amdgcn_ds_gws_sema_v:
1181     return AMDGPU::DS_GWS_SEMA_V;
1182   case Intrinsic::amdgcn_ds_gws_sema_br:
1183     return AMDGPU::DS_GWS_SEMA_BR;
1184   case Intrinsic::amdgcn_ds_gws_sema_p:
1185     return AMDGPU::DS_GWS_SEMA_P;
1186   case Intrinsic::amdgcn_ds_gws_sema_release_all:
1187     return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1188   default:
1189     llvm_unreachable("not a gws intrinsic");
1190   }
1191 }
1192 
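// Select the ds_gws_* intrinsics. The variable part of the offset is shifted
// left by 16 and placed in m0 (the hardware reads M0[21:16]); any constant
// part is folded into the immediate offset field.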
1193 bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
1194                                                      Intrinsic::ID IID) const {
1195   if (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1196       !STI.hasGWSSemaReleaseAll())
1197     return false;
1198 
1199   // intrinsic ID, vsrc, offset
1200   const bool HasVSrc = MI.getNumOperands() == 3;
1201   assert(HasVSrc || MI.getNumOperands() == 2);
1202 
1203   Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
1204   const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
1205   if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
1206     return false;
1207 
1208   MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1209   assert(OffsetDef);
1210 
1211   unsigned ImmOffset;
1212 
1213   MachineBasicBlock *MBB = MI.getParent();
1214   const DebugLoc &DL = MI.getDebugLoc();
1215 
1216   MachineInstr *Readfirstlane = nullptr;
1217 
1218   // If we legalized the VGPR input, strip out the readfirstlane to analyze the
1219   // incoming offset, in case there's an add of a constant. We'll have to put it
1220   // back later.
1221   if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
1222     Readfirstlane = OffsetDef;
1223     BaseOffset = OffsetDef->getOperand(1).getReg();
1224     OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1225   }
1226 
1227   if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
1228     // If we have a constant offset, try to use the 0 in m0 as the base.
1229     // TODO: Look into changing the default m0 initialization value. If the
1230     // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
1231     // the immediate offset.
1232 
1233     ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
1234     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1235       .addImm(0);
1236   } else {
1237     std::tie(BaseOffset, ImmOffset, OffsetDef)
1238       = AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset);
1239 
1240     if (Readfirstlane) {
1241       // We have the constant offset now, so put the readfirstlane back on the
1242       // variable component.
1243       if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
1244         return false;
1245 
1246       Readfirstlane->getOperand(1).setReg(BaseOffset);
1247       BaseOffset = Readfirstlane->getOperand(0).getReg();
1248     } else {
1249       if (!RBI.constrainGenericRegister(BaseOffset,
1250                                         AMDGPU::SReg_32RegClass, *MRI))
1251         return false;
1252     }
1253 
1254     Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1255     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
1256       .addReg(BaseOffset)
1257       .addImm(16);
1258 
1259     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1260       .addReg(M0Base);
1261   }
1262 
1263   // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
1264   // offset field) % 64. Some versions of the programming guide omit the m0
1265   // part, or claim it's from offset 0.
1266   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));
1267 
1268   if (HasVSrc) {
1269     Register VSrc = MI.getOperand(1).getReg();
1270     MIB.addReg(VSrc);
1271     if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
1272       return false;
1273   }
1274 
1275   MIB.addImm(ImmOffset)
1276      .addImm(-1) // $gds
1277      .cloneMemRefs(MI);
1278 
1279   MI.eraseFromParent();
1280   return true;
1281 }
1282 
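// Select ds_append/ds_consume. The base pointer goes in m0, and a constant
// offset is folded into the 16-bit immediate offset field when legal.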
1283 bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
1284                                                       bool IsAppend) const {
1285   Register PtrBase = MI.getOperand(2).getReg();
1286   LLT PtrTy = MRI->getType(PtrBase);
1287   bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
1288 
1289   unsigned Offset;
1290   std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
1291 
1292   // TODO: Should this try to look through readfirstlane like GWS?
1293   if (!isDSOffsetLegal(PtrBase, Offset, 16)) {
1294     PtrBase = MI.getOperand(2).getReg();
1295     Offset = 0;
1296   }
1297 
1298   MachineBasicBlock *MBB = MI.getParent();
1299   const DebugLoc &DL = MI.getDebugLoc();
1300   const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
1301 
1302   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1303     .addReg(PtrBase);
1304   if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
1305     return false;
1306 
1307   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
1308     .addImm(Offset)
1309     .addImm(IsGDS ? -1 : 0)
1310     .cloneMemRefs(MI);
1311   MI.eraseFromParent();
1312   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1313 }
1314 
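// Decode the texfailctrl immediate into the TFE and LWE bits. Returns false
// if unknown bits are set.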
1315 static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
1316                          bool &IsTexFail) {
1317   if (TexFailCtrl)
1318     IsTexFail = true;
1319 
1320   TFE = (TexFailCtrl & 0x1) ? 1 : 0;
1321   TexFailCtrl &= ~(uint64_t)0x1;
1322   LWE = (TexFailCtrl & 0x2) ? 1 : 0;
1323   TexFailCtrl &= ~(uint64_t)0x2;
1324 
1325   return TexFailCtrl == 0;
1326 }
1327 
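// Decode the cachepolicy immediate into the glc/slc/dlc bits. Returns false
// if unknown bits are set.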
1328 static bool parseCachePolicy(uint64_t Value,
1329                              bool *GLC, bool *SLC, bool *DLC) {
1330   if (GLC) {
1331     *GLC = (Value & 0x1) ? 1 : 0;
1332     Value &= ~(uint64_t)0x1;
1333   }
1334   if (SLC) {
1335     *SLC = (Value & 0x2) ? 1 : 0;
1336     Value &= ~(uint64_t)0x2;
1337   }
1338   if (DLC) {
1339     *DLC = (Value & 0x4) ? 1 : 0;
1340     Value &= ~(uint64_t)0x4;
1341   }
1342 
1343   return Value == 0;
1344 }
1345 
1346 bool AMDGPUInstructionSelector::selectImageIntrinsic(
1347   MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
1348   MachineBasicBlock *MBB = MI.getParent();
1349   const DebugLoc &DL = MI.getDebugLoc();
1350 
1351   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
1352     AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1353 
1354   const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
1355   const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
1356       AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
1357   const AMDGPU::MIMGMIPMappingInfo *MIPMappingInfo =
1358       AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode);
1359   unsigned IntrOpcode = Intr->BaseOpcode;
1360   const bool IsGFX10 = STI.getGeneration() >= AMDGPUSubtarget::GFX10;
1361 
1362   const int VAddrIdx = getImageVAddrIdxBegin(BaseOpcode,
1363                                              MI.getNumExplicitDefs());
1364   int NumVAddr, NumGradients;
1365   std::tie(NumVAddr, NumGradients) = getImageNumVAddr(Intr, BaseOpcode);
1366 
1367   Register VDataIn, VDataOut;
1368   LLT VDataTy;
1369   int NumVDataDwords = -1;
1370   bool IsD16 = false;
1371 
1372   // XXX - Can we just get the second to last argument for ctrl?
1373   unsigned CtrlIdx; // Index of texfailctrl argument
1374   bool Unorm;
1375   if (!BaseOpcode->Sampler) {
1376     Unorm = true;
1377     CtrlIdx = VAddrIdx + NumVAddr + 1;
1378   } else {
1379     Unorm = MI.getOperand(VAddrIdx + NumVAddr + 2).getImm() != 0;
1380     CtrlIdx = VAddrIdx + NumVAddr + 3;
1381   }
1382 
1383   bool TFE;
1384   bool LWE;
1385   bool IsTexFail = false;
1386   if (!parseTexFail(MI.getOperand(CtrlIdx).getImm(), TFE, LWE, IsTexFail))
1387     return false;
1388 
1389   const int Flags = MI.getOperand(CtrlIdx + 2).getImm();
1390   const bool IsA16 = (Flags & 1) != 0;
1391   const bool IsG16 = (Flags & 2) != 0;
1392 
  // A16 implies 16-bit gradients
1394   if (IsA16 && !IsG16)
1395     return false;
1396 
1397   unsigned DMask = 0;
1398   unsigned DMaskLanes = 0;
1399 
1400   if (BaseOpcode->Atomic) {
1401     VDataOut = MI.getOperand(0).getReg();
1402     VDataIn = MI.getOperand(2).getReg();
1403     LLT Ty = MRI->getType(VDataIn);
1404 
1405     // Be careful to allow atomic swap on 16-bit element vectors.
1406     const bool Is64Bit = BaseOpcode->AtomicX2 ?
1407       Ty.getSizeInBits() == 128 :
1408       Ty.getSizeInBits() == 64;
1409 
1410     if (BaseOpcode->AtomicX2) {
1411       assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
1412 
1413       DMask = Is64Bit ? 0xf : 0x3;
1414       NumVDataDwords = Is64Bit ? 4 : 2;
1415     } else {
1416       DMask = Is64Bit ? 0x3 : 0x1;
1417       NumVDataDwords = Is64Bit ? 2 : 1;
1418     }
1419   } else {
1420     const int DMaskIdx = 2; // Input/output + intrinsic ID.
1421 
1422     DMask = MI.getOperand(DMaskIdx).getImm();
1423     DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);
1424 
1425     if (BaseOpcode->Store) {
1426       VDataIn = MI.getOperand(1).getReg();
1427       VDataTy = MRI->getType(VDataIn);
1428       NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
1429     } else {
1430       VDataOut = MI.getOperand(0).getReg();
1431       VDataTy = MRI->getType(VDataOut);
1432       NumVDataDwords = DMaskLanes;
1433 
1434       // One memoperand is mandatory, except for getresinfo.
1435       // FIXME: Check this in verifier.
1436       if (!MI.memoperands_empty()) {
1437         const MachineMemOperand *MMO = *MI.memoperands_begin();
1438 
        // Infer d16 from the memory size, as the register type will be
        // mangled by unpacked subtargets, or by TFE.
1441         IsD16 = ((8 * MMO->getSize()) / DMaskLanes) < 32;
1442 
1443         if (IsD16 && !STI.hasUnpackedD16VMem())
1444           NumVDataDwords = (DMaskLanes + 1) / 2;
1445       }
1446     }
1447   }
1448 
1449   // Optimize _L to _LZ when _L is zero
1450   if (LZMappingInfo) {
1451     // The legalizer replaced the register with an immediate 0 if we need to
1452     // change the opcode.
1453     const MachineOperand &Lod = MI.getOperand(VAddrIdx + NumVAddr - 1);
1454     if (Lod.isImm()) {
1455       assert(Lod.getImm() == 0);
1456       IntrOpcode = LZMappingInfo->LZ;  // set new opcode to _lz variant of _l
1457     }
1458   }
1459 
1460   // Optimize _mip away, when 'lod' is zero
1461   if (MIPMappingInfo) {
1462     const MachineOperand &Lod = MI.getOperand(VAddrIdx + NumVAddr - 1);
1463     if (Lod.isImm()) {
1464       assert(Lod.getImm() == 0);
1465       IntrOpcode = MIPMappingInfo->NONMIP;  // set new opcode to variant without _mip
1466     }
1467   }
1468 
1469   // Set G16 opcode
1470   if (IsG16 && !IsA16) {
1471     const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
1472         AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
1473     assert(G16MappingInfo);
1474     IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
1475   }
1476 
1477   // TODO: Check this in verifier.
1478   assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
1479 
1480   bool GLC = false;
1481   bool SLC = false;
1482   bool DLC = false;
1483   if (BaseOpcode->Atomic) {
1484     GLC = true; // TODO no-return optimization
1485     if (!parseCachePolicy(MI.getOperand(CtrlIdx + 1).getImm(), nullptr, &SLC,
1486                           IsGFX10 ? &DLC : nullptr))
1487       return false;
1488   } else {
1489     if (!parseCachePolicy(MI.getOperand(CtrlIdx + 1).getImm(), &GLC, &SLC,
1490                           IsGFX10 ? &DLC : nullptr))
1491       return false;
1492   }
1493 
1494   int NumVAddrRegs = 0;
1495   int NumVAddrDwords = 0;
1496   for (int I = 0; I < NumVAddr; ++I) {
1497     // Skip the $noregs and 0s inserted during legalization.
1498     MachineOperand &AddrOp = MI.getOperand(VAddrIdx + I);
1499     if (!AddrOp.isReg())
1500       continue; // XXX - Break?
1501 
1502     Register Addr = AddrOp.getReg();
1503     if (!Addr)
1504       break;
1505 
1506     ++NumVAddrRegs;
1507     NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
1508   }
1509 
1510   // The legalizer preprocessed the intrinsic arguments. If we aren't using
1511   // NSA, these should have been packed into a single value in the first
1512   // address register.
1513   const bool UseNSA = NumVAddrRegs != 1 && NumVAddrDwords == NumVAddrRegs;
1514   if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
1515     LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
1516     return false;
1517   }
1518 
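  // A TFE/LWE texture-fail status is returned in an extra result dword.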
1519   if (IsTexFail)
1520     ++NumVDataDwords;
1521 
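  // Look up the concrete MIMG opcode for the encoding in use: NSA or default
  // on GFX10; otherwise try the GFX8 encoding and fall back to GFX6.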
1522   int Opcode = -1;
1523   if (IsGFX10) {
1524     Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
1525                                    UseNSA ? AMDGPU::MIMGEncGfx10NSA
1526                                           : AMDGPU::MIMGEncGfx10Default,
1527                                    NumVDataDwords, NumVAddrDwords);
1528   } else {
1529     if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
1530       Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
1531                                      NumVDataDwords, NumVAddrDwords);
1532     if (Opcode == -1)
1533       Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
1534                                      NumVDataDwords, NumVAddrDwords);
1535   }
1536   assert(Opcode != -1);
1537 
1538   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
1539     .cloneMemRefs(MI);
1540 
1541   if (VDataOut) {
1542     if (BaseOpcode->AtomicX2) {
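      // X2 atomics return the result in the low half of the doubled-width data
      // register, so go through a wide temporary and copy the low half out.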
1543       const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
1544 
1545       Register TmpReg = MRI->createVirtualRegister(
1546         Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
1547       unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
1548 
1549       MIB.addDef(TmpReg);
1550       BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
1551         .addReg(TmpReg, RegState::Kill, SubReg);
1552 
1553     } else {
1554       MIB.addDef(VDataOut); // vdata output
1555     }
1556   }
1557 
1558   if (VDataIn)
1559     MIB.addReg(VDataIn); // vdata input
1560 
1561   for (int i = 0; i != NumVAddrRegs; ++i) {
1562     MachineOperand &SrcOp = MI.getOperand(VAddrIdx + i);
1563     if (SrcOp.isReg()) {
1564       assert(SrcOp.getReg() != 0);
1565       MIB.addReg(SrcOp.getReg());
1566     }
1567   }
1568 
1569   MIB.addReg(MI.getOperand(VAddrIdx + NumVAddr).getReg()); // rsrc
1570   if (BaseOpcode->Sampler)
1571     MIB.addReg(MI.getOperand(VAddrIdx + NumVAddr + 1).getReg()); // sampler
1572 
1573   MIB.addImm(DMask); // dmask
1574 
1575   if (IsGFX10)
1576     MIB.addImm(DimInfo->Encoding);
1577   MIB.addImm(Unorm);
1578   if (IsGFX10)
1579     MIB.addImm(DLC);
1580 
1581   MIB.addImm(GLC);
1582   MIB.addImm(SLC);
1583   MIB.addImm(IsA16 &&  // a16 or r128
1584              STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
1585   if (IsGFX10)
1586     MIB.addImm(IsA16 ? -1 : 0);
1587 
1588   MIB.addImm(TFE); // tfe
1589   MIB.addImm(LWE); // lwe
1590   if (!IsGFX10)
1591     MIB.addImm(DimInfo->DA ? -1 : 0);
1592   if (BaseOpcode->HasD16)
1593     MIB.addImm(IsD16 ? -1 : 0);
1594 
1595   MI.eraseFromParent();
1596   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1597 }
1598 
1599 bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
1600     MachineInstr &I) const {
1601   unsigned IntrinsicID = I.getIntrinsicID();
1602   switch (IntrinsicID) {
1603   case Intrinsic::amdgcn_end_cf:
1604     return selectEndCfIntrinsic(I);
1605   case Intrinsic::amdgcn_ds_ordered_add:
1606   case Intrinsic::amdgcn_ds_ordered_swap:
1607     return selectDSOrderedIntrinsic(I, IntrinsicID);
1608   case Intrinsic::amdgcn_ds_gws_init:
1609   case Intrinsic::amdgcn_ds_gws_barrier:
1610   case Intrinsic::amdgcn_ds_gws_sema_v:
1611   case Intrinsic::amdgcn_ds_gws_sema_br:
1612   case Intrinsic::amdgcn_ds_gws_sema_p:
1613   case Intrinsic::amdgcn_ds_gws_sema_release_all:
1614     return selectDSGWSIntrinsic(I, IntrinsicID);
1615   case Intrinsic::amdgcn_ds_append:
1616     return selectDSAppendConsume(I, true);
1617   case Intrinsic::amdgcn_ds_consume:
1618     return selectDSAppendConsume(I, false);
1619   default: {
1620     return selectImpl(I, *CoverageInfo);
1621   }
1622   }
1623 }
1624 
1625 bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
1626   if (selectImpl(I, *CoverageInfo))
1627     return true;
1628 
1629   MachineBasicBlock *BB = I.getParent();
1630   const DebugLoc &DL = I.getDebugLoc();
1631 
1632   Register DstReg = I.getOperand(0).getReg();
1633   unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
1634   assert(Size <= 32 || Size == 64);
1635   const MachineOperand &CCOp = I.getOperand(1);
1636   Register CCReg = CCOp.getReg();
1637   if (!isVCC(CCReg, *MRI)) {
1638     unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
1639                                          AMDGPU::S_CSELECT_B32;
1640     MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
1641             .addReg(CCReg);
1642 
1643     // The generic constrainSelectedInstRegOperands doesn't work for the scc
1644     // register bank, because it does not cover the register class we use to
1645     // represent it. Manually set the register class here instead.
1646     if (!MRI->getRegClassOrNull(CCReg))
1647         MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
1648     MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
1649             .add(I.getOperand(2))
1650             .add(I.getOperand(3));
1651 
1652     bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI) |
1653                constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
1654     I.eraseFromParent();
1655     return Ret;
1656   }
1657 
1658   // Wide VGPR select should have been split in RegBankSelect.
1659   if (Size > 32)
1660     return false;
1661 
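  // V_CNDMASK_B32 selects src1 when the condition bit is set, so the false
  // value (operand 3) goes in src0 and the true value (operand 2) in src1; the
  // zero immediates are the source modifiers.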
1662   MachineInstr *Select =
1663       BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1664               .addImm(0)
1665               .add(I.getOperand(3))
1666               .addImm(0)
1667               .add(I.getOperand(2))
1668               .add(I.getOperand(1));
1669 
1670   bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
1671   I.eraseFromParent();
1672   return Ret;
1673 }
1674 
1675 bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const {
1676   initM0(I);
1677   return selectImpl(I, *CoverageInfo);
1678 }
1679 
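// Map a value size in bits to the subregister index covering its low bits.
// Sub-dword sizes map to sub0, other non-exact sizes round up to the next
// power of two, and sizes over 256 bits are unsupported.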
1680 static int sizeToSubRegIndex(unsigned Size) {
1681   switch (Size) {
1682   case 32:
1683     return AMDGPU::sub0;
1684   case 64:
1685     return AMDGPU::sub0_sub1;
1686   case 96:
1687     return AMDGPU::sub0_sub1_sub2;
1688   case 128:
1689     return AMDGPU::sub0_sub1_sub2_sub3;
1690   case 256:
1691     return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
1692   default:
1693     if (Size < 32)
1694       return AMDGPU::sub0;
1695     if (Size > 256)
1696       return -1;
1697     return sizeToSubRegIndex(PowerOf2Ceil(Size));
1698   }
1699 }
1700 
1701 bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
1702   Register DstReg = I.getOperand(0).getReg();
1703   Register SrcReg = I.getOperand(1).getReg();
1704   const LLT DstTy = MRI->getType(DstReg);
1705   const LLT SrcTy = MRI->getType(SrcReg);
1706   const LLT S1 = LLT::scalar(1);
1707 
1708   const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
1709   const RegisterBank *DstRB;
1710   if (DstTy == S1) {
1711     // This is a special case. We don't treat the s1 produced by legalization
1712     // artifacts as vcc booleans.
1713     DstRB = SrcRB;
1714   } else {
1715     DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1716     if (SrcRB != DstRB)
1717       return false;
1718   }
1719 
1720   const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
1721 
1722   unsigned DstSize = DstTy.getSizeInBits();
1723   unsigned SrcSize = SrcTy.getSizeInBits();
1724 
1725   const TargetRegisterClass *SrcRC
1726     = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, *MRI);
1727   const TargetRegisterClass *DstRC
1728     = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, *MRI);
1729   if (!SrcRC || !DstRC)
1730     return false;
1731 
1732   if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
1733       !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
1734     LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
1735     return false;
1736   }
1737 
1738   if (DstTy == LLT::vector(2, 16) && SrcTy == LLT::vector(2, 32)) {
1739     MachineBasicBlock *MBB = I.getParent();
1740     const DebugLoc &DL = I.getDebugLoc();
1741 
1742     Register LoReg = MRI->createVirtualRegister(DstRC);
1743     Register HiReg = MRI->createVirtualRegister(DstRC);
1744     BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
1745       .addReg(SrcReg, 0, AMDGPU::sub0);
1746     BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
1747       .addReg(SrcReg, 0, AMDGPU::sub1);
1748 
1749     if (IsVALU && STI.hasSDWA()) {
1750       // Write the low 16 bits of the high element into the high 16 bits of
1751       // the low element.
1752       MachineInstr *MovSDWA =
1753         BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
1754         .addImm(0)                             // $src0_modifiers
1755         .addReg(HiReg)                         // $src0
1756         .addImm(0)                             // $clamp
1757         .addImm(AMDGPU::SDWA::WORD_1)          // $dst_sel
1758         .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
1759         .addImm(AMDGPU::SDWA::WORD_0)          // $src0_sel
1760         .addReg(LoReg, RegState::Implicit);
1761       MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
1762     } else {
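      // Without SDWA, shift the high element into the top 16 bits and OR it
      // with the masked low element.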
1763       Register TmpReg0 = MRI->createVirtualRegister(DstRC);
1764       Register TmpReg1 = MRI->createVirtualRegister(DstRC);
1765       Register ImmReg = MRI->createVirtualRegister(DstRC);
1766       if (IsVALU) {
1767         BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
1768           .addImm(16)
1769           .addReg(HiReg);
1770       } else {
1771         BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
1772           .addReg(HiReg)
1773           .addImm(16);
1774       }
1775 
1776       unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
1777       unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
1778       unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
1779 
1780       BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
1781         .addImm(0xffff);
1782       BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
1783         .addReg(LoReg)
1784         .addReg(ImmReg);
1785       BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
1786         .addReg(TmpReg0)
1787         .addReg(TmpReg1);
1788     }
1789 
1790     I.eraseFromParent();
1791     return true;
1792   }
1793 
1794   if (!DstTy.isScalar())
1795     return false;
1796 
1797   if (SrcSize > 32) {
1798     int SubRegIdx = sizeToSubRegIndex(DstSize);
1799     if (SubRegIdx == -1)
1800       return false;
1801 
1802     // Deal with weird cases where the class only partially supports the subreg
1803     // index.
1804     const TargetRegisterClass *SrcWithSubRC
1805       = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
1806     if (!SrcWithSubRC)
1807       return false;
1808 
1809     if (SrcWithSubRC != SrcRC) {
1810       if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
1811         return false;
1812     }
1813 
1814     I.getOperand(1).setSubReg(SubRegIdx);
1815   }
1816 
1817   I.setDesc(TII.get(TargetOpcode::COPY));
1818   return true;
1819 }
1820 
1821 /// \returns true if a bitmask for \p Size bits will be an inline immediate.
1822 static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
1823   Mask = maskTrailingOnes<unsigned>(Size);
1824   int SignedMask = static_cast<int>(Mask);
1825   return SignedMask >= -16 && SignedMask <= 64;
1826 }
1827 
1828 // Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
1829 const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
1830   Register Reg, const MachineRegisterInfo &MRI,
1831   const TargetRegisterInfo &TRI) const {
1832   const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
1833   if (auto *RB = RegClassOrBank.dyn_cast<const RegisterBank *>())
1834     return RB;
1835 
1836   // Ignore the type, since we don't use vcc in artifacts.
1837   if (auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>())
1838     return &RBI.getRegBankFromRegClass(*RC, LLT());
1839   return nullptr;
1840 }
1841 
1842 bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
1843   bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
1844   bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
1845   const DebugLoc &DL = I.getDebugLoc();
1846   MachineBasicBlock &MBB = *I.getParent();
1847   const Register DstReg = I.getOperand(0).getReg();
1848   const Register SrcReg = I.getOperand(1).getReg();
1849 
1850   const LLT DstTy = MRI->getType(DstReg);
1851   const LLT SrcTy = MRI->getType(SrcReg);
1852   const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
1853     I.getOperand(2).getImm() : SrcTy.getSizeInBits();
1854   const unsigned DstSize = DstTy.getSizeInBits();
1855   if (!DstTy.isScalar())
1856     return false;
1857 
1858   // Artifact casts should never use vcc.
1859   const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
1860 
1861   // FIXME: This should probably be illegal and split earlier.
1862   if (I.getOpcode() == AMDGPU::G_ANYEXT) {
1863     if (DstSize <= 32)
1864       return selectCOPY(I);
1865 
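    // For a wider anyext, put the 32-bit source in the low half of the result
    // and leave the high half undefined.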
1866     const TargetRegisterClass *SrcRC =
1867         TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank, *MRI);
1868     const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1869     const TargetRegisterClass *DstRC =
1870         TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
1871 
1872     Register UndefReg = MRI->createVirtualRegister(SrcRC);
1873     BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
1874     BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
1875       .addReg(SrcReg)
1876       .addImm(AMDGPU::sub0)
1877       .addReg(UndefReg)
1878       .addImm(AMDGPU::sub1);
1879     I.eraseFromParent();
1880 
1881     return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
1882            RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
1883   }
1884 
1885   if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
1886     // 64-bit should have been split up in RegBankSelect
1887 
1888     // Try to use an and with a mask if it will save code size.
1889     unsigned Mask;
1890     if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
1891       MachineInstr *ExtI =
1892       BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
1893         .addImm(Mask)
1894         .addReg(SrcReg);
1895       I.eraseFromParent();
1896       return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
1897     }
1898 
1899     const unsigned BFE = Signed ? AMDGPU::V_BFE_I32 : AMDGPU::V_BFE_U32;
1900     MachineInstr *ExtI =
1901       BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
1902       .addReg(SrcReg)
1903       .addImm(0) // Offset
1904       .addImm(SrcSize); // Width
1905     I.eraseFromParent();
1906     return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
1907   }
1908 
1909   if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
1910     const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
1911       AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
1912     if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
1913       return false;
1914 
1915     if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
1916       const unsigned SextOpc = SrcSize == 8 ?
1917         AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
1918       BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
1919         .addReg(SrcReg);
1920       I.eraseFromParent();
1921       return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
1922     }
1923 
1924     const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
1925     const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
1926 
1927     // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16] = width.
1928     if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
1929       // We need a 64-bit register source, but the high bits don't matter.
1930       Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
1931       Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1932       unsigned SubReg = InReg ? AMDGPU::sub0 : 0;
1933 
1934       BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
1935       BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
1936         .addReg(SrcReg, 0, SubReg)
1937         .addImm(AMDGPU::sub0)
1938         .addReg(UndefReg)
1939         .addImm(AMDGPU::sub1);
1940 
1941       BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
1942         .addReg(ExtReg)
1943         .addImm(SrcSize << 16);
1944 
1945       I.eraseFromParent();
1946       return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
1947     }
1948 
1949     unsigned Mask;
1950     if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
1951       BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
1952         .addReg(SrcReg)
1953         .addImm(Mask);
1954     } else {
1955       BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
1956         .addReg(SrcReg)
1957         .addImm(SrcSize << 16);
1958     }
1959 
1960     I.eraseFromParent();
1961     return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
1962   }
1963 
1964   return false;
1965 }
1966 
1967 bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
1968   MachineBasicBlock *BB = I.getParent();
1969   MachineOperand &ImmOp = I.getOperand(1);
1970 
1971   // The AMDGPU backend only supports Imm operands and not CImm or FPImm.
1972   if (ImmOp.isFPImm()) {
1973     const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt();
1974     ImmOp.ChangeToImmediate(Imm.getZExtValue());
1975   } else if (ImmOp.isCImm()) {
1976     ImmOp.ChangeToImmediate(ImmOp.getCImm()->getSExtValue());
1977   }
1978 
1979   Register DstReg = I.getOperand(0).getReg();
1980   unsigned Size;
1981   bool IsSgpr;
1982   const RegisterBank *RB = MRI->getRegBankOrNull(I.getOperand(0).getReg());
1983   if (RB) {
1984     IsSgpr = RB->getID() == AMDGPU::SGPRRegBankID;
1985     Size = MRI->getType(DstReg).getSizeInBits();
1986   } else {
1987     const TargetRegisterClass *RC = TRI.getRegClassForReg(*MRI, DstReg);
1988     IsSgpr = TRI.isSGPRClass(RC);
1989     Size = TRI.getRegSizeInBits(*RC);
1990   }
1991 
1992   if (Size != 32 && Size != 64)
1993     return false;
1994 
1995   unsigned Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1996   if (Size == 32) {
1997     I.setDesc(TII.get(Opcode));
1998     I.addImplicitDefUseOperands(*MF);
1999     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2000   }
2001 
2002   const DebugLoc &DL = I.getDebugLoc();
2003 
2004   APInt Imm(Size, I.getOperand(1).getImm());
2005 
2006   MachineInstr *ResInst;
2007   if (IsSgpr && TII.isInlineConstant(Imm)) {
2008     ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
2009       .addImm(I.getOperand(1).getImm());
2010   } else {
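    // Materialize the 64-bit value as two 32-bit moves combined by a
    // REG_SEQUENCE.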
2011     const TargetRegisterClass *RC = IsSgpr ?
2012       &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass;
2013     Register LoReg = MRI->createVirtualRegister(RC);
2014     Register HiReg = MRI->createVirtualRegister(RC);
2015 
2016     BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg)
2017       .addImm(Imm.trunc(32).getZExtValue());
2018 
2019     BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg)
2020       .addImm(Imm.ashr(32).getZExtValue());
2021 
2022     ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2023       .addReg(LoReg)
2024       .addImm(AMDGPU::sub0)
2025       .addReg(HiReg)
2026       .addImm(AMDGPU::sub1);
2027   }
2028 
2029   // We can't call constrainSelectedInstRegOperands here, because it doesn't
2030   // work for target-independent opcodes.
2031   I.eraseFromParent();
2032   const TargetRegisterClass *DstRC =
2033     TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI);
2034   if (!DstRC)
2035     return true;
2036   return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI);
2037 }
2038 
2039 bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2040   // Only manually handle the f64 SGPR case.
2041   //
2042   // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2043   // the bit ops theoretically have a second result due to the implicit def of
2044   // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2045   // that is easy by disabling the check. The result works, but uses a
2046   // nonsensical sreg32orlds_and_sreg_1 regclass.
2047   //
2048   // The DAG emitter is more problematic, and incorrectly adds both results of
2049   // the S_XOR_B32 to the variadic REG_SEQUENCE operands.
2050 
2051   Register Dst = MI.getOperand(0).getReg();
2052   const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2053   if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2054       MRI->getType(Dst) != LLT::scalar(64))
2055     return false;
2056 
2057   Register Src = MI.getOperand(1).getReg();
2058   MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
2059   if (Fabs)
2060     Src = Fabs->getOperand(1).getReg();
2061 
2062   if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2063       !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2064     return false;
2065 
2066   MachineBasicBlock *BB = MI.getParent();
2067   const DebugLoc &DL = MI.getDebugLoc();
2068   Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2069   Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2070   Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2071   Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2072 
2073   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2074     .addReg(Src, 0, AMDGPU::sub0);
2075   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2076     .addReg(Src, 0, AMDGPU::sub1);
2077   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2078     .addImm(0x80000000);
2079 
2080   // Set or toggle sign bit.
2081   unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2082   BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
2083     .addReg(HiReg)
2084     .addReg(ConstReg);
2085   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2086     .addReg(LoReg)
2087     .addImm(AMDGPU::sub0)
2088     .addReg(OpReg)
2089     .addImm(AMDGPU::sub1);
2090   MI.eraseFromParent();
2091   return true;
2092 }
2093 
2094 // FIXME: This is a workaround for the same tablegen problems as G_FNEG
2095 bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2096   Register Dst = MI.getOperand(0).getReg();
2097   const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2098   if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2099       MRI->getType(Dst) != LLT::scalar(64))
2100     return false;
2101 
2102   Register Src = MI.getOperand(1).getReg();
2103   MachineBasicBlock *BB = MI.getParent();
2104   const DebugLoc &DL = MI.getDebugLoc();
2105   Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2106   Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2107   Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2108   Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2109 
2110   if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2111       !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2112     return false;
2113 
2114   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2115     .addReg(Src, 0, AMDGPU::sub0);
2116   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2117     .addReg(Src, 0, AMDGPU::sub1);
2118   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2119     .addImm(0x7fffffff);
2120 
2121   // Clear sign bit.
2122   // TODO: Should this use S_BITSET0_*?
2123   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
2124     .addReg(HiReg)
2125     .addReg(ConstReg);
2126   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2127     .addReg(LoReg)
2128     .addImm(AMDGPU::sub0)
2129     .addReg(OpReg)
2130     .addImm(AMDGPU::sub1);
2131 
2132   MI.eraseFromParent();
2133   return true;
2134 }
2135 
2136 static bool isConstant(const MachineInstr &MI) {
2137   return MI.getOpcode() == TargetOpcode::G_CONSTANT;
2138 }
2139 
2140 void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
2141     const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
2142 
2143   const MachineInstr *PtrMI = MRI.getUniqueVRegDef(Load.getOperand(1).getReg());
2144 
2145   assert(PtrMI);
2146 
2147   if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
2148     return;
2149 
2150   GEPInfo GEPInfo(*PtrMI);
2151 
2152   for (unsigned i = 1; i != 3; ++i) {
2153     const MachineOperand &GEPOp = PtrMI->getOperand(i);
2154     const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
2155     assert(OpDef);
2156     if (i == 2 && isConstant(*OpDef)) {
2157       // TODO: Could handle constant base + variable offset, but a combine
2158       // probably should have commuted it.
2159       assert(GEPInfo.Imm == 0);
2160       GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
2161       continue;
2162     }
2163     const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
2164     if (OpBank->getID() == AMDGPU::SGPRRegBankID)
2165       GEPInfo.SgprParts.push_back(GEPOp.getReg());
2166     else
2167       GEPInfo.VgprParts.push_back(GEPOp.getReg());
2168   }
2169 
2170   AddrInfo.push_back(GEPInfo);
2171   getAddrModeInfo(*PtrMI, MRI, AddrInfo);
2172 }
2173 
2174 bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
2175   if (!MI.hasOneMemOperand())
2176     return false;
2177 
2178   const MachineMemOperand *MMO = *MI.memoperands_begin();
2179   const Value *Ptr = MMO->getValue();
2180 
2181   // UndefValue means this is a load of a kernel input.  These are uniform.
2182   // Sometimes LDS instructions have constant pointers.
2183   // If Ptr is null, then that means this mem operand contains a
2184   // PseudoSourceValue like GOT.
2185   if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
2186       isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
2187     return true;
2188 
2189   if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
2190     return true;
2191 
2192   const Instruction *I = dyn_cast<Instruction>(Ptr);
2193   return I && I->getMetadata("amdgpu.uniform");
2194 }
2195 
2196 bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
2197   for (const GEPInfo &GEPInfo : AddrInfo) {
2198     if (!GEPInfo.VgprParts.empty())
2199       return true;
2200   }
2201   return false;
2202 }
2203 
2204 void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
2205   MachineBasicBlock *BB = I.getParent();
2206 
2207   const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
2208   unsigned AS = PtrTy.getAddressSpace();
2209   if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
2210       STI.ldsRequiresM0Init()) {
2211     // If DS instructions require M0 initialization, insert it before selecting.
2212     BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2213       .addImm(-1);
2214   }
2215 }
2216 
2217 bool AMDGPUInstructionSelector::selectG_LOAD_ATOMICRMW(MachineInstr &I) const {
2218   initM0(I);
2219   return selectImpl(I, *CoverageInfo);
2220 }
2221 
2222 // TODO: No rtn optimization.
2223 bool AMDGPUInstructionSelector::selectG_AMDGPU_ATOMIC_CMPXCHG(
2224   MachineInstr &MI) const {
2225   Register PtrReg = MI.getOperand(1).getReg();
2226   const LLT PtrTy = MRI->getType(PtrReg);
2227   if (PtrTy.getAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
2228       STI.useFlatForGlobal())
2229     return selectImpl(MI, *CoverageInfo);
2230 
2231   Register DstReg = MI.getOperand(0).getReg();
2232   const LLT Ty = MRI->getType(DstReg);
2233   const bool Is64 = Ty.getSizeInBits() == 64;
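  // The selected cmpswap returns the original memory value in the low half of
  // a wide result register, so use a temporary and copy out the low subregister.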
2234   const unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
2235   Register TmpReg = MRI->createVirtualRegister(
2236     Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
2237 
2238   const DebugLoc &DL = MI.getDebugLoc();
2239   MachineBasicBlock *BB = MI.getParent();
2240 
2241   Register VAddr, RSrcReg, SOffset;
2242   int64_t Offset = 0;
2243 
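  // Try the offset-only MUBUF addressing mode first, then addr64; if neither
  // matches, fall back to the generated patterns.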
2244   unsigned Opcode;
2245   if (selectMUBUFOffsetImpl(MI.getOperand(1), RSrcReg, SOffset, Offset)) {
2246     Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN :
2247                              AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN;
2248   } else if (selectMUBUFAddr64Impl(MI.getOperand(1), VAddr,
2249                                    RSrcReg, SOffset, Offset)) {
2250     Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN :
2251                     AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN;
2252   } else
2253     return selectImpl(MI, *CoverageInfo);
2254 
2255   auto MIB = BuildMI(*BB, &MI, DL, TII.get(Opcode), TmpReg)
2256     .addReg(MI.getOperand(2).getReg());
2257 
2258   if (VAddr)
2259     MIB.addReg(VAddr);
2260 
2261   MIB.addReg(RSrcReg);
2262   if (SOffset)
2263     MIB.addReg(SOffset);
2264   else
2265     MIB.addImm(0);
2266 
2267   MIB.addImm(Offset);
2268   MIB.addImm(0); // slc
2269   MIB.cloneMemRefs(MI);
2270 
2271   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), DstReg)
2272     .addReg(TmpReg, RegState::Kill, SubReg);
2273 
2274   MI.eraseFromParent();
2275 
2276   MRI->setRegClass(
2277     DstReg, Is64 ? &AMDGPU::VReg_64RegClass : &AMDGPU::VGPR_32RegClass);
2278   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2279 }
2280 
2281 bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
2282   MachineBasicBlock *BB = I.getParent();
2283   MachineOperand &CondOp = I.getOperand(0);
2284   Register CondReg = CondOp.getReg();
2285   const DebugLoc &DL = I.getDebugLoc();
2286 
2287   unsigned BrOpcode;
2288   Register CondPhysReg;
2289   const TargetRegisterClass *ConstrainRC;
2290 
2291   // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
2292   // whether the branch is uniform when selecting the instruction. In
2293   // GlobalISel, we should push that decision into RegBankSelect. Assume for now
2294   // RegBankSelect knows what it's doing if the branch condition is scc, even
2295   // though it currently does not.
2296   if (!isVCC(CondReg, *MRI)) {
2297     if (MRI->getType(CondReg) != LLT::scalar(32))
2298       return false;
2299 
2300     CondPhysReg = AMDGPU::SCC;
2301     BrOpcode = AMDGPU::S_CBRANCH_SCC1;
2302     ConstrainRC = &AMDGPU::SReg_32RegClass;
2303   } else {
2304     // FIXME: Do we have to insert an and with exec here, like in SelectionDAG?
2305     // We sort of know that a VCC producer based on the register bank, that ands
2306     // inactive lanes with 0. What if there was a logical operation with vcc
2307     // producers in different blocks/with different exec masks?
2308     // FIXME: Should scc->vcc copies and with exec?
2309     CondPhysReg = TRI.getVCC();
2310     BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
2311     ConstrainRC = TRI.getBoolRC();
2312   }
2313 
2314   if (!MRI->getRegClassOrNull(CondReg))
2315     MRI->setRegClass(CondReg, ConstrainRC);
2316 
2317   BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
2318     .addReg(CondReg);
2319   BuildMI(*BB, &I, DL, TII.get(BrOpcode))
2320     .addMBB(I.getOperand(1).getMBB());
2321 
2322   I.eraseFromParent();
2323   return true;
2324 }
2325 
2326 bool AMDGPUInstructionSelector::selectG_FRAME_INDEX_GLOBAL_VALUE(
2327   MachineInstr &I) const {
2328   Register DstReg = I.getOperand(0).getReg();
2329   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2330   const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2331   I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
2332   if (IsVGPR)
2333     I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
2334 
2335   return RBI.constrainGenericRegister(
2336     DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
2337 }
2338 
2339 bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
2340   Register DstReg = I.getOperand(0).getReg();
2341   Register SrcReg = I.getOperand(1).getReg();
2342   Register MaskReg = I.getOperand(2).getReg();
2343   LLT Ty = MRI->getType(DstReg);
2344   LLT MaskTy = MRI->getType(MaskReg);
2345 
2346   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2347   const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2348   const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
2349   const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2350   if (DstRB != SrcRB) // Should only happen for hand-written MIR.
2351     return false;
2352 
2353   unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2354   const TargetRegisterClass &RegRC
2355     = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
2356 
2357   const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB,
2358                                                                   *MRI);
2359   const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB,
2360                                                                   *MRI);
2361   const TargetRegisterClass *MaskRC =
2362       TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB, *MRI);
2363 
2364   if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
2365       !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2366       !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
2367     return false;
2368 
2369   MachineBasicBlock *BB = I.getParent();
2370   const DebugLoc &DL = I.getDebugLoc();
2371   if (Ty.getSizeInBits() == 32) {
2372     assert(MaskTy.getSizeInBits() == 32 &&
2373            "ptrmask should have been narrowed during legalize");
2374 
2375     BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
2376       .addReg(SrcReg)
2377       .addReg(MaskReg);
2378     I.eraseFromParent();
2379     return true;
2380   }
2381 
2382   Register HiReg = MRI->createVirtualRegister(&RegRC);
2383   Register LoReg = MRI->createVirtualRegister(&RegRC);
2384 
2385   // Extract the subregisters from the source pointer.
2386   BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
2387     .addReg(SrcReg, 0, AMDGPU::sub0);
2388   BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
2389     .addReg(SrcReg, 0, AMDGPU::sub1);
2390 
2391   Register MaskedLo, MaskedHi;
2392 
2393   // Try to avoid emitting a bit operation when we only need to touch half of
2394   // the 64-bit pointer.
2395   APInt MaskOnes = KnownBits->getKnownOnes(MaskReg).zextOrSelf(64);
2396 
2397   const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
2398   const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
2399   if ((MaskOnes & MaskLo32) == MaskLo32) {
2400     // If all the bits in the low half are 1, we only need a copy for it.
2401     MaskedLo = LoReg;
2402   } else {
2403     // Extract the mask subregister and apply the and.
2404     Register MaskLo = MRI->createVirtualRegister(&RegRC);
2405     MaskedLo = MRI->createVirtualRegister(&RegRC);
2406 
2407     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
2408       .addReg(MaskReg, 0, AMDGPU::sub0);
2409     BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
2410       .addReg(LoReg)
2411       .addReg(MaskLo);
2412   }
2413 
2414   if ((MaskOnes & MaskHi32) == MaskHi32) {
2415     // If all the bits in the high half are 1, we only need a copy for it.
2416     MaskedHi = HiReg;
2417   } else {
2418     Register MaskHi = MRI->createVirtualRegister(&RegRC);
2419     MaskedHi = MRI->createVirtualRegister(&RegRC);
2420 
2421     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
2422       .addReg(MaskReg, 0, AMDGPU::sub1);
2423     BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
2424       .addReg(HiReg)
2425       .addReg(MaskHi);
2426   }
2427 
2428   BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2429     .addReg(MaskedLo)
2430     .addImm(AMDGPU::sub0)
2431     .addReg(MaskedHi)
2432     .addImm(AMDGPU::sub1);
2433   I.eraseFromParent();
2434   return true;
2435 }
2436 
2437 /// Return the register to use for the index value, and the subregister to use
2438 /// for the indirectly accessed register.
2439 static std::pair<Register, unsigned>
2440 computeIndirectRegIndex(MachineRegisterInfo &MRI,
2441                         const SIRegisterInfo &TRI,
2442                         const TargetRegisterClass *SuperRC,
2443                         Register IdxReg,
2444                         unsigned EltSize) {
2445   Register IdxBaseReg;
2446   int Offset;
2447   MachineInstr *Unused;
2448 
2449   std::tie(IdxBaseReg, Offset, Unused)
2450     = AMDGPU::getBaseWithConstantOffset(MRI, IdxReg);
2451   if (IdxBaseReg == AMDGPU::NoRegister) {
2452     // This will happen if the index is a known constant. This should ordinarily
2453     // be legalized out, but handle it as a register just in case.
2454     assert(Offset == 0);
2455     IdxBaseReg = IdxReg;
2456   }
2457 
2458   ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
2459 
2460   // Skip out of bounds offsets, or else we would end up using an undefined
2461   // register.
2462   if (static_cast<unsigned>(Offset) >= SubRegs.size())
2463     return std::make_pair(IdxReg, SubRegs[0]);
2464   return std::make_pair(IdxBaseReg, SubRegs[Offset]);
2465 }
2466 
2467 bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
2468   MachineInstr &MI) const {
2469   Register DstReg = MI.getOperand(0).getReg();
2470   Register SrcReg = MI.getOperand(1).getReg();
2471   Register IdxReg = MI.getOperand(2).getReg();
2472 
2473   LLT DstTy = MRI->getType(DstReg);
2474   LLT SrcTy = MRI->getType(SrcReg);
2475 
2476   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2477   const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2478   const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
2479 
2480   // The index must be scalar. If it wasn't, RegBankSelect should have moved
2481   // this into a waterfall loop.
2482   if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
2483     return false;
2484 
2485   const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB,
2486                                                                   *MRI);
2487   const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(DstTy, *DstRB,
2488                                                                   *MRI);
2489   if (!SrcRC || !DstRC)
2490     return false;
2491   if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2492       !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
2493       !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
2494     return false;
2495 
2496   MachineBasicBlock *BB = MI.getParent();
2497   const DebugLoc &DL = MI.getDebugLoc();
2498   const bool Is64 = DstTy.getSizeInBits() == 64;
2499 
2500   unsigned SubReg;
2501   std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, SrcRC, IdxReg,
2502                                                      DstTy.getSizeInBits() / 8);
2503 
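  // For an SGPR source vector, use S_MOVRELS with M0 as the index; for a VGPR
  // source, use V_MOVRELS via M0, or VGPR index mode when available.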
2504   if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
2505     if (DstTy.getSizeInBits() != 32 && !Is64)
2506       return false;
2507 
2508     BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2509       .addReg(IdxReg);
2510 
2511     unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
2512     BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
2513       .addReg(SrcReg, 0, SubReg)
2514       .addReg(SrcReg, RegState::Implicit);
2515     MI.eraseFromParent();
2516     return true;
2517   }
2518 
2519   if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
2520     return false;
2521 
2522   if (!STI.useVGPRIndexMode()) {
2523     BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2524       .addReg(IdxReg);
2525     BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
2526       .addReg(SrcReg, 0, SubReg)
2527       .addReg(SrcReg, RegState::Implicit);
2528     MI.eraseFromParent();
2529     return true;
2530   }
2531 
2532   BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_ON))
2533     .addReg(IdxReg)
2534     .addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE);
2535   BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), DstReg)
2536     .addReg(SrcReg, 0, SubReg)
2537     .addReg(SrcReg, RegState::Implicit)
2538     .addReg(AMDGPU::M0, RegState::Implicit);
2539   BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_OFF));
2540 
2541   MI.eraseFromParent();
2542   return true;
2543 }
2544 
2545 // TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
2546 bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
2547   MachineInstr &MI) const {
2548   Register DstReg = MI.getOperand(0).getReg();
2549   Register VecReg = MI.getOperand(1).getReg();
2550   Register ValReg = MI.getOperand(2).getReg();
2551   Register IdxReg = MI.getOperand(3).getReg();
2552 
2553   LLT VecTy = MRI->getType(DstReg);
2554   LLT ValTy = MRI->getType(ValReg);
2555   unsigned VecSize = VecTy.getSizeInBits();
2556   unsigned ValSize = ValTy.getSizeInBits();
2557 
2558   const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
2559   const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
2560   const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
2561 
2562   assert(VecTy.getElementType() == ValTy);
2563 
2564   // The index must be scalar. If it wasn't, RegBankSelect should have moved
2565   // this into a waterfall loop.
2566   if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
2567     return false;
2568 
2569   const TargetRegisterClass *VecRC = TRI.getRegClassForTypeOnBank(VecTy, *VecRB,
2570                                                                   *MRI);
2571   const TargetRegisterClass *ValRC = TRI.getRegClassForTypeOnBank(ValTy, *ValRB,
2572                                                                   *MRI);
2573 
2574   if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
2575       !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
2576       !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
2577       !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
2578     return false;
2579 
2580   if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
2581     return false;
2582 
2583   unsigned SubReg;
2584   std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg,
2585                                                      ValSize / 8);
2586 
2587   const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
2588                          STI.useVGPRIndexMode();
2589 
2590   MachineBasicBlock *BB = MI.getParent();
2591   const DebugLoc &DL = MI.getDebugLoc();
2592 
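  // Route the index either through VGPR index mode or M0, then emit the
  // indirect register write pseudo.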
2593   if (IndexMode) {
2594     BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_ON))
2595       .addReg(IdxReg)
2596       .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE);
2597   } else {
2598     BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2599       .addReg(IdxReg);
2600   }
2601 
2602   const MCInstrDesc &RegWriteOp
2603     = TII.getIndirectRegWritePseudo(VecSize, ValSize,
2604                                     VecRB->getID() == AMDGPU::SGPRRegBankID);
2605   BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
2606     .addReg(VecReg)
2607     .addReg(ValReg)
2608     .addImm(SubReg);
2609 
2610   if (IndexMode)
2611     BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_OFF));
2612 
2613   MI.eraseFromParent();
2614   return true;
2615 }
2616 
2617 static bool isZeroOrUndef(int X) {
2618   return X == 0 || X == -1;
2619 }
2620 
2621 static bool isOneOrUndef(int X) {
2622   return X == 1 || X == -1;
2623 }
2624 
2625 static bool isZeroOrOneOrUndef(int X) {
2626   return X == 0 || X == 1 || X == -1;
2627 }
2628 
2629 // Normalize a VOP3P shuffle mask to refer to the low/high half of a single
2630 // 32-bit register.
2631 static Register normalizeVOP3PMask(int NewMask[2], Register Src0, Register Src1,
2632                                    ArrayRef<int> Mask) {
2633   NewMask[0] = Mask[0];
2634   NewMask[1] = Mask[1];
2635   if (isZeroOrOneOrUndef(Mask[0]) && isZeroOrOneOrUndef(Mask[1]))
2636     return Src0;
2637 
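  // The mask only reads Src1's elements (2/3); remap them to 0/1 relative to
  // Src1, e.g. <2, 3> becomes <0, 1> reading Src1.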
2638   assert(NewMask[0] == 2 || NewMask[0] == 3 || NewMask[0] == -1);
2639   assert(NewMask[1] == 2 || NewMask[1] == 3 || NewMask[1] == -1);
2640 
2641   // Remap the mask inputs from 2/3 to 0/1.
2642   NewMask[0] = NewMask[0] == -1 ? -1 : NewMask[0] - 2;
2643   NewMask[1] = NewMask[1] == -1 ? -1 : NewMask[1] - 2;
2644   return Src1;
2645 }
2646 
2647 // This is only legal with VOP3P instructions as an aid to op_sel matching.
2648 bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR(
2649   MachineInstr &MI) const {
2650   Register DstReg = MI.getOperand(0).getReg();
2651   Register Src0Reg = MI.getOperand(1).getReg();
2652   Register Src1Reg = MI.getOperand(2).getReg();
2653   ArrayRef<int> ShufMask = MI.getOperand(3).getShuffleMask();
2654 
2655   const LLT V2S16 = LLT::vector(2, 16);
2656   if (MRI->getType(DstReg) != V2S16 || MRI->getType(Src0Reg) != V2S16)
2657     return false;
2658 
2659   if (!AMDGPU::isLegalVOP3PShuffleMask(ShufMask))
2660     return false;
2661 
2662   assert(ShufMask.size() == 2);
2663   assert(STI.hasSDWA() && "no target has VOP3P but not SDWA");
2664 
2665   MachineBasicBlock *MBB = MI.getParent();
2666   const DebugLoc &DL = MI.getDebugLoc();
2667 
2668   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2669   const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2670   const TargetRegisterClass &RC = IsVALU ?
2671     AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
2672 
2673   // Handle the degenerate case which should have been folded out.
2674   if (ShufMask[0] == -1 && ShufMask[1] == -1) {
2675     BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), DstReg);
2676 
2677     MI.eraseFromParent();
2678     return RBI.constrainGenericRegister(DstReg, RC, *MRI);
2679   }
2680 
2681   // A legal VOP3P mask only reads one of the sources.
2682   int Mask[2];
2683   Register SrcVec = normalizeVOP3PMask(Mask, Src0Reg, Src1Reg, ShufMask);
2684 
2685   if (!RBI.constrainGenericRegister(DstReg, RC, *MRI) ||
2686       !RBI.constrainGenericRegister(SrcVec, RC, *MRI))
2687     return false;
2688 
2689   // TODO: This also should have been folded out
2690   if (isZeroOrUndef(Mask[0]) && isOneOrUndef(Mask[1])) {
2691     BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), DstReg)
2692       .addReg(SrcVec);
2693 
2694     MI.eraseFromParent();
2695     return true;
2696   }
2697 
2698   if (Mask[0] == 1 && Mask[1] == -1) {
2699     if (IsVALU) {
2700       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
2701         .addImm(16)
2702         .addReg(SrcVec);
2703     } else {
2704       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
2705         .addReg(SrcVec)
2706         .addImm(16);
2707     }
2708   } else if (Mask[0] == -1 && Mask[1] == 0) {
2709     if (IsVALU) {
2710       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), DstReg)
2711         .addImm(16)
2712         .addReg(SrcVec);
2713     } else {
2714       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHL_B32), DstReg)
2715         .addReg(SrcVec)
2716         .addImm(16);
2717     }
2718   } else if (Mask[0] == 0 && Mask[1] == 0) {
2719     if (IsVALU) {
2720       // Write low half of the register into the high half.
2721       MachineInstr *MovSDWA =
2722         BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2723         .addImm(0)                             // $src0_modifiers
2724         .addReg(SrcVec)                        // $src0
2725         .addImm(0)                             // $clamp
2726         .addImm(AMDGPU::SDWA::WORD_1)          // $dst_sel
2727         .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2728         .addImm(AMDGPU::SDWA::WORD_0)          // $src0_sel
2729         .addReg(SrcVec, RegState::Implicit);
2730       MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2731     } else {
2732       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg)
2733         .addReg(SrcVec)
2734         .addReg(SrcVec);
2735     }
2736   } else if (Mask[0] == 1 && Mask[1] == 1) {
2737     if (IsVALU) {
2738       // Write high half of the register into the low half.
2739       MachineInstr *MovSDWA =
2740         BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2741         .addImm(0)                             // $src0_modifiers
2742         .addReg(SrcVec)                        // $src0
2743         .addImm(0)                             // $clamp
2744         .addImm(AMDGPU::SDWA::WORD_0)          // $dst_sel
2745         .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2746         .addImm(AMDGPU::SDWA::WORD_1)          // $src0_sel
2747         .addReg(SrcVec, RegState::Implicit);
2748       MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2749     } else {
2750       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_HH_B32_B16), DstReg)
2751         .addReg(SrcVec)
2752         .addReg(SrcVec);
2753     }
2754   } else if (Mask[0] == 1 && Mask[1] == 0) {
2755     if (IsVALU) {
2756       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_ALIGNBIT_B32), DstReg)
2757         .addReg(SrcVec)
2758         .addReg(SrcVec)
2759         .addImm(16);
2760     } else {
2761       Register TmpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2762       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg)
2763         .addReg(SrcVec)
2764         .addImm(16);
2765       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg)
2766         .addReg(TmpReg)
2767         .addReg(SrcVec);
2768     }
2769   } else
2770     llvm_unreachable("all shuffle masks should be handled");
2771 
2772   MI.eraseFromParent();
2773   return true;
2774 }
2775 
2776 bool AMDGPUInstructionSelector::select(MachineInstr &I) {
2777   if (I.isPHI())
2778     return selectPHI(I);
2779 
2780   if (!I.isPreISelOpcode()) {
2781     if (I.isCopy())
2782       return selectCOPY(I);
2783     return true;
2784   }
2785 
2786   switch (I.getOpcode()) {
2787   case TargetOpcode::G_AND:
2788   case TargetOpcode::G_OR:
2789   case TargetOpcode::G_XOR:
2790     if (selectImpl(I, *CoverageInfo))
2791       return true;
2792     return selectG_AND_OR_XOR(I);
2793   case TargetOpcode::G_ADD:
2794   case TargetOpcode::G_SUB:
2795     if (selectImpl(I, *CoverageInfo))
2796       return true;
2797     return selectG_ADD_SUB(I);
2798   case TargetOpcode::G_UADDO:
2799   case TargetOpcode::G_USUBO:
2800   case TargetOpcode::G_UADDE:
2801   case TargetOpcode::G_USUBE:
2802     return selectG_UADDO_USUBO_UADDE_USUBE(I);
2803   case TargetOpcode::G_INTTOPTR:
2804   case TargetOpcode::G_BITCAST:
2805   case TargetOpcode::G_PTRTOINT:
2806     return selectCOPY(I);
2807   case TargetOpcode::G_CONSTANT:
2808   case TargetOpcode::G_FCONSTANT:
2809     return selectG_CONSTANT(I);
2810   case TargetOpcode::G_FNEG:
2811     if (selectImpl(I, *CoverageInfo))
2812       return true;
2813     return selectG_FNEG(I);
2814   case TargetOpcode::G_FABS:
2815     if (selectImpl(I, *CoverageInfo))
2816       return true;
2817     return selectG_FABS(I);
2818   case TargetOpcode::G_EXTRACT:
2819     return selectG_EXTRACT(I);
2820   case TargetOpcode::G_MERGE_VALUES:
2821   case TargetOpcode::G_BUILD_VECTOR:
2822   case TargetOpcode::G_CONCAT_VECTORS:
2823     return selectG_MERGE_VALUES(I);
2824   case TargetOpcode::G_UNMERGE_VALUES:
2825     return selectG_UNMERGE_VALUES(I);
2826   case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2827     return selectG_BUILD_VECTOR_TRUNC(I);
2828   case TargetOpcode::G_PTR_ADD:
2829     return selectG_PTR_ADD(I);
2830   case TargetOpcode::G_IMPLICIT_DEF:
2831     return selectG_IMPLICIT_DEF(I);
2832   case TargetOpcode::G_FREEZE:
2833     return selectCOPY(I);
2834   case TargetOpcode::G_INSERT:
2835     return selectG_INSERT(I);
2836   case TargetOpcode::G_INTRINSIC:
2837     return selectG_INTRINSIC(I);
2838   case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
2839     return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
2840   case TargetOpcode::G_ICMP:
2841     if (selectG_ICMP(I))
2842       return true;
2843     return selectImpl(I, *CoverageInfo);
2844   case TargetOpcode::G_LOAD:
2845   case TargetOpcode::G_ATOMIC_CMPXCHG:
2846   case TargetOpcode::G_ATOMICRMW_XCHG:
2847   case TargetOpcode::G_ATOMICRMW_ADD:
2848   case TargetOpcode::G_ATOMICRMW_SUB:
2849   case TargetOpcode::G_ATOMICRMW_AND:
2850   case TargetOpcode::G_ATOMICRMW_OR:
2851   case TargetOpcode::G_ATOMICRMW_XOR:
2852   case TargetOpcode::G_ATOMICRMW_MIN:
2853   case TargetOpcode::G_ATOMICRMW_MAX:
2854   case TargetOpcode::G_ATOMICRMW_UMIN:
2855   case TargetOpcode::G_ATOMICRMW_UMAX:
2856   case TargetOpcode::G_ATOMICRMW_FADD:
2857   case AMDGPU::G_AMDGPU_ATOMIC_INC:
2858   case AMDGPU::G_AMDGPU_ATOMIC_DEC:
2859     return selectG_LOAD_ATOMICRMW(I);
2860   case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
2861     return selectG_AMDGPU_ATOMIC_CMPXCHG(I);
2862   case TargetOpcode::G_SELECT:
2863     return selectG_SELECT(I);
2864   case TargetOpcode::G_STORE:
2865     return selectG_STORE(I);
2866   case TargetOpcode::G_TRUNC:
2867     return selectG_TRUNC(I);
2868   case TargetOpcode::G_SEXT:
2869   case TargetOpcode::G_ZEXT:
2870   case TargetOpcode::G_ANYEXT:
2871   case TargetOpcode::G_SEXT_INREG:
2872     if (selectImpl(I, *CoverageInfo))
2873       return true;
2874     return selectG_SZA_EXT(I);
2875   case TargetOpcode::G_BRCOND:
2876     return selectG_BRCOND(I);
2877   case TargetOpcode::G_FRAME_INDEX:
2878   case TargetOpcode::G_GLOBAL_VALUE:
2879     return selectG_FRAME_INDEX_GLOBAL_VALUE(I);
2880   case TargetOpcode::G_PTRMASK:
2881     return selectG_PTRMASK(I);
2882   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2883     return selectG_EXTRACT_VECTOR_ELT(I);
2884   case TargetOpcode::G_INSERT_VECTOR_ELT:
2885     return selectG_INSERT_VECTOR_ELT(I);
2886   case TargetOpcode::G_SHUFFLE_VECTOR:
2887     return selectG_SHUFFLE_VECTOR(I);
2888   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
2889   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
2890     const AMDGPU::ImageDimIntrinsicInfo *Intr
2891       = AMDGPU::getImageDimIntrinsicInfo(I.getIntrinsicID());
2892     assert(Intr && "not an image intrinsic with image pseudo");
2893     return selectImageIntrinsic(I, Intr);
2894   }
2895   default:
2896     return selectImpl(I, *CoverageInfo);
2897   }
2898   return false;
2899 }
2900 
2901 InstructionSelector::ComplexRendererFns
2902 AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
2903   return {{
2904       [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
2905   }};
2907 }
2908 
2909 std::pair<Register, unsigned>
2910 AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root) const {
2911   Register Src = Root.getReg();
2912   Register OrigSrc = Src;
2913   unsigned Mods = 0;
2914   MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
2915 
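  // Peel off fneg/fabs on the source and fold them into the source modifiers.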
2916   if (MI && MI->getOpcode() == AMDGPU::G_FNEG) {
2917     Src = MI->getOperand(1).getReg();
2918     Mods |= SISrcMods::NEG;
2919     MI = getDefIgnoringCopies(Src, *MRI);
2920   }
2921 
2922   if (MI && MI->getOpcode() == AMDGPU::G_FABS) {
2923     Src = MI->getOperand(1).getReg();
2924     Mods |= SISrcMods::ABS;
2925   }
2926 
2927   if (Mods != 0 &&
2928       RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
2929     MachineInstr *UseMI = Root.getParent();
2930 
2931     // If we looked through copies to find source modifiers on an SGPR operand,
2932     // we now have an SGPR register source. To avoid potentially violating the
2933     // constant bus restriction, we need to insert a copy to a VGPR.
2934     Register VGPRSrc = MRI->cloneVirtualRegister(OrigSrc);
2935     BuildMI(*UseMI->getParent(), UseMI, UseMI->getDebugLoc(),
2936             TII.get(AMDGPU::COPY), VGPRSrc)
2937       .addReg(Src);
2938     Src = VGPRSrc;
2939   }
2940 
2941   return std::make_pair(Src, Mods);
2942 }
2943 
2945 /// This will select either an SGPR or VGPR operand and will save us from
2946 /// having to write an extra tablegen pattern.
2947 InstructionSelector::ComplexRendererFns
2948 AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
2949   return {{
2950       [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
2951   }};
2952 }
2953 
2954 InstructionSelector::ComplexRendererFns
2955 AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
2956   Register Src;
2957   unsigned Mods;
2958   std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
2959 
2960   return {{
2961       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
2962       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
2963       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },    // clamp
2964       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }     // omod
2965   }};
2966 }
2967 
2968 InstructionSelector::ComplexRendererFns
2969 AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
2970   return {{
2971       [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
2972       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
2973       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // omod
2974   }};
2975 }
2976 
2977 InstructionSelector::ComplexRendererFns
2978 AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
2979   Register Src;
2980   unsigned Mods;
2981   std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
2982 
2983   return {{
2984       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
2985       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }  // src_mods
2986   }};
2987 }
2988 
2989 InstructionSelector::ComplexRendererFns
2990 AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
2991   Register Reg = Root.getReg();
2992   const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
2993   if (Def && (Def->getOpcode() == AMDGPU::G_FNEG ||
2994               Def->getOpcode() == AMDGPU::G_FABS))
2995     return {};
2996   return {{
2997       [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
2998   }};
2999 }
3000 
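/// Fold an fneg of a <2 x s16> source into the packed neg modifiers: both NEG
/// and NEG_HI are toggled so each half of the vector is negated. Packed
/// operands have no abs modifier, and OP_SEL_1 is always set here.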
3001 std::pair<Register, unsigned>
3002 AMDGPUInstructionSelector::selectVOP3PModsImpl(
3003   Register Src, const MachineRegisterInfo &MRI) const {
3004   unsigned Mods = 0;
3005   MachineInstr *MI = MRI.getVRegDef(Src);
3006 
3007   if (MI && MI->getOpcode() == AMDGPU::G_FNEG &&
3008       // It's possible to see an f32 fneg here, but unlikely.
3009       // TODO: Treat f32 fneg as only high bit.
3010       MRI.getType(Src) == LLT::vector(2, 16)) {
3011     Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
3012     Src = MI->getOperand(1).getReg();
3013     MI = MRI.getVRegDef(Src);
3014   }
3015 
3016   // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
3017 
3018   // Packed instructions do not have abs modifiers.
3019   Mods |= SISrcMods::OP_SEL_1;
3020 
3021   return std::make_pair(Src, Mods);
3022 }
3023 
3024 InstructionSelector::ComplexRendererFns
3025 AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
3026   MachineRegisterInfo &MRI
3027     = Root.getParent()->getParent()->getParent()->getRegInfo();
3028 
3029   Register Src;
3030   unsigned Mods;
3031   std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI);
3032 
3033   return {{
3034       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3035       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }  // src_mods
3036   }};
3037 }
3038 
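/// Like selectVOP3Mods, but only matches if the source is known not to be a
/// NaN (or no-NaNs FP math is enabled).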
3039 InstructionSelector::ComplexRendererFns
3040 AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const {
3041   Register Src;
3042   unsigned Mods;
3043   std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3044   if (!TM.Options.NoNaNsFPMath && !isKnownNeverNaN(Src, *MRI))
3045     return None;
3046 
3047   return {{
3048       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3049       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }  // src_mods
3050   }};
3051 }
3052 
3053 InstructionSelector::ComplexRendererFns
3054 AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
3055   // FIXME: Handle op_sel
3056   return {{
3057       [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
3058       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // src_mods
3059   }};
3060 }
3061 
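/// Match an SMRD load whose address is an SGPR base plus a constant offset
/// that fits in the subtarget's encoded immediate offset field.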
3062 InstructionSelector::ComplexRendererFns
3063 AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
3064   SmallVector<GEPInfo, 4> AddrInfo;
3065   getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
3066 
3067   if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3068     return None;
3069 
3070   const GEPInfo &GEPInfo = AddrInfo[0];
3071   Optional<int64_t> EncodedImm =
3072       AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm, false);
3073   if (!EncodedImm)
3074     return None;
3075 
3076   unsigned PtrReg = GEPInfo.SgprParts[0];
3077   return {{
3078     [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3079     [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
3080   }};
3081 }
3082 
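/// Same as selectSmrdImm, but using the 32-bit literal offset encoding.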
3083 InstructionSelector::ComplexRendererFns
3084 AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
3085   SmallVector<GEPInfo, 4> AddrInfo;
3086   getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
3087 
3088   if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3089     return None;
3090 
3091   const GEPInfo &GEPInfo = AddrInfo[0];
3092   Register PtrReg = GEPInfo.SgprParts[0];
3093   Optional<int64_t> EncodedImm =
3094       AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
3095   if (!EncodedImm)
3096     return None;
3097 
3098   return {{
3099     [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3100     [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
3101   }};
3102 }
3103 
3104 InstructionSelector::ComplexRendererFns
3105 AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
3106   MachineInstr *MI = Root.getParent();
3107   MachineBasicBlock *MBB = MI->getParent();
3108 
3109   SmallVector<GEPInfo, 4> AddrInfo;
3110   getAddrModeInfo(*MI, *MRI, AddrInfo);
3111 
3112   // FIXME: We should shrink the GEP if the offset is known to fit in 32 bits;
3113   // then we can select all ptr + 32-bit offsets, not just immediate offsets.
3114   if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3115     return None;
3116 
3117   const GEPInfo &GEPInfo = AddrInfo[0];
3118   // SGPR offset is unsigned.
3119   if (!GEPInfo.Imm || GEPInfo.Imm < 0 || !isUInt<32>(GEPInfo.Imm))
3120     return None;
3121 
3122   // If we make it this far we have a load with a 32-bit immediate offset.
3123   // It is OK to select this using an SGPR offset, because we have already
3124   // failed trying to select this load into one of the _IMM variants since
3125   // the _IMM patterns are considered before the _SGPR patterns.
3126   Register PtrReg = GEPInfo.SgprParts[0];
3127   Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3128   BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg)
3129           .addImm(GEPInfo.Imm);
3130   return {{
3131     [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3132     [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); }
3133   }};
3134 }
3135 
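/// Try to fold a constant offset from a G_PTR_ADD into the immediate offset
/// field of a FLAT instruction. Falls back to the unmodified address with a
/// zero offset if the subtarget has no FLAT offsets or the offset is not
/// legal for the address space.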
3136 template <bool Signed>
3137 InstructionSelector::ComplexRendererFns
3138 AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const {
3139   MachineInstr *MI = Root.getParent();
3140 
3141   InstructionSelector::ComplexRendererFns Default = {{
3142       [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
3143       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },  // offset
3144       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // slc
3145     }};
3146 
3147   if (!STI.hasFlatInstOffsets())
3148     return Default;
3149 
3150   const MachineInstr *OpDef = MRI->getVRegDef(Root.getReg());
3151   if (!OpDef || OpDef->getOpcode() != AMDGPU::G_PTR_ADD)
3152     return Default;
3153 
3154   Optional<int64_t> Offset =
3155     getConstantVRegVal(OpDef->getOperand(2).getReg(), *MRI);
3156   if (!Offset.hasValue())
3157     return Default;
3158 
3159   unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
3160   if (!TII.isLegalFLATOffset(Offset.getValue(), AddrSpace, Signed))
3161     return Default;
3162 
3163   Register BasePtr = OpDef->getOperand(1).getReg();
3164 
3165   return {{
3166       [=](MachineInstrBuilder &MIB) { MIB.addReg(BasePtr); },
3167       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset.getValue()); },
3168       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // slc
3169     }};
3170 }
3171 
3172 InstructionSelector::ComplexRendererFns
3173 AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
3174   return selectFlatOffsetImpl<false>(Root);
3175 }
3176 
3177 InstructionSelector::ComplexRendererFns
3178 AMDGPUInstructionSelector::selectFlatOffsetSigned(MachineOperand &Root) const {
3179   return selectFlatOffsetImpl<true>(Root);
3180 }
3181 
3182 static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) {
3183   auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>();
3184   return PSV && PSV->isStack();
3185 }
3186 
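/// Select the MUBUF "offen" scratch addressing form: rsrc, vaddr, soffset
/// and a 12-bit immediate offset. A pure constant address is split into a
/// 4096-aligned base materialized in a VGPR plus the low 12 bits as the
/// immediate offset.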
3187 InstructionSelector::ComplexRendererFns
3188 AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
3189   MachineInstr *MI = Root.getParent();
3190   MachineBasicBlock *MBB = MI->getParent();
3191   MachineFunction *MF = MBB->getParent();
3192   const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3193 
3194   int64_t Offset = 0;
3195   if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
3196       Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) {
3197     Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3198 
3199     // TODO: Should this be inside the render function? The iterator seems to
3200     // move.
3201     BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
3202             HighBits)
3203       .addImm(Offset & ~4095);
3204 
3205     return {{[=](MachineInstrBuilder &MIB) { // rsrc
3206                MIB.addReg(Info->getScratchRSrcReg());
3207              },
3208              [=](MachineInstrBuilder &MIB) { // vaddr
3209                MIB.addReg(HighBits);
3210              },
3211              [=](MachineInstrBuilder &MIB) { // soffset
3212                const MachineMemOperand *MMO = *MI->memoperands_begin();
3213                const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();
3214 
3215                if (isStackPtrRelative(PtrInfo))
3216                  MIB.addReg(Info->getStackPtrOffsetReg());
3217                else
3218                  MIB.addImm(0);
3219              },
3220              [=](MachineInstrBuilder &MIB) { // offset
3221                MIB.addImm(Offset & 4095);
3222              }}};
3223   }
3224 
3225   assert(Offset == 0 || Offset == -1);
3226 
3227   // Try to fold a frame index directly into the MUBUF vaddr field, and any
3228   // offsets.
3229   Optional<int> FI;
3230   Register VAddr = Root.getReg();
3231   if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) {
3232     if (isBaseWithConstantOffset(Root, *MRI)) {
3233       const MachineOperand &LHS = RootDef->getOperand(1);
3234       const MachineOperand &RHS = RootDef->getOperand(2);
3235       const MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg());
3236       const MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg());
3237       if (LHSDef && RHSDef) {
3238         int64_t PossibleOffset =
3239             RHSDef->getOperand(1).getCImm()->getSExtValue();
3240         if (SIInstrInfo::isLegalMUBUFImmOffset(PossibleOffset) &&
3241             (!STI.privateMemoryResourceIsRangeChecked() ||
3242              KnownBits->signBitIsZero(LHS.getReg()))) {
3243           if (LHSDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
3244             FI = LHSDef->getOperand(1).getIndex();
3245           else
3246             VAddr = LHS.getReg();
3247           Offset = PossibleOffset;
3248         }
3249       }
3250     } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
3251       FI = RootDef->getOperand(1).getIndex();
3252     }
3253   }
3254 
3255   return {{[=](MachineInstrBuilder &MIB) { // rsrc
3256              MIB.addReg(Info->getScratchRSrcReg());
3257            },
3258            [=](MachineInstrBuilder &MIB) { // vaddr
3259              if (FI.hasValue())
3260                MIB.addFrameIndex(FI.getValue());
3261              else
3262                MIB.addReg(VAddr);
3263            },
3264            [=](MachineInstrBuilder &MIB) { // soffset
3265              // If we don't know this private access is a local stack object, it
3266              // needs to be relative to the entry point's scratch wave offset.
3267              // TODO: Should split large offsets that don't fit like above.
3268              // TODO: Don't use scratch wave offset just because the offset
3269              // didn't fit.
3270              if (!Info->isEntryFunction() && FI.hasValue())
3271                MIB.addReg(Info->getStackPtrOffsetReg());
3272              else
3273                MIB.addImm(0);
3274            },
3275            [=](MachineInstrBuilder &MIB) { // offset
3276              MIB.addImm(Offset);
3277            }}};
3278 }
3279 
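/// Check whether \p Offset fits in the unsigned DS offset field, which is
/// \p OffsetBits (16 or 8) wide. Unless the subtarget has a usable DS offset
/// or unsafe offset folding is enabled, the base must also be known to be
/// non-negative.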
3280 bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
3281                                                 int64_t Offset,
3282                                                 unsigned OffsetBits) const {
3283   if ((OffsetBits == 16 && !isUInt<16>(Offset)) ||
3284       (OffsetBits == 8 && !isUInt<8>(Offset)))
3285     return false;
3286 
3287   if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
3288     return true;
3289 
3290   // On Southern Islands, instructions with a negative base value and an
3291   // offset don't seem to work.
3292   return KnownBits->signBitIsZero(Base);
3293 }
3294 
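/// Select the MUBUF scratch form with no vaddr: the entire address must be a
/// constant that is a legal MUBUF immediate offset.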
3295 InstructionSelector::ComplexRendererFns
3296 AMDGPUInstructionSelector::selectMUBUFScratchOffset(
3297     MachineOperand &Root) const {
3298   MachineInstr *MI = Root.getParent();
3299   MachineBasicBlock *MBB = MI->getParent();
3300 
3301   int64_t Offset = 0;
3302   if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
3303       !SIInstrInfo::isLegalMUBUFImmOffset(Offset))
3304     return {};
3305 
3306   const MachineFunction *MF = MBB->getParent();
3307   const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3308   const MachineMemOperand *MMO = *MI->memoperands_begin();
3309   const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();
3310 
3311   return {{
3312       [=](MachineInstrBuilder &MIB) { // rsrc
3313         MIB.addReg(Info->getScratchRSrcReg());
3314       },
3315       [=](MachineInstrBuilder &MIB) { // soffset
3316         if (isStackPtrRelative(PtrInfo))
3317           MIB.addReg(Info->getStackPtrOffsetReg());
3318         else
3319           MIB.addImm(0);
3320       },
3321       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
3322   }};
3323 }
3324 
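/// Match a DS address of the form (ptr_add base, const) where the constant
/// fits in the 16-bit DS offset field; otherwise return the address as-is
/// with a zero offset.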
3325 std::pair<Register, unsigned>
3326 AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
3327   const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
3328   if (!RootDef)
3329     return std::make_pair(Root.getReg(), 0);
3330 
3331   int64_t ConstAddr = 0;
3332 
3333   Register PtrBase;
3334   int64_t Offset;
3335   std::tie(PtrBase, Offset) =
3336     getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
3337 
3338   if (Offset) {
3339     if (isDSOffsetLegal(PtrBase, Offset, 16)) {
3340       // (add n0, c0)
3341       return std::make_pair(PtrBase, Offset);
3342     }
3343   } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
3344     // TODO
3345 
3347   } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
3348     // TODO
3349 
3350   }
3351 
3352   return std::make_pair(Root.getReg(), 0);
3353 }
3354 
3355 InstructionSelector::ComplexRendererFns
3356 AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
3357   Register Reg;
3358   unsigned Offset;
3359   std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
3360   return {{
3361       [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3362       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
3363     }};
3364 }
3365 
3366 InstructionSelector::ComplexRendererFns
3367 AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
3368   Register Reg;
3369   unsigned Offset;
3370   std::tie(Reg, Offset) = selectDS64Bit4ByteAlignedImpl(Root);
3371   return {{
3372       [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3373       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
3374       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
3375     }};
3376 }
3377 
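/// Match an address for 64-bit DS accesses that use two dword slots (e.g.
/// ds_read2_b32 / ds_write2_b32). The byte offset is converted into two
/// consecutive dword offsets that must fit in the 8-bit offset fields.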
3378 std::pair<Register, unsigned>
3379 AMDGPUInstructionSelector::selectDS64Bit4ByteAlignedImpl(MachineOperand &Root) const {
3380   const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
3381   if (!RootDef)
3382     return std::make_pair(Root.getReg(), 0);
3383 
3384   int64_t ConstAddr = 0;
3385 
3386   Register PtrBase;
3387   int64_t Offset;
3388   std::tie(PtrBase, Offset) =
3389     getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
3390 
3391   if (Offset) {
3392     int64_t DWordOffset0 = Offset / 4;
3393     int64_t DWordOffset1 = DWordOffset0 + 1;
3394     if (isDSOffsetLegal(PtrBase, DWordOffset1, 8)) {
3395       // (add n0, c0)
3396       return std::make_pair(PtrBase, DWordOffset0);
3397     }
3398   } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
3399     // TODO
3400 
3401   } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
3402     // TODO
3403 
3404   }
3405 
3406   return std::make_pair(Root.getReg(), 0);
3407 }
3408 
3409 /// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
3410 /// the base value with the constant offset. There may be intervening copies
3411 /// between \p Root and the identified constant. Returns \p Root, 0 if this does
3412 /// not match the pattern.
3413 std::pair<Register, int64_t>
3414 AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
3415   Register Root, const MachineRegisterInfo &MRI) const {
3416   MachineInstr *RootI = MRI.getVRegDef(Root);
3417   if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
3418     return {Root, 0};
3419 
3420   MachineOperand &RHS = RootI->getOperand(2);
3421   Optional<ValueAndVReg> MaybeOffset
3422     = getConstantVRegValWithLookThrough(RHS.getReg(), MRI, true);
3423   if (!MaybeOffset)
3424     return {Root, 0};
3425   return {RootI->getOperand(1).getReg(), MaybeOffset->Value};
3426 }
3427 
3428 static void addZeroImm(MachineInstrBuilder &MIB) {
3429   MIB.addImm(0);
3430 }
3431 
3432 /// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
3433 /// BasePtr is not valid, a null base pointer will be used.
3434 static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
3435                           uint32_t FormatLo, uint32_t FormatHi,
3436                           Register BasePtr) {
3437   Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
3438   Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
3439   Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3440   Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
3441 
3442   B.buildInstr(AMDGPU::S_MOV_B32)
3443     .addDef(RSrc2)
3444     .addImm(FormatLo);
3445   B.buildInstr(AMDGPU::S_MOV_B32)
3446     .addDef(RSrc3)
3447     .addImm(FormatHi);
3448 
3449   // Build the half of the descriptor that holds the constants before
3450   // building the full 128-bit register. If we are building multiple resource
3451   // descriptors, this will allow CSEing of the 2-component register.
3452   B.buildInstr(AMDGPU::REG_SEQUENCE)
3453     .addDef(RSrcHi)
3454     .addReg(RSrc2)
3455     .addImm(AMDGPU::sub0)
3456     .addReg(RSrc3)
3457     .addImm(AMDGPU::sub1);
3458 
3459   Register RSrcLo = BasePtr;
3460   if (!BasePtr) {
3461     RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3462     B.buildInstr(AMDGPU::S_MOV_B64)
3463       .addDef(RSrcLo)
3464       .addImm(0);
3465   }
3466 
3467   B.buildInstr(AMDGPU::REG_SEQUENCE)
3468     .addDef(RSrc)
3469     .addReg(RSrcLo)
3470     .addImm(AMDGPU::sub0_sub1)
3471     .addReg(RSrcHi)
3472     .addImm(AMDGPU::sub2_sub3);
3473 
3474   return RSrc;
3475 }
3476 
3477 static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
3478                                 const SIInstrInfo &TII, Register BasePtr) {
3479   uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
3480 
3481   // FIXME: Why are half the "default" bits ignored based on the addressing
3482   // mode?
3483   return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
3484 }
3485 
3486 static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
3487                                const SIInstrInfo &TII, Register BasePtr) {
3488   uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
3489 
3490   // FIXME: Why are half the "default" bits ignored based on the addressing
3491   // mode?
3492   return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
3493 }
3494 
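/// Decompose a MUBUF address: peel off a 32-bit constant offset if one can be
/// folded, and if the remaining base is a ptr_add, record its two operands as
/// N2/N3. N0 holds the remaining base address.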
3495 AMDGPUInstructionSelector::MUBUFAddressData
3496 AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
3497   MUBUFAddressData Data;
3498   Data.N0 = Src;
3499 
3500   Register PtrBase;
3501   int64_t Offset;
3502 
3503   std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI);
3504   if (isUInt<32>(Offset)) {
3505     Data.N0 = PtrBase;
3506     Data.Offset = Offset;
3507   }
3508 
3509   if (MachineInstr *InputAdd
3510       = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
3511     Data.N2 = InputAdd->getOperand(1).getReg();
3512     Data.N3 = InputAdd->getOperand(2).getReg();
3513 
3514     // FIXME: Need to fix extra SGPR->VGPR copies inserted
3515     // FIXME: Don't know that this was defined by operand 0
3516     //
3517     // TODO: Remove this when we have copy folding optimizations after
3518     // RegBankSelect.
3519     Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
3520     Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
3521   }
3522 
3523   return Data;
3524 }
3525 
3526 /// Return whether the addr64 MUBUF mode should be used for the given address.
3527 bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
3528   // (ptr_add N2, N3) -> addr64, or
3529   // (ptr_add (ptr_add N2, N3), C1) -> addr64
3530   if (Addr.N2)
3531     return true;
3532 
3533   const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
3534   return N0Bank->getID() == AMDGPU::VGPRRegBankID;
3535 }
3536 
3537 /// Split an immediate offset \p ImmOffset depending on whether it fits in the
3538 /// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
3539 /// component.
3540 void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
3541   MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
3542   if (SIInstrInfo::isLegalMUBUFImmOffset(ImmOffset))
3543     return;
3544 
3545   // Illegal offset, store it in soffset.
3546   SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3547   B.buildInstr(AMDGPU::S_MOV_B32)
3548     .addDef(SOffset)
3549     .addImm(ImmOffset);
3550   ImmOffset = 0;
3551 }
3552 
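/// Select the addr64 MUBUF form: decide which component of the address goes
/// in the 64-bit vaddr and which becomes the SRD base pointer, based on the
/// register banks of the ptr_add operands.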
3553 bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
3554   MachineOperand &Root, Register &VAddr, Register &RSrcReg,
3555   Register &SOffset, int64_t &Offset) const {
3556   // FIXME: Predicates should stop this from reaching here.
3557   // The addr64 bit was removed for Volcanic Islands.
3558   if (!STI.hasAddr64() || STI.useFlatForGlobal())
3559     return false;
3560 
3561   MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
3562   if (!shouldUseAddr64(AddrData))
3563     return false;
3564 
3565   Register N0 = AddrData.N0;
3566   Register N2 = AddrData.N2;
3567   Register N3 = AddrData.N3;
3568   Offset = AddrData.Offset;
3569 
3570   // Base pointer for the SRD.
3571   Register SRDPtr;
3572 
3573   if (N2) {
3574     if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
3575       assert(N3);
3576       if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
3577         // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
3578         // addr64, and construct the default resource from a 0 address.
3579         VAddr = N0;
3580       } else {
3581         SRDPtr = N3;
3582         VAddr = N2;
3583       }
3584     } else {
3585       // N2 is not divergent.
3586       SRDPtr = N2;
3587       VAddr = N3;
3588     }
3589   } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
3590     // Use the default null pointer in the resource
3591     VAddr = N0;
3592   } else {
3593     // N0 -> offset, or
3594     // (N0 + C1) -> offset
3595     SRDPtr = N0;
3596   }
3597 
3598   MachineIRBuilder B(*Root.getParent());
3599   RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
3600   splitIllegalMUBUFOffset(B, SOffset, Offset);
3601   return true;
3602 }
3603 
3604 bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
3605   MachineOperand &Root, Register &RSrcReg, Register &SOffset,
3606   int64_t &Offset) const {
3607   MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
3608   if (shouldUseAddr64(AddrData))
3609     return false;
3610 
3611   // N0 -> offset, or
3612   // (N0 + C1) -> offset
3613   Register SRDPtr = AddrData.N0;
3614   Offset = AddrData.Offset;
3615 
3616   // TODO: Look through extensions for 32-bit soffset.
3617   MachineIRBuilder B(*Root.getParent());
3618 
3619   RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
3620   splitIllegalMUBUFOffset(B, SOffset, Offset);
3621   return true;
3622 }
3623 
3624 InstructionSelector::ComplexRendererFns
3625 AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
3626   Register VAddr;
3627   Register RSrcReg;
3628   Register SOffset;
3629   int64_t Offset = 0;
3630 
3631   if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
3632     return {};
3633 
3634   // FIXME: Use defaulted operands for trailing 0s and remove from the complex
3635   // pattern.
3636   return {{
3637       [=](MachineInstrBuilder &MIB) {  // rsrc
3638         MIB.addReg(RSrcReg);
3639       },
3640       [=](MachineInstrBuilder &MIB) { // vaddr
3641         MIB.addReg(VAddr);
3642       },
3643       [=](MachineInstrBuilder &MIB) { // soffset
3644         if (SOffset)
3645           MIB.addReg(SOffset);
3646         else
3647           MIB.addImm(0);
3648       },
3649       [=](MachineInstrBuilder &MIB) { // offset
3650         MIB.addImm(Offset);
3651       },
3652       addZeroImm, //  glc
3653       addZeroImm, //  slc
3654       addZeroImm, //  tfe
3655       addZeroImm, //  dlc
3656       addZeroImm  //  swz
3657     }};
3658 }
3659 
3660 InstructionSelector::ComplexRendererFns
3661 AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
3662   Register RSrcReg;
3663   Register SOffset;
3664   int64_t Offset = 0;
3665 
3666   if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
3667     return {};
3668 
3669   return {{
3670       [=](MachineInstrBuilder &MIB) {  // rsrc
3671         MIB.addReg(RSrcReg);
3672       },
3673       [=](MachineInstrBuilder &MIB) { // soffset
3674         if (SOffset)
3675           MIB.addReg(SOffset);
3676         else
3677           MIB.addImm(0);
3678       },
3679       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
3680       addZeroImm, //  glc
3681       addZeroImm, //  slc
3682       addZeroImm, //  tfe
3683       addZeroImm, //  dlc
3684       addZeroImm  //  swz
3685     }};
3686 }
3687 
3688 InstructionSelector::ComplexRendererFns
3689 AMDGPUInstructionSelector::selectMUBUFAddr64Atomic(MachineOperand &Root) const {
3690   Register VAddr;
3691   Register RSrcReg;
3692   Register SOffset;
3693   int64_t Offset = 0;
3694 
3695   if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
3696     return {};
3697 
3698   // FIXME: Use defaulted operands for trailing 0s and remove from the complex
3699   // pattern.
3700   return {{
3701       [=](MachineInstrBuilder &MIB) {  // rsrc
3702         MIB.addReg(RSrcReg);
3703       },
3704       [=](MachineInstrBuilder &MIB) { // vaddr
3705         MIB.addReg(VAddr);
3706       },
3707       [=](MachineInstrBuilder &MIB) { // soffset
3708         if (SOffset)
3709           MIB.addReg(SOffset);
3710         else
3711           MIB.addImm(0);
3712       },
3713       [=](MachineInstrBuilder &MIB) { // offset
3714         MIB.addImm(Offset);
3715       },
3716       addZeroImm //  slc
3717     }};
3718 }
3719 
3720 InstructionSelector::ComplexRendererFns
3721 AMDGPUInstructionSelector::selectMUBUFOffsetAtomic(MachineOperand &Root) const {
3722   Register RSrcReg;
3723   Register SOffset;
3724   int64_t Offset = 0;
3725 
3726   if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
3727     return {};
3728 
3729   return {{
3730       [=](MachineInstrBuilder &MIB) {  // rsrc
3731         MIB.addReg(RSrcReg);
3732       },
3733       [=](MachineInstrBuilder &MIB) { // soffset
3734         if (SOffset)
3735           MIB.addReg(SOffset);
3736         else
3737           MIB.addImm(0);
3738       },
3739       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
3740       addZeroImm //  slc
3741     }};
3742 }
3743 
3744 /// Get an immediate that must be 32 bits, and is treated as zero extended.
3745 static Optional<uint64_t> getConstantZext32Val(Register Reg,
3746                                                const MachineRegisterInfo &MRI) {
3747   // getConstantVRegVal sexts any values, so see if that matters.
3748   Optional<int64_t> OffsetVal = getConstantVRegVal(Reg, MRI);
3749   if (!OffsetVal || !isInt<32>(*OffsetVal))
3750     return None;
3751   return Lo_32(*OffsetVal);
3752 }
3753 
3754 InstructionSelector::ComplexRendererFns
3755 AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
3756   Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
3757   if (!OffsetVal)
3758     return {};
3759 
3760   Optional<int64_t> EncodedImm =
3761       AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
3762   if (!EncodedImm)
3763     return {};
3764 
3765   return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }  }};
3766 }
3767 
3768 InstructionSelector::ComplexRendererFns
3769 AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
3770   assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
3771 
3772   Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
3773   if (!OffsetVal)
3774     return {};
3775 
3776   Optional<int64_t> EncodedImm
3777     = AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
3778   if (!EncodedImm)
3779     return {};
3780 
3781   return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }  }};
3782 }
3783 
3784 void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
3785                                                  const MachineInstr &MI,
3786                                                  int OpIdx) const {
3787   assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
3788          "Expected G_CONSTANT");
3789   MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
3790 }
3791 
3792 void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
3793                                                 const MachineInstr &MI,
3794                                                 int OpIdx) const {
3795   assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
3796          "Expected G_CONSTANT");
3797   MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
3798 }
3799 
3800 void AMDGPUInstructionSelector::renderBitcastImm(MachineInstrBuilder &MIB,
3801                                                  const MachineInstr &MI,
3802                                                  int OpIdx) const {
3803   assert(OpIdx == -1);
3804 
3805   const MachineOperand &Op = MI.getOperand(1);
3806   if (MI.getOpcode() == TargetOpcode::G_FCONSTANT)
3807     MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
3808   else {
3809     assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
3810     MIB.addImm(Op.getCImm()->getSExtValue());
3811   }
3812 }
3813 
3814 void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
3815                                                 const MachineInstr &MI,
3816                                                 int OpIdx) const {
3817   assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
3818          "Expected G_CONSTANT");
3819   MIB.addImm(MI.getOperand(1).getCImm()->getValue().countPopulation());
3820 }
3821 
3822 /// This only really exists to satisfy DAG type checking machinery, so is a
3823 /// no-op here.
3824 void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
3825                                                 const MachineInstr &MI,
3826                                                 int OpIdx) const {
3827   MIB.addImm(MI.getOperand(OpIdx).getImm());
3828 }
3829 
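// The following renderers extract individual bits of the packed cache policy
// immediate used by the buffer intrinsics: glc is bit 0, slc bit 1, dlc bit 2
// and swz bit 3.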
3830 void AMDGPUInstructionSelector::renderExtractGLC(MachineInstrBuilder &MIB,
3831                                                  const MachineInstr &MI,
3832                                                  int OpIdx) const {
3833   assert(OpIdx >= 0 && "expected to match an immediate operand");
3834   MIB.addImm(MI.getOperand(OpIdx).getImm() & 1);
3835 }
3836 
3837 void AMDGPUInstructionSelector::renderExtractSLC(MachineInstrBuilder &MIB,
3838                                                  const MachineInstr &MI,
3839                                                  int OpIdx) const {
3840   assert(OpIdx >= 0 && "expected to match an immediate operand");
3841   MIB.addImm((MI.getOperand(OpIdx).getImm() >> 1) & 1);
3842 }
3843 
3844 void AMDGPUInstructionSelector::renderExtractDLC(MachineInstrBuilder &MIB,
3845                                                  const MachineInstr &MI,
3846                                                  int OpIdx) const {
3847   assert(OpIdx >= 0 && "expected to match an immediate operand");
3848   MIB.addImm((MI.getOperand(OpIdx).getImm() >> 2) & 1);
3849 }
3850 
3851 void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
3852                                                  const MachineInstr &MI,
3853                                                  int OpIdx) const {
3854   assert(OpIdx >= 0 && "expected to match an immediate operand");
3855   MIB.addImm((MI.getOperand(OpIdx).getImm() >> 3) & 1);
3856 }
3857 
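// Check whether an immediate can be encoded as an inline constant for the
// given operand width.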
3858 bool AMDGPUInstructionSelector::isInlineImmediate16(int64_t Imm) const {
3859   return AMDGPU::isInlinableLiteral16(Imm, STI.hasInv2PiInlineImm());
3860 }
3861 
3862 bool AMDGPUInstructionSelector::isInlineImmediate32(int64_t Imm) const {
3863   return AMDGPU::isInlinableLiteral32(Imm, STI.hasInv2PiInlineImm());
3864 }
3865 
3866 bool AMDGPUInstructionSelector::isInlineImmediate64(int64_t Imm) const {
3867   return AMDGPU::isInlinableLiteral64(Imm, STI.hasInv2PiInlineImm());
3868 }
3869 
3870 bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
3871   return TII.isInlineConstant(Imm);
3872 }
3873