1 //===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the InstructionSelector class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPUInstructionSelector.h"
15 #include "AMDGPUInstrInfo.h"
16 #include "AMDGPUGlobalISelUtils.h"
17 #include "AMDGPURegisterBankInfo.h"
18 #include "AMDGPUSubtarget.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
21 #include "SIMachineFunctionInfo.h"
22 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
23 #include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
24 #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
25 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
26 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
27 #include "llvm/CodeGen/GlobalISel/Utils.h"
28 #include "llvm/CodeGen/MachineBasicBlock.h"
29 #include "llvm/CodeGen/MachineFunction.h"
30 #include "llvm/CodeGen/MachineInstr.h"
31 #include "llvm/CodeGen/MachineInstrBuilder.h"
32 #include "llvm/CodeGen/MachineRegisterInfo.h"
33 #include "llvm/IR/Type.h"
34 #include "llvm/Support/Debug.h"
35 #include "llvm/Support/raw_ostream.h"
36 
37 #define DEBUG_TYPE "amdgpu-isel"
38 
39 using namespace llvm;
40 using namespace MIPatternMatch;
41 
42 static cl::opt<bool> AllowRiskySelect(
43   "amdgpu-global-isel-risky-select",
44   cl::desc("Allow GlobalISel to select cases that are likely to not work yet"),
45   cl::init(false),
46   cl::ReallyHidden);
47 
48 #define GET_GLOBALISEL_IMPL
49 #define AMDGPUSubtarget GCNSubtarget
50 #include "AMDGPUGenGlobalISel.inc"
51 #undef GET_GLOBALISEL_IMPL
52 #undef AMDGPUSubtarget
53 
54 AMDGPUInstructionSelector::AMDGPUInstructionSelector(
55     const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
56     const AMDGPUTargetMachine &TM)
57     : InstructionSelector(), TII(*STI.getInstrInfo()),
58       TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
59       STI(STI),
60       EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG),
61 #define GET_GLOBALISEL_PREDICATES_INIT
62 #include "AMDGPUGenGlobalISel.inc"
63 #undef GET_GLOBALISEL_PREDICATES_INIT
64 #define GET_GLOBALISEL_TEMPORARIES_INIT
65 #include "AMDGPUGenGlobalISel.inc"
66 #undef GET_GLOBALISEL_TEMPORARIES_INIT
67 {
68 }
69 
70 const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
71 
72 void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits &KB,
73                                         CodeGenCoverage &CoverageInfo) {
74   MRI = &MF.getRegInfo();
75   InstructionSelector::setupMF(MF, KB, CoverageInfo);
76 }
77 
78 bool AMDGPUInstructionSelector::isVCC(Register Reg,
79                                       const MachineRegisterInfo &MRI) const {
80   // The verifier is oblivious to s1 being a valid value for wavesize registers.
81   if (Reg.isPhysical())
82     return false;
83 
84   auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
85   const TargetRegisterClass *RC =
86       RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
87   if (RC) {
88     const LLT Ty = MRI.getType(Reg);
89     return RC->hasSuperClassEq(TRI.getBoolRC()) &&
90            Ty.isValid() && Ty.getSizeInBits() == 1;
91   }
92 
93   const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
94   return RB->getID() == AMDGPU::VCCRegBankID;
95 }
96 
97 bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
98                                                         unsigned NewOpc) const {
99   MI.setDesc(TII.get(NewOpc));
100   MI.RemoveOperand(1); // Remove intrinsic ID.
101   MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
102 
103   MachineOperand &Dst = MI.getOperand(0);
104   MachineOperand &Src = MI.getOperand(1);
105 
106   // TODO: This should be legalized to s32 if needed
107   if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
108     return false;
109 
110   const TargetRegisterClass *DstRC
111     = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
112   const TargetRegisterClass *SrcRC
113     = TRI.getConstrainedRegClassForOperand(Src, *MRI);
114   if (!DstRC || DstRC != SrcRC)
115     return false;
116 
117   return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
118          RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
119 }
120 
121 bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
122   const DebugLoc &DL = I.getDebugLoc();
123   MachineBasicBlock *BB = I.getParent();
124   I.setDesc(TII.get(TargetOpcode::COPY));
125 
126   const MachineOperand &Src = I.getOperand(1);
127   MachineOperand &Dst = I.getOperand(0);
128   Register DstReg = Dst.getReg();
129   Register SrcReg = Src.getReg();
130 
131   if (isVCC(DstReg, *MRI)) {
132     if (SrcReg == AMDGPU::SCC) {
133       const TargetRegisterClass *RC
134         = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
135       if (!RC)
136         return true;
137       return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
138     }
139 
140     if (!isVCC(SrcReg, *MRI)) {
141       // TODO: Should probably leave the copy and let copyPhysReg expand it.
142       if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
143         return false;
144 
145       const TargetRegisterClass *SrcRC
146         = TRI.getConstrainedRegClassForOperand(Src, *MRI);
147 
148       Register MaskedReg = MRI->createVirtualRegister(SrcRC);
149 
150       // We can't trust the high bits at this point, so clear them.
151 
152       // TODO: Skip masking high bits if def is known boolean.
153 
154       unsigned AndOpc = TRI.isSGPRClass(SrcRC) ?
155         AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
156       BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
157         .addImm(1)
158         .addReg(SrcReg);
159       BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
160         .addImm(0)
161         .addReg(MaskedReg);
162 
163       if (!MRI->getRegClassOrNull(SrcReg))
164         MRI->setRegClass(SrcReg, SrcRC);
165       I.eraseFromParent();
166       return true;
167     }
168 
169     const TargetRegisterClass *RC =
170       TRI.getConstrainedRegClassForOperand(Dst, *MRI);
171     if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
172       return false;
173 
174     return true;
175   }
176 
177   for (const MachineOperand &MO : I.operands()) {
178     if (Register::isPhysicalRegister(MO.getReg()))
179       continue;
180 
181     const TargetRegisterClass *RC =
182             TRI.getConstrainedRegClassForOperand(MO, *MRI);
183     if (!RC)
184       continue;
185     RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
186   }
187   return true;
188 }
189 
190 bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
191   const Register DefReg = I.getOperand(0).getReg();
192   const LLT DefTy = MRI->getType(DefReg);
193   if (DefTy == LLT::scalar(1)) {
194     if (!AllowRiskySelect) {
195       LLVM_DEBUG(dbgs() << "Skipping risky boolean phi\n");
196       return false;
197     }
198 
199     LLVM_DEBUG(dbgs() << "Selecting risky boolean phi\n");
200   }
201 
202   // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
203 
204   const RegClassOrRegBank &RegClassOrBank =
205     MRI->getRegClassOrRegBank(DefReg);
206 
207   const TargetRegisterClass *DefRC
208     = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
209   if (!DefRC) {
210     if (!DefTy.isValid()) {
211       LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
212       return false;
213     }
214 
215     const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
216     DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, *MRI);
217     if (!DefRC) {
218       LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
219       return false;
220     }
221   }
222 
223   // TODO: Verify that all registers have the same bank
224   I.setDesc(TII.get(TargetOpcode::PHI));
225   return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
226 }
227 
228 MachineOperand
229 AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
230                                            const TargetRegisterClass &SubRC,
231                                            unsigned SubIdx) const {
232 
233   MachineInstr *MI = MO.getParent();
234   MachineBasicBlock *BB = MO.getParent()->getParent();
235   Register DstReg = MRI->createVirtualRegister(&SubRC);
236 
237   if (MO.isReg()) {
238     unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
239     Register Reg = MO.getReg();
240     BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
241             .addReg(Reg, 0, ComposedSubIdx);
242 
243     return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
244                                      MO.isKill(), MO.isDead(), MO.isUndef(),
245                                      MO.isEarlyClobber(), 0, MO.isDebug(),
246                                      MO.isInternalRead());
247   }
248 
249   assert(MO.isImm());
250 
251   APInt Imm(64, MO.getImm());
252 
253   switch (SubIdx) {
254   default:
255     llvm_unreachable("do not know to split immediate with this sub index.");
256   case AMDGPU::sub0:
257     return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
258   case AMDGPU::sub1:
259     return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
260   }
261 }
262 
263 static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
264   switch (Opc) {
265   case AMDGPU::G_AND:
266     return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
267   case AMDGPU::G_OR:
268     return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
269   case AMDGPU::G_XOR:
270     return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
271   default:
272     llvm_unreachable("not a bit op");
273   }
274 }
275 
276 bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
277   Register DstReg = I.getOperand(0).getReg();
278   unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
279 
280   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
281   if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
282       DstRB->getID() != AMDGPU::VCCRegBankID)
283     return false;
284 
285   bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
286                             STI.isWave64());
287   I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));
288 
289   // Dead implicit-def of scc
290   I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
291                                          true, // isImp
292                                          false, // isKill
293                                          true)); // isDead
294   return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
295 }
296 
297 bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
298   MachineBasicBlock *BB = I.getParent();
299   MachineFunction *MF = BB->getParent();
300   Register DstReg = I.getOperand(0).getReg();
301   const DebugLoc &DL = I.getDebugLoc();
302   LLT Ty = MRI->getType(DstReg);
303   if (Ty.isVector())
304     return false;
305 
306   unsigned Size = Ty.getSizeInBits();
307   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
308   const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
309   const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;
310 
311   if (Size == 32) {
312     if (IsSALU) {
313       const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
314       MachineInstr *Add =
315         BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
316         .add(I.getOperand(1))
317         .add(I.getOperand(2));
318       I.eraseFromParent();
319       return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
320     }
321 
322     if (STI.hasAddNoCarry()) {
323       const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
324       I.setDesc(TII.get(Opc));
325       I.addOperand(*MF, MachineOperand::CreateImm(0));
326       I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
327       return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
328     }
329 
330     const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;
331 
332     Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
333     MachineInstr *Add
334       = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
335       .addDef(UnusedCarry, RegState::Dead)
336       .add(I.getOperand(1))
337       .add(I.getOperand(2))
338       .addImm(0);
339     I.eraseFromParent();
340     return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
341   }
342 
343   assert(!Sub && "illegal sub should not reach here");
344 
345   const TargetRegisterClass &RC
346     = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
347   const TargetRegisterClass &HalfRC
348     = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;
349 
350   MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
351   MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
352   MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
353   MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));
354 
355   Register DstLo = MRI->createVirtualRegister(&HalfRC);
356   Register DstHi = MRI->createVirtualRegister(&HalfRC);
357 
358   if (IsSALU) {
359     BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
360       .add(Lo1)
361       .add(Lo2);
362     BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
363       .add(Hi1)
364       .add(Hi2);
365   } else {
366     const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
367     Register CarryReg = MRI->createVirtualRegister(CarryRC);
368     BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
369       .addDef(CarryReg)
370       .add(Lo1)
371       .add(Lo2)
372       .addImm(0);
373     MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
374       .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
375       .add(Hi1)
376       .add(Hi2)
377       .addReg(CarryReg, RegState::Kill)
378       .addImm(0);
379 
380     if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
381       return false;
382   }
383 
384   BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
385     .addReg(DstLo)
386     .addImm(AMDGPU::sub0)
387     .addReg(DstHi)
388     .addImm(AMDGPU::sub1);
389 
390 
391   if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
392     return false;
393 
394   I.eraseFromParent();
395   return true;
396 }
397 
398 bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
399   MachineInstr &I) const {
400   MachineBasicBlock *BB = I.getParent();
401   MachineFunction *MF = BB->getParent();
402   const DebugLoc &DL = I.getDebugLoc();
403   Register Dst0Reg = I.getOperand(0).getReg();
404   Register Dst1Reg = I.getOperand(1).getReg();
405   const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
406                      I.getOpcode() == AMDGPU::G_UADDE;
407   const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
408                           I.getOpcode() == AMDGPU::G_USUBE;
409 
410   if (isVCC(Dst1Reg, *MRI)) {
411     unsigned NoCarryOpc =
412         IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
413     unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
414     I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
415     I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
416     I.addOperand(*MF, MachineOperand::CreateImm(0));
417     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
418   }
419 
420   Register Src0Reg = I.getOperand(2).getReg();
421   Register Src1Reg = I.getOperand(3).getReg();
422 
423   if (HasCarryIn) {
424     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
425       .addReg(I.getOperand(4).getReg());
426   }
427 
428   unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
429   unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
430 
431   BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
432     .add(I.getOperand(2))
433     .add(I.getOperand(3));
434   BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
435     .addReg(AMDGPU::SCC);
436 
437   if (!MRI->getRegClassOrNull(Dst1Reg))
438     MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
439 
440   if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
441       !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
442       !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
443     return false;
444 
445   if (HasCarryIn &&
446       !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
447                                     AMDGPU::SReg_32RegClass, *MRI))
448     return false;
449 
450   I.eraseFromParent();
451   return true;
452 }
453 
454 // TODO: We should probably legalize these to only using 32-bit results.
455 bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
456   MachineBasicBlock *BB = I.getParent();
457   Register DstReg = I.getOperand(0).getReg();
458   Register SrcReg = I.getOperand(1).getReg();
459   LLT DstTy = MRI->getType(DstReg);
460   LLT SrcTy = MRI->getType(SrcReg);
461   const unsigned SrcSize = SrcTy.getSizeInBits();
462   unsigned DstSize = DstTy.getSizeInBits();
463 
464   // TODO: Should handle any multiple of 32 offset.
465   unsigned Offset = I.getOperand(2).getImm();
466   if (Offset % 32 != 0 || DstSize > 128)
467     return false;
468 
469   // 16-bit operations really use 32-bit registers.
470   // FIXME: Probably should not allow 16-bit G_EXTRACT results.
471   if (DstSize == 16)
472     DstSize = 32;
473 
474   const TargetRegisterClass *DstRC =
475     TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
476   if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
477     return false;
478 
479   const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
480   const TargetRegisterClass *SrcRC =
481     TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
482   if (!SrcRC)
483     return false;
484   unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
485                                                          DstSize / 32);
486   SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
487   if (!SrcRC)
488     return false;
489 
490   SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
491                                     *SrcRC, I.getOperand(1));
492   const DebugLoc &DL = I.getDebugLoc();
493   BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
494     .addReg(SrcReg, 0, SubReg);
495 
496   I.eraseFromParent();
497   return true;
498 }
499 
500 bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
501   MachineBasicBlock *BB = MI.getParent();
502   Register DstReg = MI.getOperand(0).getReg();
503   LLT DstTy = MRI->getType(DstReg);
504   LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());
505 
506   const unsigned SrcSize = SrcTy.getSizeInBits();
507   if (SrcSize < 32)
508     return selectImpl(MI, *CoverageInfo);
509 
510   const DebugLoc &DL = MI.getDebugLoc();
511   const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
512   const unsigned DstSize = DstTy.getSizeInBits();
513   const TargetRegisterClass *DstRC =
514     TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
515   if (!DstRC)
516     return false;
517 
518   ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
519   MachineInstrBuilder MIB =
520     BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
521   for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
522     MachineOperand &Src = MI.getOperand(I + 1);
523     MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
524     MIB.addImm(SubRegs[I]);
525 
526     const TargetRegisterClass *SrcRC
527       = TRI.getConstrainedRegClassForOperand(Src, *MRI);
528     if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
529       return false;
530   }
531 
532   if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
533     return false;
534 
535   MI.eraseFromParent();
536   return true;
537 }
538 
539 bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
540   MachineBasicBlock *BB = MI.getParent();
541   const int NumDst = MI.getNumOperands() - 1;
542 
543   MachineOperand &Src = MI.getOperand(NumDst);
544 
545   Register SrcReg = Src.getReg();
546   Register DstReg0 = MI.getOperand(0).getReg();
547   LLT DstTy = MRI->getType(DstReg0);
548   LLT SrcTy = MRI->getType(SrcReg);
549 
550   const unsigned DstSize = DstTy.getSizeInBits();
551   const unsigned SrcSize = SrcTy.getSizeInBits();
552   const DebugLoc &DL = MI.getDebugLoc();
553   const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
554 
555   const TargetRegisterClass *SrcRC =
556     TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
557   if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
558     return false;
559 
560   // Note we could have mixed SGPR and VGPR destination banks for an SGPR
561   // source, and this relies on the fact that the same subregister indices are
562   // used for both.
563   ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
564   for (int I = 0, E = NumDst; I != E; ++I) {
565     MachineOperand &Dst = MI.getOperand(I);
566     BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
567       .addReg(SrcReg, 0, SubRegs[I]);
568 
569     // Make sure the subregister index is valid for the source register.
570     SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
571     if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
572       return false;
573 
574     const TargetRegisterClass *DstRC =
575       TRI.getConstrainedRegClassForOperand(Dst, *MRI);
576     if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
577       return false;
578   }
579 
580   MI.eraseFromParent();
581   return true;
582 }
583 
584 bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC(
585   MachineInstr &MI) const {
586   if (selectImpl(MI, *CoverageInfo))
587     return true;
588 
589   const LLT S32 = LLT::scalar(32);
590   const LLT V2S16 = LLT::vector(2, 16);
591 
592   Register Dst = MI.getOperand(0).getReg();
593   if (MRI->getType(Dst) != V2S16)
594     return false;
595 
596   const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
597   if (DstBank->getID() != AMDGPU::SGPRRegBankID)
598     return false;
599 
600   Register Src0 = MI.getOperand(1).getReg();
601   Register Src1 = MI.getOperand(2).getReg();
602   if (MRI->getType(Src0) != S32)
603     return false;
604 
605   const DebugLoc &DL = MI.getDebugLoc();
606   MachineBasicBlock *BB = MI.getParent();
607 
608   auto ConstSrc1 = getConstantVRegValWithLookThrough(Src1, *MRI, true, true);
609   if (ConstSrc1) {
610     auto ConstSrc0 = getConstantVRegValWithLookThrough(Src0, *MRI, true, true);
611     if (ConstSrc0) {
612       uint32_t Lo16 = static_cast<uint32_t>(ConstSrc0->Value) & 0xffff;
613       uint32_t Hi16 = static_cast<uint32_t>(ConstSrc1->Value) & 0xffff;
614 
615       BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst)
616         .addImm(Lo16 | (Hi16 << 16));
617       MI.eraseFromParent();
618       return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
619     }
620   }
621 
622   // TODO: This should probably be a combine somewhere
623   // (build_vector_trunc $src0, undef -> copy $src0
624   MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
625   if (Src1Def && Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
626     MI.setDesc(TII.get(AMDGPU::COPY));
627     MI.RemoveOperand(2);
628     return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI) &&
629            RBI.constrainGenericRegister(Src0, AMDGPU::SReg_32RegClass, *MRI);
630   }
631 
632   Register ShiftSrc0;
633   Register ShiftSrc1;
634   int64_t ShiftAmt;
635 
636   // With multiple uses of the shift, this will duplicate the shift and
637   // increase register pressure.
638   //
639   // (build_vector_trunc (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16)
640   //  => (S_PACK_HH_B32_B16 $src0, $src1)
641   // (build_vector_trunc $src0, (lshr_oneuse SReg_32:$src1, 16))
642   //  => (S_PACK_LH_B32_B16 $src0, $src1)
643   // (build_vector_trunc $src0, $src1)
644   //  => (S_PACK_LL_B32_B16 $src0, $src1)
645 
646   // FIXME: This is an inconvenient way to check a specific value
647   bool Shift0 = mi_match(
648     Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_ICst(ShiftAmt)))) &&
649     ShiftAmt == 16;
650 
651   bool Shift1 = mi_match(
652     Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_ICst(ShiftAmt)))) &&
653     ShiftAmt == 16;
654 
655   unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
656   if (Shift0 && Shift1) {
657     Opc = AMDGPU::S_PACK_HH_B32_B16;
658     MI.getOperand(1).setReg(ShiftSrc0);
659     MI.getOperand(2).setReg(ShiftSrc1);
660   } else if (Shift1) {
661     Opc = AMDGPU::S_PACK_LH_B32_B16;
662     MI.getOperand(2).setReg(ShiftSrc1);
663   } else if (Shift0 && ConstSrc1 && ConstSrc1->Value == 0) {
664     // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
665     auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
666       .addReg(ShiftSrc0)
667       .addImm(16);
668 
669     MI.eraseFromParent();
670     return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
671   }
672 
673   MI.setDesc(TII.get(Opc));
674   return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
675 }
676 
677 bool AMDGPUInstructionSelector::selectG_PTR_ADD(MachineInstr &I) const {
678   return selectG_ADD_SUB(I);
679 }
680 
681 bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
682   const MachineOperand &MO = I.getOperand(0);
683 
684   // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
685   // regbank check here is to know why getConstrainedRegClassForOperand failed.
686   const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
687   if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
688       (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
689     I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
690     return true;
691   }
692 
693   return false;
694 }
695 
696 bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
697   MachineBasicBlock *BB = I.getParent();
698 
699   Register DstReg = I.getOperand(0).getReg();
700   Register Src0Reg = I.getOperand(1).getReg();
701   Register Src1Reg = I.getOperand(2).getReg();
702   LLT Src1Ty = MRI->getType(Src1Reg);
703 
704   unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
705   unsigned InsSize = Src1Ty.getSizeInBits();
706 
707   int64_t Offset = I.getOperand(3).getImm();
708 
709   // FIXME: These cases should have been illegal and unnecessary to check here.
710   if (Offset % 32 != 0 || InsSize % 32 != 0)
711     return false;
712 
713   // Currently not handled by getSubRegFromChannel.
714   if (InsSize > 128)
715     return false;
716 
717   unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
718   if (SubReg == AMDGPU::NoSubRegister)
719     return false;
720 
721   const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
722   const TargetRegisterClass *DstRC =
723     TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
724   if (!DstRC)
725     return false;
726 
727   const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
728   const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
729   const TargetRegisterClass *Src0RC =
730     TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank, *MRI);
731   const TargetRegisterClass *Src1RC =
732     TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank, *MRI);
733 
734   // Deal with weird cases where the class only partially supports the subreg
735   // index.
736   Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
737   if (!Src0RC || !Src1RC)
738     return false;
739 
740   if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
741       !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
742       !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
743     return false;
744 
745   const DebugLoc &DL = I.getDebugLoc();
746   BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
747     .addReg(Src0Reg)
748     .addReg(Src1Reg)
749     .addImm(SubReg);
750 
751   I.eraseFromParent();
752   return true;
753 }
754 
755 bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
756   if (STI.getLDSBankCount() != 16)
757     return selectImpl(MI, *CoverageInfo);
758 
759   Register Dst = MI.getOperand(0).getReg();
760   Register Src0 = MI.getOperand(2).getReg();
761   Register M0Val = MI.getOperand(6).getReg();
762   if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
763       !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
764       !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
765     return false;
766 
767   // This requires 2 instructions. It is possible to write a pattern to support
768   // this, but the generated isel emitter doesn't correctly deal with multiple
769   // output instructions using the same physical register input. The copy to m0
770   // is incorrectly placed before the second instruction.
771   //
772   // TODO: Match source modifiers.
773 
774   Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
775   const DebugLoc &DL = MI.getDebugLoc();
776   MachineBasicBlock *MBB = MI.getParent();
777 
778   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
779     .addReg(M0Val);
780   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
781     .addImm(2)
782     .addImm(MI.getOperand(4).getImm())  // $attr
783     .addImm(MI.getOperand(3).getImm()); // $attrchan
784 
785   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
786     .addImm(0)                          // $src0_modifiers
787     .addReg(Src0)                       // $src0
788     .addImm(MI.getOperand(4).getImm())  // $attr
789     .addImm(MI.getOperand(3).getImm())  // $attrchan
790     .addImm(0)                          // $src2_modifiers
791     .addReg(InterpMov)                  // $src2 - 2 f16 values selected by high
792     .addImm(MI.getOperand(5).getImm())  // $high
793     .addImm(0)                          // $clamp
794     .addImm(0);                         // $omod
795 
796   MI.eraseFromParent();
797   return true;
798 }
799 
800 // We need to handle this here because tablegen doesn't support matching
801 // instructions with multiple outputs.
802 bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
803   Register Dst0 = MI.getOperand(0).getReg();
804   Register Dst1 = MI.getOperand(1).getReg();
805 
806   LLT Ty = MRI->getType(Dst0);
807   unsigned Opc;
808   if (Ty == LLT::scalar(32))
809     Opc = AMDGPU::V_DIV_SCALE_F32;
810   else if (Ty == LLT::scalar(64))
811     Opc = AMDGPU::V_DIV_SCALE_F64;
812   else
813     return false;
814 
815   const DebugLoc &DL = MI.getDebugLoc();
816   MachineBasicBlock *MBB = MI.getParent();
817 
818   Register Numer = MI.getOperand(3).getReg();
819   Register Denom = MI.getOperand(4).getReg();
820   unsigned ChooseDenom = MI.getOperand(5).getImm();
821 
822   Register Src0 = ChooseDenom != 0 ? Numer : Denom;
823 
824   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
825     .addDef(Dst1)
826     .addUse(Src0)
827     .addUse(Denom)
828     .addUse(Numer);
829 
830   MI.eraseFromParent();
831   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
832 }
833 
834 bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
835   unsigned IntrinsicID = I.getIntrinsicID();
836   switch (IntrinsicID) {
837   case Intrinsic::amdgcn_if_break: {
838     MachineBasicBlock *BB = I.getParent();
839 
840     // FIXME: Manually selecting to avoid dealiing with the SReg_1 trick
841     // SelectionDAG uses for wave32 vs wave64.
842     BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
843       .add(I.getOperand(0))
844       .add(I.getOperand(2))
845       .add(I.getOperand(3));
846 
847     Register DstReg = I.getOperand(0).getReg();
848     Register Src0Reg = I.getOperand(2).getReg();
849     Register Src1Reg = I.getOperand(3).getReg();
850 
851     I.eraseFromParent();
852 
853     for (Register Reg : { DstReg, Src0Reg, Src1Reg })
854       MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
855 
856     return true;
857   }
858   case Intrinsic::amdgcn_interp_p1_f16:
859     return selectInterpP1F16(I);
860   case Intrinsic::amdgcn_wqm:
861     return constrainCopyLikeIntrin(I, AMDGPU::WQM);
862   case Intrinsic::amdgcn_softwqm:
863     return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
864   case Intrinsic::amdgcn_wwm:
865     return constrainCopyLikeIntrin(I, AMDGPU::WWM);
866   case Intrinsic::amdgcn_div_scale:
867     return selectDivScale(I);
868   case Intrinsic::amdgcn_icmp:
869     return selectIntrinsicIcmp(I);
870   case Intrinsic::amdgcn_ballot:
871     return selectBallot(I);
872   case Intrinsic::amdgcn_reloc_constant:
873     return selectRelocConstant(I);
874   case Intrinsic::returnaddress:
875     return selectReturnAddress(I);
876   default:
877     return selectImpl(I, *CoverageInfo);
878   }
879 }
880 
881 static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size) {
882   if (Size != 32 && Size != 64)
883     return -1;
884   switch (P) {
885   default:
886     llvm_unreachable("Unknown condition code!");
887   case CmpInst::ICMP_NE:
888     return Size == 32 ? AMDGPU::V_CMP_NE_U32_e64 : AMDGPU::V_CMP_NE_U64_e64;
889   case CmpInst::ICMP_EQ:
890     return Size == 32 ? AMDGPU::V_CMP_EQ_U32_e64 : AMDGPU::V_CMP_EQ_U64_e64;
891   case CmpInst::ICMP_SGT:
892     return Size == 32 ? AMDGPU::V_CMP_GT_I32_e64 : AMDGPU::V_CMP_GT_I64_e64;
893   case CmpInst::ICMP_SGE:
894     return Size == 32 ? AMDGPU::V_CMP_GE_I32_e64 : AMDGPU::V_CMP_GE_I64_e64;
895   case CmpInst::ICMP_SLT:
896     return Size == 32 ? AMDGPU::V_CMP_LT_I32_e64 : AMDGPU::V_CMP_LT_I64_e64;
897   case CmpInst::ICMP_SLE:
898     return Size == 32 ? AMDGPU::V_CMP_LE_I32_e64 : AMDGPU::V_CMP_LE_I64_e64;
899   case CmpInst::ICMP_UGT:
900     return Size == 32 ? AMDGPU::V_CMP_GT_U32_e64 : AMDGPU::V_CMP_GT_U64_e64;
901   case CmpInst::ICMP_UGE:
902     return Size == 32 ? AMDGPU::V_CMP_GE_U32_e64 : AMDGPU::V_CMP_GE_U64_e64;
903   case CmpInst::ICMP_ULT:
904     return Size == 32 ? AMDGPU::V_CMP_LT_U32_e64 : AMDGPU::V_CMP_LT_U64_e64;
905   case CmpInst::ICMP_ULE:
906     return Size == 32 ? AMDGPU::V_CMP_LE_U32_e64 : AMDGPU::V_CMP_LE_U64_e64;
907   }
908 }
909 
910 int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
911                                               unsigned Size) const {
912   if (Size == 64) {
913     if (!STI.hasScalarCompareEq64())
914       return -1;
915 
916     switch (P) {
917     case CmpInst::ICMP_NE:
918       return AMDGPU::S_CMP_LG_U64;
919     case CmpInst::ICMP_EQ:
920       return AMDGPU::S_CMP_EQ_U64;
921     default:
922       return -1;
923     }
924   }
925 
926   if (Size != 32)
927     return -1;
928 
929   switch (P) {
930   case CmpInst::ICMP_NE:
931     return AMDGPU::S_CMP_LG_U32;
932   case CmpInst::ICMP_EQ:
933     return AMDGPU::S_CMP_EQ_U32;
934   case CmpInst::ICMP_SGT:
935     return AMDGPU::S_CMP_GT_I32;
936   case CmpInst::ICMP_SGE:
937     return AMDGPU::S_CMP_GE_I32;
938   case CmpInst::ICMP_SLT:
939     return AMDGPU::S_CMP_LT_I32;
940   case CmpInst::ICMP_SLE:
941     return AMDGPU::S_CMP_LE_I32;
942   case CmpInst::ICMP_UGT:
943     return AMDGPU::S_CMP_GT_U32;
944   case CmpInst::ICMP_UGE:
945     return AMDGPU::S_CMP_GE_U32;
946   case CmpInst::ICMP_ULT:
947     return AMDGPU::S_CMP_LT_U32;
948   case CmpInst::ICMP_ULE:
949     return AMDGPU::S_CMP_LE_U32;
950   default:
951     llvm_unreachable("Unknown condition code!");
952   }
953 }
954 
955 bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const {
956   MachineBasicBlock *BB = I.getParent();
957   const DebugLoc &DL = I.getDebugLoc();
958 
959   Register SrcReg = I.getOperand(2).getReg();
960   unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
961 
962   auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
963 
964   Register CCReg = I.getOperand(0).getReg();
965   if (!isVCC(CCReg, *MRI)) {
966     int Opcode = getS_CMPOpcode(Pred, Size);
967     if (Opcode == -1)
968       return false;
969     MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
970             .add(I.getOperand(2))
971             .add(I.getOperand(3));
972     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
973       .addReg(AMDGPU::SCC);
974     bool Ret =
975         constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
976         RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
977     I.eraseFromParent();
978     return Ret;
979   }
980 
981   int Opcode = getV_CMPOpcode(Pred, Size);
982   if (Opcode == -1)
983     return false;
984 
985   MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode),
986             I.getOperand(0).getReg())
987             .add(I.getOperand(2))
988             .add(I.getOperand(3));
989   RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
990                                *TRI.getBoolRC(), *MRI);
991   bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
992   I.eraseFromParent();
993   return Ret;
994 }
995 
996 bool AMDGPUInstructionSelector::selectIntrinsicIcmp(MachineInstr &I) const {
997   Register Dst = I.getOperand(0).getReg();
998   if (isVCC(Dst, *MRI))
999     return false;
1000 
1001   if (MRI->getType(Dst).getSizeInBits() != STI.getWavefrontSize())
1002     return false;
1003 
1004   MachineBasicBlock *BB = I.getParent();
1005   const DebugLoc &DL = I.getDebugLoc();
1006   Register SrcReg = I.getOperand(2).getReg();
1007   unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1008   auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
1009 
1010   int Opcode = getV_CMPOpcode(Pred, Size);
1011   if (Opcode == -1)
1012     return false;
1013 
1014   MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst)
1015                            .add(I.getOperand(2))
1016                            .add(I.getOperand(3));
1017   RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(), *TRI.getBoolRC(),
1018                                *MRI);
1019   bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1020   I.eraseFromParent();
1021   return Ret;
1022 }
1023 
1024 bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1025   MachineBasicBlock *BB = I.getParent();
1026   const DebugLoc &DL = I.getDebugLoc();
1027   Register DstReg = I.getOperand(0).getReg();
1028   const unsigned Size = MRI->getType(DstReg).getSizeInBits();
1029   const bool Is64 = Size == 64;
1030 
1031   if (Size != STI.getWavefrontSize())
1032     return false;
1033 
1034   Optional<ValueAndVReg> Arg =
1035       getConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI, true);
1036 
1037   if (Arg.hasValue()) {
1038     const int64_t Value = Arg.getValue().Value;
1039     if (Value == 0) {
1040       unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1041       BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
1042     } else if (Value == -1) { // all ones
1043       Register SrcReg = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
1044       BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
1045     } else
1046       return false;
1047   } else {
1048     Register SrcReg = I.getOperand(2).getReg();
1049     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
1050   }
1051 
1052   I.eraseFromParent();
1053   return true;
1054 }
1055 
1056 bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1057   Register DstReg = I.getOperand(0).getReg();
1058   const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1059   const TargetRegisterClass *DstRC =
1060     TRI.getRegClassForSizeOnBank(32, *DstBank, *MRI);
1061   if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
1062     return false;
1063 
1064   const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1065 
1066   Module *M = MF->getFunction().getParent();
1067   const MDNode *Metadata = I.getOperand(2).getMetadata();
1068   auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
1069   auto RelocSymbol = cast<GlobalVariable>(
1070     M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
1071 
1072   MachineBasicBlock *BB = I.getParent();
1073   BuildMI(*BB, &I, I.getDebugLoc(),
1074           TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
1075     .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);
1076 
1077   I.eraseFromParent();
1078   return true;
1079 }
1080 
1081 bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
1082   MachineBasicBlock *MBB = I.getParent();
1083   MachineFunction &MF = *MBB->getParent();
1084   const DebugLoc &DL = I.getDebugLoc();
1085 
1086   MachineOperand &Dst = I.getOperand(0);
1087   Register DstReg = Dst.getReg();
1088   unsigned Depth = I.getOperand(2).getImm();
1089 
1090   const TargetRegisterClass *RC
1091     = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
1092   if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
1093       !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
1094     return false;
1095 
1096   // Check for kernel and shader functions
1097   if (Depth != 0 ||
1098       MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1099     BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
1100       .addImm(0);
1101     I.eraseFromParent();
1102     return true;
1103   }
1104 
1105   MachineFrameInfo &MFI = MF.getFrameInfo();
1106   // There is a call to @llvm.returnaddress in this function
1107   MFI.setReturnAddressIsTaken(true);
1108 
1109   // Get the return address reg and mark it as an implicit live-in
1110   Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
1111   Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
1112                                              AMDGPU::SReg_64RegClass);
1113   BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1114     .addReg(LiveIn);
1115   I.eraseFromParent();
1116   return true;
1117 }
1118 
1119 bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1120   // FIXME: Manually selecting to avoid dealiing with the SReg_1 trick
1121   // SelectionDAG uses for wave32 vs wave64.
1122   MachineBasicBlock *BB = MI.getParent();
1123   BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1124       .add(MI.getOperand(1));
1125 
1126   Register Reg = MI.getOperand(1).getReg();
1127   MI.eraseFromParent();
1128 
1129   if (!MRI->getRegClassOrNull(Reg))
1130     MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1131   return true;
1132 }
1133 
1134 bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1135   MachineInstr &MI, Intrinsic::ID IntrID) const {
1136   MachineBasicBlock *MBB = MI.getParent();
1137   MachineFunction *MF = MBB->getParent();
1138   const DebugLoc &DL = MI.getDebugLoc();
1139 
1140   unsigned IndexOperand = MI.getOperand(7).getImm();
1141   bool WaveRelease = MI.getOperand(8).getImm() != 0;
1142   bool WaveDone = MI.getOperand(9).getImm() != 0;
1143 
1144   if (WaveDone && !WaveRelease)
1145     report_fatal_error("ds_ordered_count: wave_done requires wave_release");
1146 
1147   unsigned OrderedCountIndex = IndexOperand & 0x3f;
1148   IndexOperand &= ~0x3f;
1149   unsigned CountDw = 0;
1150 
1151   if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
1152     CountDw = (IndexOperand >> 24) & 0xf;
1153     IndexOperand &= ~(0xf << 24);
1154 
1155     if (CountDw < 1 || CountDw > 4) {
1156       report_fatal_error(
1157         "ds_ordered_count: dword count must be between 1 and 4");
1158     }
1159   }
1160 
1161   if (IndexOperand)
1162     report_fatal_error("ds_ordered_count: bad index operand");
1163 
1164   unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1165   unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);
1166 
1167   unsigned Offset0 = OrderedCountIndex << 2;
1168   unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) |
1169                      (Instruction << 4);
1170 
1171   if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
1172     Offset1 |= (CountDw - 1) << 6;
1173 
1174   unsigned Offset = Offset0 | (Offset1 << 8);
1175 
1176   Register M0Val = MI.getOperand(2).getReg();
1177   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1178     .addReg(M0Val);
1179 
1180   Register DstReg = MI.getOperand(0).getReg();
1181   Register ValReg = MI.getOperand(3).getReg();
1182   MachineInstrBuilder DS =
1183     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
1184       .addReg(ValReg)
1185       .addImm(Offset)
1186       .cloneMemRefs(MI);
1187 
1188   if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
1189     return false;
1190 
1191   bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
1192   MI.eraseFromParent();
1193   return Ret;
1194 }
1195 
1196 static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1197   switch (IntrID) {
1198   case Intrinsic::amdgcn_ds_gws_init:
1199     return AMDGPU::DS_GWS_INIT;
1200   case Intrinsic::amdgcn_ds_gws_barrier:
1201     return AMDGPU::DS_GWS_BARRIER;
1202   case Intrinsic::amdgcn_ds_gws_sema_v:
1203     return AMDGPU::DS_GWS_SEMA_V;
1204   case Intrinsic::amdgcn_ds_gws_sema_br:
1205     return AMDGPU::DS_GWS_SEMA_BR;
1206   case Intrinsic::amdgcn_ds_gws_sema_p:
1207     return AMDGPU::DS_GWS_SEMA_P;
1208   case Intrinsic::amdgcn_ds_gws_sema_release_all:
1209     return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1210   default:
1211     llvm_unreachable("not a gws intrinsic");
1212   }
1213 }
1214 
1215 bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
1216                                                      Intrinsic::ID IID) const {
1217   if (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1218       !STI.hasGWSSemaReleaseAll())
1219     return false;
1220 
1221   // intrinsic ID, vsrc, offset
1222   const bool HasVSrc = MI.getNumOperands() == 3;
1223   assert(HasVSrc || MI.getNumOperands() == 2);
1224 
1225   Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
1226   const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
1227   if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
1228     return false;
1229 
1230   MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1231   assert(OffsetDef);
1232 
1233   unsigned ImmOffset;
1234 
1235   MachineBasicBlock *MBB = MI.getParent();
1236   const DebugLoc &DL = MI.getDebugLoc();
1237 
1238   MachineInstr *Readfirstlane = nullptr;
1239 
1240   // If we legalized the VGPR input, strip out the readfirstlane to analyze the
1241   // incoming offset, in case there's an add of a constant. We'll have to put it
1242   // back later.
1243   if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
1244     Readfirstlane = OffsetDef;
1245     BaseOffset = OffsetDef->getOperand(1).getReg();
1246     OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1247   }
1248 
1249   if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
1250     // If we have a constant offset, try to use the 0 in m0 as the base.
1251     // TODO: Look into changing the default m0 initialization value. If the
1252     // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
1253     // the immediate offset.
1254 
1255     ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
1256     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1257       .addImm(0);
1258   } else {
1259     std::tie(BaseOffset, ImmOffset, OffsetDef)
1260       = AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset);
1261 
1262     if (Readfirstlane) {
1263       // We have the constant offset now, so put the readfirstlane back on the
1264       // variable component.
1265       if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
1266         return false;
1267 
1268       Readfirstlane->getOperand(1).setReg(BaseOffset);
1269       BaseOffset = Readfirstlane->getOperand(0).getReg();
1270     } else {
1271       if (!RBI.constrainGenericRegister(BaseOffset,
1272                                         AMDGPU::SReg_32RegClass, *MRI))
1273         return false;
1274     }
1275 
1276     Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1277     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
1278       .addReg(BaseOffset)
1279       .addImm(16);
1280 
1281     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1282       .addReg(M0Base);
1283   }
1284 
1285   // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
1286   // offset field) % 64. Some versions of the programming guide omit the m0
1287   // part, or claim it's from offset 0.
1288   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));
1289 
1290   if (HasVSrc) {
1291     Register VSrc = MI.getOperand(1).getReg();
1292     MIB.addReg(VSrc);
1293     if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
1294       return false;
1295   }
1296 
1297   MIB.addImm(ImmOffset)
1298      .addImm(-1) // $gds
1299      .cloneMemRefs(MI);
1300 
1301   MI.eraseFromParent();
1302   return true;
1303 }
1304 
1305 bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
1306                                                       bool IsAppend) const {
1307   Register PtrBase = MI.getOperand(2).getReg();
1308   LLT PtrTy = MRI->getType(PtrBase);
1309   bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
1310 
1311   unsigned Offset;
1312   std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
1313 
1314   // TODO: Should this try to look through readfirstlane like GWS?
1315   if (!isDSOffsetLegal(PtrBase, Offset, 16)) {
1316     PtrBase = MI.getOperand(2).getReg();
1317     Offset = 0;
1318   }
1319 
1320   MachineBasicBlock *MBB = MI.getParent();
1321   const DebugLoc &DL = MI.getDebugLoc();
1322   const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
1323 
1324   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1325     .addReg(PtrBase);
1326   if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
1327     return false;
1328 
1329   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
1330     .addImm(Offset)
1331     .addImm(IsGDS ? -1 : 0)
1332     .cloneMemRefs(MI);
1333   MI.eraseFromParent();
1334   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1335 }
1336 
1337 static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
1338                          bool &IsTexFail) {
1339   if (TexFailCtrl)
1340     IsTexFail = true;
1341 
1342   TFE = (TexFailCtrl & 0x1) ? 1 : 0;
1343   TexFailCtrl &= ~(uint64_t)0x1;
1344   LWE = (TexFailCtrl & 0x2) ? 1 : 0;
1345   TexFailCtrl &= ~(uint64_t)0x2;
1346 
1347   return TexFailCtrl == 0;
1348 }
1349 
1350 static bool parseCachePolicy(uint64_t Value,
1351                              bool *GLC, bool *SLC, bool *DLC) {
1352   if (GLC) {
1353     *GLC = (Value & 0x1) ? 1 : 0;
1354     Value &= ~(uint64_t)0x1;
1355   }
1356   if (SLC) {
1357     *SLC = (Value & 0x2) ? 1 : 0;
1358     Value &= ~(uint64_t)0x2;
1359   }
1360   if (DLC) {
1361     *DLC = (Value & 0x4) ? 1 : 0;
1362     Value &= ~(uint64_t)0x4;
1363   }
1364 
1365   return Value == 0;
1366 }
1367 
1368 bool AMDGPUInstructionSelector::selectImageIntrinsic(
1369   MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
1370   MachineBasicBlock *MBB = MI.getParent();
1371   const DebugLoc &DL = MI.getDebugLoc();
1372 
1373   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
1374     AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1375 
1376   const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
1377   const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
1378       AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
1379   const AMDGPU::MIMGMIPMappingInfo *MIPMappingInfo =
1380       AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode);
1381   unsigned IntrOpcode = Intr->BaseOpcode;
1382   const bool IsGFX10 = STI.getGeneration() >= AMDGPUSubtarget::GFX10;
1383 
1384   const int VAddrIdx = getImageVAddrIdxBegin(BaseOpcode,
1385                                              MI.getNumExplicitDefs());
1386   int NumVAddr, NumGradients;
1387   std::tie(NumVAddr, NumGradients) = getImageNumVAddr(Intr, BaseOpcode);
1388 
1389   Register VDataIn, VDataOut;
1390   LLT VDataTy;
1391   int NumVDataDwords = -1;
1392   bool IsD16 = false;
1393 
1394   // XXX - Can we just get the second to last argument for ctrl?
1395   unsigned CtrlIdx; // Index of texfailctrl argument
1396   bool Unorm;
1397   if (!BaseOpcode->Sampler) {
1398     Unorm = true;
1399     CtrlIdx = VAddrIdx + NumVAddr + 1;
1400   } else {
1401     Unorm = MI.getOperand(VAddrIdx + NumVAddr + 2).getImm() != 0;
1402     CtrlIdx = VAddrIdx + NumVAddr + 3;
1403   }
1404 
1405   bool TFE;
1406   bool LWE;
1407   bool IsTexFail = false;
1408   if (!parseTexFail(MI.getOperand(CtrlIdx).getImm(), TFE, LWE, IsTexFail))
1409     return false;
1410 
1411   const int Flags = MI.getOperand(CtrlIdx + 2).getImm();
1412   const bool IsA16 = (Flags & 1) != 0;
1413   const bool IsG16 = (Flags & 2) != 0;
1414 
1415   // A16 implies 16 bit gradients
1416   if (IsA16 && !IsG16)
1417     return false;
1418 
1419   unsigned DMask = 0;
1420   unsigned DMaskLanes = 0;
1421 
1422   if (BaseOpcode->Atomic) {
1423     VDataOut = MI.getOperand(0).getReg();
1424     VDataIn = MI.getOperand(2).getReg();
1425     LLT Ty = MRI->getType(VDataIn);
1426 
1427     // Be careful to allow atomic swap on 16-bit element vectors.
1428     const bool Is64Bit = BaseOpcode->AtomicX2 ?
1429       Ty.getSizeInBits() == 128 :
1430       Ty.getSizeInBits() == 64;
1431 
1432     if (BaseOpcode->AtomicX2) {
1433       assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
1434 
1435       DMask = Is64Bit ? 0xf : 0x3;
1436       NumVDataDwords = Is64Bit ? 4 : 2;
1437     } else {
1438       DMask = Is64Bit ? 0x3 : 0x1;
1439       NumVDataDwords = Is64Bit ? 2 : 1;
1440     }
1441   } else {
1442     const int DMaskIdx = 2; // Input/output + intrinsic ID.
1443 
1444     DMask = MI.getOperand(DMaskIdx).getImm();
1445     DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);
1446 
1447     if (BaseOpcode->Store) {
1448       VDataIn = MI.getOperand(1).getReg();
1449       VDataTy = MRI->getType(VDataIn);
1450       NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
1451     } else {
1452       VDataOut = MI.getOperand(0).getReg();
1453       VDataTy = MRI->getType(VDataOut);
1454       NumVDataDwords = DMaskLanes;
1455 
1456       // One memoperand is mandatory, except for getresinfo.
1457       // FIXME: Check this in verifier.
1458       if (!MI.memoperands_empty()) {
1459         const MachineMemOperand *MMO = *MI.memoperands_begin();
1460 
1461         // Infer d16 from the memory size, as the register type will be mangled by
1462         // unpacked subtargets, or by TFE.
1463         IsD16 = ((8 * MMO->getSize()) / DMaskLanes) < 32;
1464 
1465         if (IsD16 && !STI.hasUnpackedD16VMem())
1466           NumVDataDwords = (DMaskLanes + 1) / 2;
1467       }
1468     }
1469   }
1470 
1471   // Optimize _L to _LZ when _L is zero
1472   if (LZMappingInfo) {
1473     // The legalizer replaced the register with an immediate 0 if we need to
1474     // change the opcode.
1475     const MachineOperand &Lod = MI.getOperand(VAddrIdx + NumVAddr - 1);
1476     if (Lod.isImm()) {
1477       assert(Lod.getImm() == 0);
1478       IntrOpcode = LZMappingInfo->LZ;  // set new opcode to _lz variant of _l
1479     }
1480   }
1481 
1482   // Optimize _mip away, when 'lod' is zero
1483   if (MIPMappingInfo) {
1484     const MachineOperand &Lod = MI.getOperand(VAddrIdx + NumVAddr - 1);
1485     if (Lod.isImm()) {
1486       assert(Lod.getImm() == 0);
1487       IntrOpcode = MIPMappingInfo->NONMIP;  // set new opcode to variant without _mip
1488     }
1489   }
1490 
1491   // Set G16 opcode
1492   if (IsG16 && !IsA16) {
1493     const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
1494         AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
1495     assert(G16MappingInfo);
1496     IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
1497   }
1498 
1499   // TODO: Check this in verifier.
1500   assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
1501 
1502   bool GLC = false;
1503   bool SLC = false;
1504   bool DLC = false;
1505   if (BaseOpcode->Atomic) {
1506     GLC = true; // TODO no-return optimization
1507     if (!parseCachePolicy(MI.getOperand(CtrlIdx + 1).getImm(), nullptr, &SLC,
1508                           IsGFX10 ? &DLC : nullptr))
1509       return false;
1510   } else {
1511     if (!parseCachePolicy(MI.getOperand(CtrlIdx + 1).getImm(), &GLC, &SLC,
1512                           IsGFX10 ? &DLC : nullptr))
1513       return false;
1514   }
1515 
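  // Count how many address registers survived legalization and how many
  // dwords they occupy; this decides whether the NSA encoding is usable and
  // which MIMG opcode variant is chosen below.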
1516   int NumVAddrRegs = 0;
1517   int NumVAddrDwords = 0;
1518   for (int I = 0; I < NumVAddr; ++I) {
1519     // Skip the $noregs and 0s inserted during legalization.
1520     MachineOperand &AddrOp = MI.getOperand(VAddrIdx + I);
1521     if (!AddrOp.isReg())
1522       continue; // XXX - Break?
1523 
1524     Register Addr = AddrOp.getReg();
1525     if (!Addr)
1526       break;
1527 
1528     ++NumVAddrRegs;
1529     NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
1530   }
1531 
1532   // The legalizer preprocessed the intrinsic arguments. If we aren't using
1533   // NSA, these should have been packed into a single value in the first
1534   // address register.
1535   const bool UseNSA = NumVAddrRegs != 1 && NumVAddrDwords == NumVAddrRegs;
1536   if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
1537     LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
1538     return false;
1539   }
1540 
1541   if (IsTexFail)
1542     ++NumVDataDwords;
1543 
1544   int Opcode = -1;
1545   if (IsGFX10) {
1546     Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
1547                                    UseNSA ? AMDGPU::MIMGEncGfx10NSA
1548                                           : AMDGPU::MIMGEncGfx10Default,
1549                                    NumVDataDwords, NumVAddrDwords);
1550   } else {
1551     if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
1552       Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
1553                                      NumVDataDwords, NumVAddrDwords);
1554     if (Opcode == -1)
1555       Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
1556                                      NumVDataDwords, NumVAddrDwords);
1557   }
1558   assert(Opcode != -1);
1559 
1560   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
1561     .cloneMemRefs(MI);
1562 
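  // For AtomicX2 atomics the data register is twice the width of the result,
  // so the instruction defines a double-width temporary and the low half is
  // copied out as the visible result.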
1563   if (VDataOut) {
1564     if (BaseOpcode->AtomicX2) {
1565       const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
1566 
1567       Register TmpReg = MRI->createVirtualRegister(
1568         Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
1569       unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
1570 
1571       MIB.addDef(TmpReg);
1572       BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
1573         .addReg(TmpReg, RegState::Kill, SubReg);
1574 
1575     } else {
1576       MIB.addDef(VDataOut); // vdata output
1577     }
1578   }
1579 
1580   if (VDataIn)
1581     MIB.addReg(VDataIn); // vdata input
1582 
1583   for (int i = 0; i != NumVAddrRegs; ++i) {
1584     MachineOperand &SrcOp = MI.getOperand(VAddrIdx + i);
1585     if (SrcOp.isReg()) {
1586       assert(SrcOp.getReg() != 0);
1587       MIB.addReg(SrcOp.getReg());
1588     }
1589   }
1590 
1591   MIB.addReg(MI.getOperand(VAddrIdx + NumVAddr).getReg()); // rsrc
1592   if (BaseOpcode->Sampler)
1593     MIB.addReg(MI.getOperand(VAddrIdx + NumVAddr + 1).getReg()); // sampler
1594 
1595   MIB.addImm(DMask); // dmask
1596 
1597   if (IsGFX10)
1598     MIB.addImm(DimInfo->Encoding);
1599   MIB.addImm(Unorm);
1600   if (IsGFX10)
1601     MIB.addImm(DLC);
1602 
1603   MIB.addImm(GLC);
1604   MIB.addImm(SLC);
1605   MIB.addImm(IsA16 &&  // a16 or r128
1606              STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
1607   if (IsGFX10)
1608     MIB.addImm(IsA16 ? -1 : 0);
1609 
1610   MIB.addImm(TFE); // tfe
1611   MIB.addImm(LWE); // lwe
1612   if (!IsGFX10)
1613     MIB.addImm(DimInfo->DA ? -1 : 0);
1614   if (BaseOpcode->HasD16)
1615     MIB.addImm(IsD16 ? -1 : 0);
1616 
1617   MI.eraseFromParent();
1618   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1619 }
1620 
1621 bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
1622     MachineInstr &I) const {
1623   unsigned IntrinsicID = I.getIntrinsicID();
1624   switch (IntrinsicID) {
1625   case Intrinsic::amdgcn_end_cf:
1626     return selectEndCfIntrinsic(I);
1627   case Intrinsic::amdgcn_ds_ordered_add:
1628   case Intrinsic::amdgcn_ds_ordered_swap:
1629     return selectDSOrderedIntrinsic(I, IntrinsicID);
1630   case Intrinsic::amdgcn_ds_gws_init:
1631   case Intrinsic::amdgcn_ds_gws_barrier:
1632   case Intrinsic::amdgcn_ds_gws_sema_v:
1633   case Intrinsic::amdgcn_ds_gws_sema_br:
1634   case Intrinsic::amdgcn_ds_gws_sema_p:
1635   case Intrinsic::amdgcn_ds_gws_sema_release_all:
1636     return selectDSGWSIntrinsic(I, IntrinsicID);
1637   case Intrinsic::amdgcn_ds_append:
1638     return selectDSAppendConsume(I, true);
1639   case Intrinsic::amdgcn_ds_consume:
1640     return selectDSAppendConsume(I, false);
1641   default: {
1642     return selectImpl(I, *CoverageInfo);
1643   }
1644   }
1645 }
1646 
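// Select G_SELECT. Scalar conditions copy the condition into SCC and use
// S_CSELECT_B32/B64; VCC conditions use V_CNDMASK_B32_e64. Wide VGPR selects
// are expected to have been split by RegBankSelect.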
1647 bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
1648   if (selectImpl(I, *CoverageInfo))
1649     return true;
1650 
1651   MachineBasicBlock *BB = I.getParent();
1652   const DebugLoc &DL = I.getDebugLoc();
1653 
1654   Register DstReg = I.getOperand(0).getReg();
1655   unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
1656   assert(Size <= 32 || Size == 64);
1657   const MachineOperand &CCOp = I.getOperand(1);
1658   Register CCReg = CCOp.getReg();
1659   if (!isVCC(CCReg, *MRI)) {
1660     unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
1661                                          AMDGPU::S_CSELECT_B32;
1662     MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
1663             .addReg(CCReg);
1664 
1665     // The generic constrainSelectedInstRegOperands doesn't work for the scc
1666     // register bank, because it does not cover the register class that we use
1667     // to represent it. So we need to manually set the register class here.
1668     if (!MRI->getRegClassOrNull(CCReg))
1669         MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
1670     MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
1671             .add(I.getOperand(2))
1672             .add(I.getOperand(3));
1673 
1674     bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI) |
1675                constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
1676     I.eraseFromParent();
1677     return Ret;
1678   }
1679 
1680   // Wide VGPR select should have been split in RegBankSelect.
1681   if (Size > 32)
1682     return false;
1683 
1684   MachineInstr *Select =
1685       BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1686               .addImm(0)
1687               .add(I.getOperand(3))
1688               .addImm(0)
1689               .add(I.getOperand(2))
1690               .add(I.getOperand(1));
1691 
1692   bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
1693   I.eraseFromParent();
1694   return Ret;
1695 }
1696 
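// Map a value size in bits to the subregister index spanning its low bits.
// Sizes below 32 map to sub0, sizes above 256 are unsupported, and other
// sizes are rounded up to the next power of two.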
1697 static int sizeToSubRegIndex(unsigned Size) {
1698   switch (Size) {
1699   case 32:
1700     return AMDGPU::sub0;
1701   case 64:
1702     return AMDGPU::sub0_sub1;
1703   case 96:
1704     return AMDGPU::sub0_sub1_sub2;
1705   case 128:
1706     return AMDGPU::sub0_sub1_sub2_sub3;
1707   case 256:
1708     return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
1709   default:
1710     if (Size < 32)
1711       return AMDGPU::sub0;
1712     if (Size > 256)
1713       return -1;
1714     return sizeToSubRegIndex(PowerOf2Ceil(Size));
1715   }
1716 }
1717 
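// Select G_TRUNC. Most truncates become a plain subregister COPY; the
// v2s32 -> v2s16 case packs the two low halves into one 32-bit register,
// preferring SDWA on the VALU path when available.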
1718 bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
1719   Register DstReg = I.getOperand(0).getReg();
1720   Register SrcReg = I.getOperand(1).getReg();
1721   const LLT DstTy = MRI->getType(DstReg);
1722   const LLT SrcTy = MRI->getType(SrcReg);
1723   const LLT S1 = LLT::scalar(1);
1724 
1725   const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
1726   const RegisterBank *DstRB;
1727   if (DstTy == S1) {
1728     // This is a special case. We don't treat s1 for legalization artifacts as
1729     // vcc booleans.
1730     DstRB = SrcRB;
1731   } else {
1732     DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1733     if (SrcRB != DstRB)
1734       return false;
1735   }
1736 
1737   const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
1738 
1739   unsigned DstSize = DstTy.getSizeInBits();
1740   unsigned SrcSize = SrcTy.getSizeInBits();
1741 
1742   const TargetRegisterClass *SrcRC
1743     = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, *MRI);
1744   const TargetRegisterClass *DstRC
1745     = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, *MRI);
1746   if (!SrcRC || !DstRC)
1747     return false;
1748 
1749   if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
1750       !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
1751     LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
1752     return false;
1753   }
1754 
1755   if (DstTy == LLT::vector(2, 16) && SrcTy == LLT::vector(2, 32)) {
1756     MachineBasicBlock *MBB = I.getParent();
1757     const DebugLoc &DL = I.getDebugLoc();
1758 
1759     Register LoReg = MRI->createVirtualRegister(DstRC);
1760     Register HiReg = MRI->createVirtualRegister(DstRC);
1761     BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
1762       .addReg(SrcReg, 0, AMDGPU::sub0);
1763     BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
1764       .addReg(SrcReg, 0, AMDGPU::sub1);
1765 
1766     if (IsVALU && STI.hasSDWA()) {
1767       // Write the low 16-bits of the high element into the high 16-bits of the
1768       // low element.
1769       MachineInstr *MovSDWA =
1770         BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
1771         .addImm(0)                             // $src0_modifiers
1772         .addReg(HiReg)                         // $src0
1773         .addImm(0)                             // $clamp
1774         .addImm(AMDGPU::SDWA::WORD_1)          // $dst_sel
1775         .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
1776         .addImm(AMDGPU::SDWA::WORD_0)          // $src0_sel
1777         .addReg(LoReg, RegState::Implicit);
1778       MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
1779     } else {
1780       Register TmpReg0 = MRI->createVirtualRegister(DstRC);
1781       Register TmpReg1 = MRI->createVirtualRegister(DstRC);
1782       Register ImmReg = MRI->createVirtualRegister(DstRC);
1783       if (IsVALU) {
1784         BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
1785           .addImm(16)
1786           .addReg(HiReg);
1787       } else {
1788         BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
1789           .addReg(HiReg)
1790           .addImm(16);
1791       }
1792 
1793       unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
1794       unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
1795       unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
1796 
1797       BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
1798         .addImm(0xffff);
1799       BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
1800         .addReg(LoReg)
1801         .addReg(ImmReg);
1802       BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
1803         .addReg(TmpReg0)
1804         .addReg(TmpReg1);
1805     }
1806 
1807     I.eraseFromParent();
1808     return true;
1809   }
1810 
1811   if (!DstTy.isScalar())
1812     return false;
1813 
1814   if (SrcSize > 32) {
1815     int SubRegIdx = sizeToSubRegIndex(DstSize);
1816     if (SubRegIdx == -1)
1817       return false;
1818 
1819     // Deal with weird cases where the class only partially supports the subreg
1820     // index.
1821     const TargetRegisterClass *SrcWithSubRC
1822       = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
1823     if (!SrcWithSubRC)
1824       return false;
1825 
1826     if (SrcWithSubRC != SrcRC) {
1827       if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
1828         return false;
1829     }
1830 
1831     I.getOperand(1).setSubReg(SubRegIdx);
1832   }
1833 
1834   I.setDesc(TII.get(TargetOpcode::COPY));
1835   return true;
1836 }
1837 
1838 /// \returns true if a bitmask for \p Size bits will be an inline immediate.
1839 static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
1840   Mask = maskTrailingOnes<unsigned>(Size);
1841   int SignedMask = static_cast<int>(Mask);
1842   return SignedMask >= -16 && SignedMask <= 64;
1843 }
1844 
1845 // Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
1846 const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
1847   Register Reg, const MachineRegisterInfo &MRI,
1848   const TargetRegisterInfo &TRI) const {
1849   const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
1850   if (auto *RB = RegClassOrBank.dyn_cast<const RegisterBank *>())
1851     return RB;
1852 
1853   // Ignore the type, since we don't use vcc in artifacts.
1854   if (auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>())
1855     return &RBI.getRegBankFromRegClass(*RC, LLT());
1856   return nullptr;
1857 }
1858 
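// Select G_SEXT/G_ZEXT/G_ANYEXT/G_SEXT_INREG not handled by the imported
// patterns. Anyext to 64 bits becomes a REG_SEQUENCE with an undef high half;
// for zero-extends an AND with an inline-immediate mask is used when
// profitable, with BFE instructions as the general fallback on both the VALU
// and SALU paths.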
1859 bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
1860   bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
1861   bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
1862   const DebugLoc &DL = I.getDebugLoc();
1863   MachineBasicBlock &MBB = *I.getParent();
1864   const Register DstReg = I.getOperand(0).getReg();
1865   const Register SrcReg = I.getOperand(1).getReg();
1866 
1867   const LLT DstTy = MRI->getType(DstReg);
1868   const LLT SrcTy = MRI->getType(SrcReg);
1869   const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
1870     I.getOperand(2).getImm() : SrcTy.getSizeInBits();
1871   const unsigned DstSize = DstTy.getSizeInBits();
1872   if (!DstTy.isScalar())
1873     return false;
1874 
1875   // Artifact casts should never use vcc.
1876   const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
1877 
1878   // FIXME: This should probably be illegal and split earlier.
1879   if (I.getOpcode() == AMDGPU::G_ANYEXT) {
1880     if (DstSize <= 32)
1881       return selectCOPY(I);
1882 
1883     const TargetRegisterClass *SrcRC =
1884         TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank, *MRI);
1885     const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1886     const TargetRegisterClass *DstRC =
1887         TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
1888 
1889     Register UndefReg = MRI->createVirtualRegister(SrcRC);
1890     BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
1891     BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
1892       .addReg(SrcReg)
1893       .addImm(AMDGPU::sub0)
1894       .addReg(UndefReg)
1895       .addImm(AMDGPU::sub1);
1896     I.eraseFromParent();
1897 
1898     return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
1899            RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
1900   }
1901 
1902   if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
1903     // 64-bit should have been split up in RegBankSelect
1904 
1905     // Try to use an and with a mask if it will save code size.
1906     unsigned Mask;
1907     if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
1908       MachineInstr *ExtI =
1909       BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
1910         .addImm(Mask)
1911         .addReg(SrcReg);
1912       I.eraseFromParent();
1913       return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
1914     }
1915 
1916     const unsigned BFE = Signed ? AMDGPU::V_BFE_I32 : AMDGPU::V_BFE_U32;
1917     MachineInstr *ExtI =
1918       BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
1919       .addReg(SrcReg)
1920       .addImm(0) // Offset
1921       .addImm(SrcSize); // Width
1922     I.eraseFromParent();
1923     return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
1924   }
1925 
1926   if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
1927     const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
1928       AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
1929     if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
1930       return false;
1931 
1932     if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
1933       const unsigned SextOpc = SrcSize == 8 ?
1934         AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
1935       BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
1936         .addReg(SrcReg);
1937       I.eraseFromParent();
1938       return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
1939     }
1940 
1941     const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
1942     const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
1943 
1944     // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16] = width.
1945     if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
1946       // We need a 64-bit register source, but the high bits don't matter.
1947       Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
1948       Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1949       unsigned SubReg = InReg ? AMDGPU::sub0 : 0;
1950 
1951       BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
1952       BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
1953         .addReg(SrcReg, 0, SubReg)
1954         .addImm(AMDGPU::sub0)
1955         .addReg(UndefReg)
1956         .addImm(AMDGPU::sub1);
1957 
1958       BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
1959         .addReg(ExtReg)
1960         .addImm(SrcSize << 16);
1961 
1962       I.eraseFromParent();
1963       return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
1964     }
1965 
1966     unsigned Mask;
1967     if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
1968       BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
1969         .addReg(SrcReg)
1970         .addImm(Mask);
1971     } else {
1972       BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
1973         .addReg(SrcReg)
1974         .addImm(SrcSize << 16);
1975     }
1976 
1977     I.eraseFromParent();
1978     return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
1979   }
1980 
1981   return false;
1982 }
1983 
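// Select G_CONSTANT/G_FCONSTANT. FP and CImm operands are first converted to
// plain immediates. 32-bit values become a single S_MOV_B32/V_MOV_B32; 64-bit
// values use S_MOV_B64 for SGPR inline constants and otherwise materialize
// both halves and combine them with a REG_SEQUENCE.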
1984 bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
1985   MachineBasicBlock *BB = I.getParent();
1986   MachineOperand &ImmOp = I.getOperand(1);
1987 
1988   // The AMDGPU backend only supports Imm operands and not CImm or FPImm.
1989   if (ImmOp.isFPImm()) {
1990     const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt();
1991     ImmOp.ChangeToImmediate(Imm.getZExtValue());
1992   } else if (ImmOp.isCImm()) {
1993     ImmOp.ChangeToImmediate(ImmOp.getCImm()->getSExtValue());
1994   }
1995 
1996   Register DstReg = I.getOperand(0).getReg();
1997   unsigned Size;
1998   bool IsSgpr;
1999   const RegisterBank *RB = MRI->getRegBankOrNull(I.getOperand(0).getReg());
2000   if (RB) {
2001     IsSgpr = RB->getID() == AMDGPU::SGPRRegBankID;
2002     Size = MRI->getType(DstReg).getSizeInBits();
2003   } else {
2004     const TargetRegisterClass *RC = TRI.getRegClassForReg(*MRI, DstReg);
2005     IsSgpr = TRI.isSGPRClass(RC);
2006     Size = TRI.getRegSizeInBits(*RC);
2007   }
2008 
2009   if (Size != 32 && Size != 64)
2010     return false;
2011 
2012   unsigned Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
2013   if (Size == 32) {
2014     I.setDesc(TII.get(Opcode));
2015     I.addImplicitDefUseOperands(*MF);
2016     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2017   }
2018 
2019   const DebugLoc &DL = I.getDebugLoc();
2020 
2021   APInt Imm(Size, I.getOperand(1).getImm());
2022 
2023   MachineInstr *ResInst;
2024   if (IsSgpr && TII.isInlineConstant(Imm)) {
2025     ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
2026       .addImm(I.getOperand(1).getImm());
2027   } else {
2028     const TargetRegisterClass *RC = IsSgpr ?
2029       &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass;
2030     Register LoReg = MRI->createVirtualRegister(RC);
2031     Register HiReg = MRI->createVirtualRegister(RC);
2032 
2033     BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg)
2034       .addImm(Imm.trunc(32).getZExtValue());
2035 
2036     BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg)
2037       .addImm(Imm.ashr(32).getZExtValue());
2038 
2039     ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2040       .addReg(LoReg)
2041       .addImm(AMDGPU::sub0)
2042       .addReg(HiReg)
2043       .addImm(AMDGPU::sub1);
2044   }
2045 
2046   // We can't call constrainSelectedInstRegOperands here, because it doesn't
2047   // work for target-independent opcodes.
2048   I.eraseFromParent();
2049   const TargetRegisterClass *DstRC =
2050     TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI);
2051   if (!DstRC)
2052     return true;
2053   return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI);
2054 }
2055 
2056 bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2057   // Only manually handle the f64 SGPR case.
2058   //
2059   // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2060   // the bit ops theoretically have a second result due to the implicit def of
2061   // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2062   // that is easy by disabling the check. The result works, but uses a
2063   // nonsensical sreg32orlds_and_sreg_1 regclass.
2064   //
2065   // The DAG emitter is more problematic, and incorrectly adds both results of
2066   // the S_XOR_B32 to the variadic REG_SEQUENCE operands.
2067 
2068   Register Dst = MI.getOperand(0).getReg();
2069   const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2070   if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2071       MRI->getType(Dst) != LLT::scalar(64))
2072     return false;
2073 
2074   Register Src = MI.getOperand(1).getReg();
2075   MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
2076   if (Fabs)
2077     Src = Fabs->getOperand(1).getReg();
2078 
2079   if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2080       !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2081     return false;
2082 
2083   MachineBasicBlock *BB = MI.getParent();
2084   const DebugLoc &DL = MI.getDebugLoc();
2085   Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2086   Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2087   Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2088   Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2089 
2090   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2091     .addReg(Src, 0, AMDGPU::sub0);
2092   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2093     .addReg(Src, 0, AMDGPU::sub1);
2094   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2095     .addImm(0x80000000);
2096 
2097   // Set or toggle sign bit.
2098   unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2099   BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
2100     .addReg(HiReg)
2101     .addReg(ConstReg);
2102   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2103     .addReg(LoReg)
2104     .addImm(AMDGPU::sub0)
2105     .addReg(OpReg)
2106     .addImm(AMDGPU::sub1);
2107   MI.eraseFromParent();
2108   return true;
2109 }
2110 
2111 // FIXME: This is a workaround for the same tablegen problems as G_FNEG
2112 bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2113   Register Dst = MI.getOperand(0).getReg();
2114   const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2115   if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2116       MRI->getType(Dst) != LLT::scalar(64))
2117     return false;
2118 
2119   Register Src = MI.getOperand(1).getReg();
2120   MachineBasicBlock *BB = MI.getParent();
2121   const DebugLoc &DL = MI.getDebugLoc();
2122   Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2123   Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2124   Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2125   Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2126 
2127   if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2128       !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2129     return false;
2130 
2131   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2132     .addReg(Src, 0, AMDGPU::sub0);
2133   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2134     .addReg(Src, 0, AMDGPU::sub1);
2135   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2136     .addImm(0x7fffffff);
2137 
2138   // Clear sign bit.
2139   // TODO: Should this use S_BITSET0_*?
2140   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
2141     .addReg(HiReg)
2142     .addReg(ConstReg);
2143   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2144     .addReg(LoReg)
2145     .addImm(AMDGPU::sub0)
2146     .addReg(OpReg)
2147     .addImm(AMDGPU::sub1);
2148 
2149   MI.eraseFromParent();
2150   return true;
2151 }
2152 
2153 static bool isConstant(const MachineInstr &MI) {
2154   return MI.getOpcode() == TargetOpcode::G_CONSTANT;
2155 }
2156 
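// Recursively walk the G_PTR_ADD chain feeding a memory access, recording for
// each level the constant offset and which address components live in SGPRs
// vs. VGPRs, for use by the addressing mode selection helpers.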
2157 void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
2158     const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
2159 
2160   const MachineInstr *PtrMI = MRI.getUniqueVRegDef(Load.getOperand(1).getReg());
2161 
2162   assert(PtrMI);
2163 
2164   if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
2165     return;
2166 
2167   GEPInfo GEPInfo(*PtrMI);
2168 
2169   for (unsigned i = 1; i != 3; ++i) {
2170     const MachineOperand &GEPOp = PtrMI->getOperand(i);
2171     const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
2172     assert(OpDef);
2173     if (i == 2 && isConstant(*OpDef)) {
2174       // TODO: Could handle constant base + variable offset, but a combine
2175       // probably should have commuted it.
2176       assert(GEPInfo.Imm == 0);
2177       GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
2178       continue;
2179     }
2180     const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
2181     if (OpBank->getID() == AMDGPU::SGPRRegBankID)
2182       GEPInfo.SgprParts.push_back(GEPOp.getReg());
2183     else
2184       GEPInfo.VgprParts.push_back(GEPOp.getReg());
2185   }
2186 
2187   AddrInfo.push_back(GEPInfo);
2188   getAddrModeInfo(*PtrMI, MRI, AddrInfo);
2189 }
2190 
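// Return true if this memory access is known to be uniform (kernel argument
// loads, constant or global pointers, or accesses tagged !amdgpu.uniform),
// making it a candidate for scalar memory instructions.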
2191 bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
2192   if (!MI.hasOneMemOperand())
2193     return false;
2194 
2195   const MachineMemOperand *MMO = *MI.memoperands_begin();
2196   const Value *Ptr = MMO->getValue();
2197 
2198   // UndefValue means this is a load of a kernel input.  These are uniform.
2199   // Sometimes LDS instructions have constant pointers.
2200   // If Ptr is null, then that means this mem operand contains a
2201   // PseudoSourceValue like GOT.
2202   if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
2203       isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
2204     return true;
2205 
2206   if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
2207     return true;
2208 
2209   const Instruction *I = dyn_cast<Instruction>(Ptr);
2210   return I && I->getMetadata("amdgpu.uniform");
2211 }
2212 
2213 bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
2214   for (const GEPInfo &GEPInfo : AddrInfo) {
2215     if (!GEPInfo.VgprParts.empty())
2216       return true;
2217   }
2218   return false;
2219 }
2220 
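// If the access is to LDS/GDS and the subtarget still requires M0 to be
// initialized for DS instructions, write -1 to M0 before selection so the
// whole address range is usable.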
2221 void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
2222   const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
2223   unsigned AS = PtrTy.getAddressSpace();
2224   if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
2225       STI.ldsRequiresM0Init()) {
2226     MachineBasicBlock *BB = I.getParent();
2227 
2228     // If DS instructions require M0 initialization, insert it before selecting.
2229     BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2230       .addImm(-1);
2231   }
2232 }
2233 
2234 bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
2235   MachineInstr &I) const {
2236   initM0(I);
2237   return selectImpl(I, *CoverageInfo);
2238 }
2239 
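// Select the buffer atomic cmpxchg pseudo. For non-flat address spaces this
// manually emits the MUBUF offset or addr64 form; the data operand packs both
// the value and the compare, so the result is copied out of the low half of a
// double-width temporary.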
2240 // TODO: No rtn optimization.
2241 bool AMDGPUInstructionSelector::selectG_AMDGPU_ATOMIC_CMPXCHG(
2242   MachineInstr &MI) const {
2243   Register PtrReg = MI.getOperand(1).getReg();
2244   const LLT PtrTy = MRI->getType(PtrReg);
2245   if (PtrTy.getAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
2246       STI.useFlatForGlobal())
2247     return selectImpl(MI, *CoverageInfo);
2248 
2249   Register DstReg = MI.getOperand(0).getReg();
2250   const LLT Ty = MRI->getType(DstReg);
2251   const bool Is64 = Ty.getSizeInBits() == 64;
2252   const unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
2253   Register TmpReg = MRI->createVirtualRegister(
2254     Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
2255 
2256   const DebugLoc &DL = MI.getDebugLoc();
2257   MachineBasicBlock *BB = MI.getParent();
2258 
2259   Register VAddr, RSrcReg, SOffset;
2260   int64_t Offset = 0;
2261 
2262   unsigned Opcode;
2263   if (selectMUBUFOffsetImpl(MI.getOperand(1), RSrcReg, SOffset, Offset)) {
2264     Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN :
2265                              AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN;
2266   } else if (selectMUBUFAddr64Impl(MI.getOperand(1), VAddr,
2267                                    RSrcReg, SOffset, Offset)) {
2268     Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN :
2269                     AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN;
2270   } else
2271     return selectImpl(MI, *CoverageInfo);
2272 
2273   auto MIB = BuildMI(*BB, &MI, DL, TII.get(Opcode), TmpReg)
2274     .addReg(MI.getOperand(2).getReg());
2275 
2276   if (VAddr)
2277     MIB.addReg(VAddr);
2278 
2279   MIB.addReg(RSrcReg);
2280   if (SOffset)
2281     MIB.addReg(SOffset);
2282   else
2283     MIB.addImm(0);
2284 
2285   MIB.addImm(Offset);
2286   MIB.addImm(0); // slc
2287   MIB.cloneMemRefs(MI);
2288 
2289   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), DstReg)
2290     .addReg(TmpReg, RegState::Kill, SubReg);
2291 
2292   MI.eraseFromParent();
2293 
2294   MRI->setRegClass(
2295     DstReg, Is64 ? &AMDGPU::VReg_64RegClass : &AMDGPU::VGPR_32RegClass);
2296   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2297 }
2298 
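// Select G_BRCOND. Scalar conditions are copied into SCC and branch with
// S_CBRANCH_SCC1; divergent conditions are copied into VCC and branch with
// S_CBRANCH_VCCNZ.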
2299 bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
2300   MachineBasicBlock *BB = I.getParent();
2301   MachineOperand &CondOp = I.getOperand(0);
2302   Register CondReg = CondOp.getReg();
2303   const DebugLoc &DL = I.getDebugLoc();
2304 
2305   unsigned BrOpcode;
2306   Register CondPhysReg;
2307   const TargetRegisterClass *ConstrainRC;
2308 
2309   // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
2310   // whether the branch is uniform when selecting the instruction. In
2311   // GlobalISel, we should push that decision into RegBankSelect. Assume for now
2312   // RegBankSelect knows what it's doing if the branch condition is scc, even
2313   // though it currently does not.
2314   if (!isVCC(CondReg, *MRI)) {
2315     if (MRI->getType(CondReg) != LLT::scalar(32))
2316       return false;
2317 
2318     CondPhysReg = AMDGPU::SCC;
2319     BrOpcode = AMDGPU::S_CBRANCH_SCC1;
2320     ConstrainRC = &AMDGPU::SReg_32RegClass;
2321   } else {
2322     // FIXME: Do we have to insert an and with exec here, like in SelectionDAG?
2323     // Based on the register bank, we sort of know that a VCC producer ands
2324     // inactive lanes with 0. What if there was a logical operation with vcc
2325     // producers in different blocks/with different exec masks?
2326     // FIXME: Should scc->vcc copies and with exec?
2327     CondPhysReg = TRI.getVCC();
2328     BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
2329     ConstrainRC = TRI.getBoolRC();
2330   }
2331 
2332   if (!MRI->getRegClassOrNull(CondReg))
2333     MRI->setRegClass(CondReg, ConstrainRC);
2334 
2335   BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
2336     .addReg(CondReg);
2337   BuildMI(*BB, &I, DL, TII.get(BrOpcode))
2338     .addMBB(I.getOperand(1).getMBB());
2339 
2340   I.eraseFromParent();
2341   return true;
2342 }
2343 
2344 bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
2345   MachineInstr &I) const {
2346   Register DstReg = I.getOperand(0).getReg();
2347   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2348   const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2349   I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
2350   if (IsVGPR)
2351     I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
2352 
2353   return RBI.constrainGenericRegister(
2354     DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
2355 }
2356 
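// Select G_PTRMASK as AND instructions. 64-bit pointers are processed as two
// 32-bit halves, and a half whose mask bits are all known ones is copied
// through instead of masked.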
2357 bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
2358   Register DstReg = I.getOperand(0).getReg();
2359   Register SrcReg = I.getOperand(1).getReg();
2360   Register MaskReg = I.getOperand(2).getReg();
2361   LLT Ty = MRI->getType(DstReg);
2362   LLT MaskTy = MRI->getType(MaskReg);
2363 
2364   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2365   const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2366   const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
2367   const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2368   if (DstRB != SrcRB) // Should only happen for hand-written MIR.
2369     return false;
2370 
2371   unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2372   const TargetRegisterClass &RegRC
2373     = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
2374 
2375   const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB,
2376                                                                   *MRI);
2377   const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB,
2378                                                                   *MRI);
2379   const TargetRegisterClass *MaskRC =
2380       TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB, *MRI);
2381 
2382   if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
2383       !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2384       !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
2385     return false;
2386 
2387   MachineBasicBlock *BB = I.getParent();
2388   const DebugLoc &DL = I.getDebugLoc();
2389   if (Ty.getSizeInBits() == 32) {
2390     assert(MaskTy.getSizeInBits() == 32 &&
2391            "ptrmask should have been narrowed during legalize");
2392 
2393     BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
2394       .addReg(SrcReg)
2395       .addReg(MaskReg);
2396     I.eraseFromParent();
2397     return true;
2398   }
2399 
2400   Register HiReg = MRI->createVirtualRegister(&RegRC);
2401   Register LoReg = MRI->createVirtualRegister(&RegRC);
2402 
2403   // Extract the subregisters from the source pointer.
2404   BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
2405     .addReg(SrcReg, 0, AMDGPU::sub0);
2406   BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
2407     .addReg(SrcReg, 0, AMDGPU::sub1);
2408 
2409   Register MaskedLo, MaskedHi;
2410 
2411   // Try to avoid emitting a bit operation when we only need to touch half of
2412   // the 64-bit pointer.
2413   APInt MaskOnes = KnownBits->getKnownOnes(MaskReg).zextOrSelf(64);
2414 
2415   const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
2416   const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
2417   if ((MaskOnes & MaskLo32) == MaskLo32) {
2418     // If all the bits in the low half are 1, we only need a copy for it.
2419     MaskedLo = LoReg;
2420   } else {
2421     // Extract the mask subregister and apply the and.
2422     Register MaskLo = MRI->createVirtualRegister(&RegRC);
2423     MaskedLo = MRI->createVirtualRegister(&RegRC);
2424 
2425     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
2426       .addReg(MaskReg, 0, AMDGPU::sub0);
2427     BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
2428       .addReg(LoReg)
2429       .addReg(MaskLo);
2430   }
2431 
2432   if ((MaskOnes & MaskHi32) == MaskHi32) {
2433     // If all the bits in the high half are 1, we only need a copy for it.
2434     MaskedHi = HiReg;
2435   } else {
2436     Register MaskHi = MRI->createVirtualRegister(&RegRC);
2437     MaskedHi = MRI->createVirtualRegister(&RegRC);
2438 
2439     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
2440       .addReg(MaskReg, 0, AMDGPU::sub1);
2441     BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
2442       .addReg(HiReg)
2443       .addReg(MaskHi);
2444   }
2445 
2446   BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2447     .addReg(MaskedLo)
2448     .addImm(AMDGPU::sub0)
2449     .addReg(MaskedHi)
2450     .addImm(AMDGPU::sub1);
2451   I.eraseFromParent();
2452   return true;
2453 }
2454 
2455 /// Return the register to use for the index value, and the subregister to use
2456 /// for the indirectly accessed register.
2457 static std::pair<Register, unsigned>
2458 computeIndirectRegIndex(MachineRegisterInfo &MRI,
2459                         const SIRegisterInfo &TRI,
2460                         const TargetRegisterClass *SuperRC,
2461                         Register IdxReg,
2462                         unsigned EltSize) {
2463   Register IdxBaseReg;
2464   int Offset;
2465   MachineInstr *Unused;
2466 
2467   std::tie(IdxBaseReg, Offset, Unused)
2468     = AMDGPU::getBaseWithConstantOffset(MRI, IdxReg);
2469   if (IdxBaseReg == AMDGPU::NoRegister) {
2470     // This will happen if the index is a known constant. This should ordinarily
2471     // be legalized out, but handle it as a register just in case.
2472     assert(Offset == 0);
2473     IdxBaseReg = IdxReg;
2474   }
2475 
2476   ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
2477 
2478   // Skip out of bounds offsets, or else we would end up using an undefined
2479   // register.
2480   if (static_cast<unsigned>(Offset) >= SubRegs.size())
2481     return std::make_pair(IdxReg, SubRegs[0]);
2482   return std::make_pair(IdxBaseReg, SubRegs[Offset]);
2483 }
2484 
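// Select a dynamic extract_vector_elt with an SGPR index: S_MOVRELS for SGPR
// vectors, and either M0-relative V_MOVRELS or VGPR index mode for VGPR
// vectors.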
2485 bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
2486   MachineInstr &MI) const {
2487   Register DstReg = MI.getOperand(0).getReg();
2488   Register SrcReg = MI.getOperand(1).getReg();
2489   Register IdxReg = MI.getOperand(2).getReg();
2490 
2491   LLT DstTy = MRI->getType(DstReg);
2492   LLT SrcTy = MRI->getType(SrcReg);
2493 
2494   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2495   const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2496   const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
2497 
2498   // The index must be scalar. If it wasn't, RegBankSelect should have moved
2499   // this into a waterfall loop.
2500   if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
2501     return false;
2502 
2503   const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB,
2504                                                                   *MRI);
2505   const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(DstTy, *DstRB,
2506                                                                   *MRI);
2507   if (!SrcRC || !DstRC)
2508     return false;
2509   if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2510       !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
2511       !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
2512     return false;
2513 
2514   MachineBasicBlock *BB = MI.getParent();
2515   const DebugLoc &DL = MI.getDebugLoc();
2516   const bool Is64 = DstTy.getSizeInBits() == 64;
2517 
2518   unsigned SubReg;
2519   std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, SrcRC, IdxReg,
2520                                                      DstTy.getSizeInBits() / 8);
2521 
2522   if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
2523     if (DstTy.getSizeInBits() != 32 && !Is64)
2524       return false;
2525 
2526     BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2527       .addReg(IdxReg);
2528 
2529     unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
2530     BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
2531       .addReg(SrcReg, 0, SubReg)
2532       .addReg(SrcReg, RegState::Implicit);
2533     MI.eraseFromParent();
2534     return true;
2535   }
2536 
2537   if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
2538     return false;
2539 
2540   if (!STI.useVGPRIndexMode()) {
2541     BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2542       .addReg(IdxReg);
2543     BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
2544       .addReg(SrcReg, 0, SubReg)
2545       .addReg(SrcReg, RegState::Implicit);
2546     MI.eraseFromParent();
2547     return true;
2548   }
2549 
2550   BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_ON))
2551     .addReg(IdxReg)
2552     .addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE);
2553   BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), DstReg)
2554     .addReg(SrcReg, 0, SubReg)
2555     .addReg(SrcReg, RegState::Implicit)
2556     .addReg(AMDGPU::M0, RegState::Implicit);
2557   BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_OFF));
2558 
2559   MI.eraseFromParent();
2560   return true;
2561 }
2562 
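// Select a dynamic insert_vector_elt with an SGPR index via the indirect
// register write pseudos, supplying the index through either M0 or VGPR index
// mode.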
2563 // TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
2564 bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
2565   MachineInstr &MI) const {
2566   Register DstReg = MI.getOperand(0).getReg();
2567   Register VecReg = MI.getOperand(1).getReg();
2568   Register ValReg = MI.getOperand(2).getReg();
2569   Register IdxReg = MI.getOperand(3).getReg();
2570 
2571   LLT VecTy = MRI->getType(DstReg);
2572   LLT ValTy = MRI->getType(ValReg);
2573   unsigned VecSize = VecTy.getSizeInBits();
2574   unsigned ValSize = ValTy.getSizeInBits();
2575 
2576   const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
2577   const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
2578   const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
2579 
2580   assert(VecTy.getElementType() == ValTy);
2581 
2582   // The index must be scalar. If it wasn't, RegBankSelect should have moved
2583   // this into a waterfall loop.
2584   if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
2585     return false;
2586 
2587   const TargetRegisterClass *VecRC = TRI.getRegClassForTypeOnBank(VecTy, *VecRB,
2588                                                                   *MRI);
2589   const TargetRegisterClass *ValRC = TRI.getRegClassForTypeOnBank(ValTy, *ValRB,
2590                                                                   *MRI);
2591 
2592   if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
2593       !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
2594       !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
2595       !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
2596     return false;
2597 
2598   if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
2599     return false;
2600 
2601   unsigned SubReg;
2602   std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg,
2603                                                      ValSize / 8);
2604 
2605   const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
2606                          STI.useVGPRIndexMode();
2607 
2608   MachineBasicBlock *BB = MI.getParent();
2609   const DebugLoc &DL = MI.getDebugLoc();
2610 
2611   if (IndexMode) {
2612     BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_ON))
2613       .addReg(IdxReg)
2614       .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE);
2615   } else {
2616     BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2617       .addReg(IdxReg);
2618   }
2619 
2620   const MCInstrDesc &RegWriteOp
2621     = TII.getIndirectRegWritePseudo(VecSize, ValSize,
2622                                     VecRB->getID() == AMDGPU::SGPRRegBankID);
2623   BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
2624     .addReg(VecReg)
2625     .addReg(ValReg)
2626     .addImm(SubReg);
2627 
2628   if (IndexMode)
2629     BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_OFF));
2630 
2631   MI.eraseFromParent();
2632   return true;
2633 }
2634 
2635 static bool isZeroOrUndef(int X) {
2636   return X == 0 || X == -1;
2637 }
2638 
2639 static bool isOneOrUndef(int X) {
2640   return X == 1 || X == -1;
2641 }
2642 
2643 static bool isZeroOrOneOrUndef(int X) {
2644   return X == 0 || X == 1 || X == -1;
2645 }
2646 
2647 // Normalize a VOP3P shuffle mask to refer to the low/high half of a single
2648 // 32-bit register.
2649 static Register normalizeVOP3PMask(int NewMask[2], Register Src0, Register Src1,
2650                                    ArrayRef<int> Mask) {
2651   NewMask[0] = Mask[0];
2652   NewMask[1] = Mask[1];
2653   if (isZeroOrOneOrUndef(Mask[0]) && isZeroOrOneOrUndef(Mask[1]))
2654     return Src0;
2655 
2656   assert(NewMask[0] == 2 || NewMask[0] == 3 || NewMask[0] == -1);
2657   assert(NewMask[1] == 2 || NewMask[1] == 3 || NewMask[1] == -1);
2658 
2659   // Shift the mask inputs to be 0/1.
2660   NewMask[0] = NewMask[0] == -1 ? -1 : NewMask[0] - 2;
2661   NewMask[1] = NewMask[1] == -1 ? -1 : NewMask[1] - 2;
2662   return Src1;
2663 }
2664 
2665 // This is only legal with VOP3P instructions as an aid to op_sel matching.
2666 bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR(
2667   MachineInstr &MI) const {
2668   Register DstReg = MI.getOperand(0).getReg();
2669   Register Src0Reg = MI.getOperand(1).getReg();
2670   Register Src1Reg = MI.getOperand(2).getReg();
2671   ArrayRef<int> ShufMask = MI.getOperand(3).getShuffleMask();
2672 
2673   const LLT V2S16 = LLT::vector(2, 16);
2674   if (MRI->getType(DstReg) != V2S16 || MRI->getType(Src0Reg) != V2S16)
2675     return false;
2676 
2677   if (!AMDGPU::isLegalVOP3PShuffleMask(ShufMask))
2678     return false;
2679 
2680   assert(ShufMask.size() == 2);
2681   assert(STI.hasSDWA() && "no target has VOP3P but not SDWA");
2682 
2683   MachineBasicBlock *MBB = MI.getParent();
2684   const DebugLoc &DL = MI.getDebugLoc();
2685 
2686   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2687   const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2688   const TargetRegisterClass &RC = IsVALU ?
2689     AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
2690 
2691   // Handle the degenerate case which should have folded out.
2692   if (ShufMask[0] == -1 && ShufMask[1] == -1) {
2693     BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), DstReg);
2694 
2695     MI.eraseFromParent();
2696     return RBI.constrainGenericRegister(DstReg, RC, *MRI);
2697   }
2698 
2699   // A legal VOP3P mask only reads one of the sources.
2700   int Mask[2];
2701   Register SrcVec = normalizeVOP3PMask(Mask, Src0Reg, Src1Reg, ShufMask);
2702 
2703   if (!RBI.constrainGenericRegister(DstReg, RC, *MRI) ||
2704       !RBI.constrainGenericRegister(SrcVec, RC, *MRI))
2705     return false;
2706 
2707   // TODO: This also should have been folded out
2708   if (isZeroOrUndef(Mask[0]) && isOneOrUndef(Mask[1])) {
2709     BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), DstReg)
2710       .addReg(SrcVec);
2711 
2712     MI.eraseFromParent();
2713     return true;
2714   }
2715 
2716   if (Mask[0] == 1 && Mask[1] == -1) {
2717     if (IsVALU) {
2718       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
2719         .addImm(16)
2720         .addReg(SrcVec);
2721     } else {
2722       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
2723         .addReg(SrcVec)
2724         .addImm(16);
2725     }
2726   } else if (Mask[0] == -1 && Mask[1] == 0) {
2727     if (IsVALU) {
2728       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), DstReg)
2729         .addImm(16)
2730         .addReg(SrcVec);
2731     } else {
2732       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHL_B32), DstReg)
2733         .addReg(SrcVec)
2734         .addImm(16);
2735     }
2736   } else if (Mask[0] == 0 && Mask[1] == 0) {
2737     if (IsVALU) {
2738       // Write low half of the register into the high half.
2739       MachineInstr *MovSDWA =
2740         BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2741         .addImm(0)                             // $src0_modifiers
2742         .addReg(SrcVec)                        // $src0
2743         .addImm(0)                             // $clamp
2744         .addImm(AMDGPU::SDWA::WORD_1)          // $dst_sel
2745         .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2746         .addImm(AMDGPU::SDWA::WORD_0)          // $src0_sel
2747         .addReg(SrcVec, RegState::Implicit);
2748       MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2749     } else {
2750       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg)
2751         .addReg(SrcVec)
2752         .addReg(SrcVec);
2753     }
2754   } else if (Mask[0] == 1 && Mask[1] == 1) {
2755     if (IsVALU) {
2756       // Write high half of the register into the low half.
2757       MachineInstr *MovSDWA =
2758         BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2759         .addImm(0)                             // $src0_modifiers
2760         .addReg(SrcVec)                        // $src0
2761         .addImm(0)                             // $clamp
2762         .addImm(AMDGPU::SDWA::WORD_0)          // $dst_sel
2763         .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2764         .addImm(AMDGPU::SDWA::WORD_1)          // $src0_sel
2765         .addReg(SrcVec, RegState::Implicit);
2766       MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2767     } else {
2768       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_HH_B32_B16), DstReg)
2769         .addReg(SrcVec)
2770         .addReg(SrcVec);
2771     }
2772   } else if (Mask[0] == 1 && Mask[1] == 0) {
2773     if (IsVALU) {
2774       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_ALIGNBIT_B32), DstReg)
2775         .addReg(SrcVec)
2776         .addReg(SrcVec)
2777         .addImm(16);
2778     } else {
2779       Register TmpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2780       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg)
2781         .addReg(SrcVec)
2782         .addImm(16);
2783       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg)
2784         .addReg(TmpReg)
2785         .addReg(SrcVec);
2786     }
2787   } else
2788     llvm_unreachable("all shuffle masks should be handled");
2789 
2790   MI.eraseFromParent();
2791   return true;
2792 }
2793 
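// Entry point for the selector: dispatch on the generic opcode, combining the
// imported TableGen patterns (selectImpl) with the manual selection routines
// above.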
2794 bool AMDGPUInstructionSelector::select(MachineInstr &I) {
2795   if (I.isPHI())
2796     return selectPHI(I);
2797 
2798   if (!I.isPreISelOpcode()) {
2799     if (I.isCopy())
2800       return selectCOPY(I);
2801     return true;
2802   }
2803 
2804   switch (I.getOpcode()) {
2805   case TargetOpcode::G_AND:
2806   case TargetOpcode::G_OR:
2807   case TargetOpcode::G_XOR:
2808     if (selectImpl(I, *CoverageInfo))
2809       return true;
2810     return selectG_AND_OR_XOR(I);
2811   case TargetOpcode::G_ADD:
2812   case TargetOpcode::G_SUB:
2813     if (selectImpl(I, *CoverageInfo))
2814       return true;
2815     return selectG_ADD_SUB(I);
2816   case TargetOpcode::G_UADDO:
2817   case TargetOpcode::G_USUBO:
2818   case TargetOpcode::G_UADDE:
2819   case TargetOpcode::G_USUBE:
2820     return selectG_UADDO_USUBO_UADDE_USUBE(I);
2821   case TargetOpcode::G_INTTOPTR:
2822   case TargetOpcode::G_BITCAST:
2823   case TargetOpcode::G_PTRTOINT:
2824     return selectCOPY(I);
2825   case TargetOpcode::G_CONSTANT:
2826   case TargetOpcode::G_FCONSTANT:
2827     return selectG_CONSTANT(I);
2828   case TargetOpcode::G_FNEG:
2829     if (selectImpl(I, *CoverageInfo))
2830       return true;
2831     return selectG_FNEG(I);
2832   case TargetOpcode::G_FABS:
2833     if (selectImpl(I, *CoverageInfo))
2834       return true;
2835     return selectG_FABS(I);
2836   case TargetOpcode::G_EXTRACT:
2837     return selectG_EXTRACT(I);
2838   case TargetOpcode::G_MERGE_VALUES:
2839   case TargetOpcode::G_BUILD_VECTOR:
2840   case TargetOpcode::G_CONCAT_VECTORS:
2841     return selectG_MERGE_VALUES(I);
2842   case TargetOpcode::G_UNMERGE_VALUES:
2843     return selectG_UNMERGE_VALUES(I);
2844   case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2845     return selectG_BUILD_VECTOR_TRUNC(I);
2846   case TargetOpcode::G_PTR_ADD:
2847     return selectG_PTR_ADD(I);
2848   case TargetOpcode::G_IMPLICIT_DEF:
2849     return selectG_IMPLICIT_DEF(I);
2850   case TargetOpcode::G_FREEZE:
2851     return selectCOPY(I);
2852   case TargetOpcode::G_INSERT:
2853     return selectG_INSERT(I);
2854   case TargetOpcode::G_INTRINSIC:
2855     return selectG_INTRINSIC(I);
2856   case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
2857     return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
2858   case TargetOpcode::G_ICMP:
2859     if (selectG_ICMP(I))
2860       return true;
2861     return selectImpl(I, *CoverageInfo);
2862   case TargetOpcode::G_LOAD:
2863   case TargetOpcode::G_STORE:
2864   case TargetOpcode::G_ATOMIC_CMPXCHG:
2865   case TargetOpcode::G_ATOMICRMW_XCHG:
2866   case TargetOpcode::G_ATOMICRMW_ADD:
2867   case TargetOpcode::G_ATOMICRMW_SUB:
2868   case TargetOpcode::G_ATOMICRMW_AND:
2869   case TargetOpcode::G_ATOMICRMW_OR:
2870   case TargetOpcode::G_ATOMICRMW_XOR:
2871   case TargetOpcode::G_ATOMICRMW_MIN:
2872   case TargetOpcode::G_ATOMICRMW_MAX:
2873   case TargetOpcode::G_ATOMICRMW_UMIN:
2874   case TargetOpcode::G_ATOMICRMW_UMAX:
2875   case TargetOpcode::G_ATOMICRMW_FADD:
2876   case AMDGPU::G_AMDGPU_ATOMIC_INC:
2877   case AMDGPU::G_AMDGPU_ATOMIC_DEC:
2878   case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
2879   case AMDGPU::G_AMDGPU_ATOMIC_FMAX:
2880     return selectG_LOAD_STORE_ATOMICRMW(I);
2881   case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
2882     return selectG_AMDGPU_ATOMIC_CMPXCHG(I);
2883   case TargetOpcode::G_SELECT:
2884     return selectG_SELECT(I);
2885   case TargetOpcode::G_TRUNC:
2886     return selectG_TRUNC(I);
2887   case TargetOpcode::G_SEXT:
2888   case TargetOpcode::G_ZEXT:
2889   case TargetOpcode::G_ANYEXT:
2890   case TargetOpcode::G_SEXT_INREG:
2891     if (selectImpl(I, *CoverageInfo))
2892       return true;
2893     return selectG_SZA_EXT(I);
2894   case TargetOpcode::G_BRCOND:
2895     return selectG_BRCOND(I);
2896   case TargetOpcode::G_GLOBAL_VALUE:
2897     return selectG_GLOBAL_VALUE(I);
2898   case TargetOpcode::G_PTRMASK:
2899     return selectG_PTRMASK(I);
2900   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2901     return selectG_EXTRACT_VECTOR_ELT(I);
2902   case TargetOpcode::G_INSERT_VECTOR_ELT:
2903     return selectG_INSERT_VECTOR_ELT(I);
2904   case TargetOpcode::G_SHUFFLE_VECTOR:
2905     return selectG_SHUFFLE_VECTOR(I);
2906   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
2907   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
2908     const AMDGPU::ImageDimIntrinsicInfo *Intr
2909       = AMDGPU::getImageDimIntrinsicInfo(I.getIntrinsicID());
2910     assert(Intr && "not an image intrinsic with image pseudo");
2911     return selectImageIntrinsic(I, Intr);
2912   }
2913   default:
2914     return selectImpl(I, *CoverageInfo);
2915   }
2916   return false;
2917 }
2918 
2919 InstructionSelector::ComplexRendererFns
2920 AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
2921   return {{
2922       [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
2923   }};
2924 
2925 }
2926 
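/// Fold a chain of G_FNEG/G_FABS defining \p Root into VOP3 source modifier
/// bits, returning the underlying source register together with the
/// accumulated SISrcMods mask. If modifiers were folded off an SGPR value, the
/// source is first copied to a VGPR to avoid violating the constant bus
/// restriction.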
2927 std::pair<Register, unsigned>
2928 AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root) const {
2929   Register Src = Root.getReg();
2930   Register OrigSrc = Src;
2931   unsigned Mods = 0;
2932   MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
2933 
2934   if (MI && MI->getOpcode() == AMDGPU::G_FNEG) {
2935     Src = MI->getOperand(1).getReg();
2936     Mods |= SISrcMods::NEG;
2937     MI = getDefIgnoringCopies(Src, *MRI);
2938   }
2939 
2940   if (MI && MI->getOpcode() == AMDGPU::G_FABS) {
2941     Src = MI->getOperand(1).getReg();
2942     Mods |= SISrcMods::ABS;
2943   }
2944 
2945   if (Mods != 0 &&
2946       RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
2947     MachineInstr *UseMI = Root.getParent();
2948 
2949     // If we looked through copies to find source modifiers on an SGPR operand,
2950     // we now have an SGPR register source. To avoid potentially violating the
2951     // constant bus restriction, we need to insert a copy to a VGPR.
2952     Register VGPRSrc = MRI->cloneVirtualRegister(OrigSrc);
2953     BuildMI(*UseMI->getParent(), UseMI, UseMI->getDebugLoc(),
2954             TII.get(AMDGPU::COPY), VGPRSrc)
2955       .addReg(Src);
2956     Src = VGPRSrc;
2957   }
2958 
2959   return std::make_pair(Src, Mods);
2960 }
2961 
2962 ///
2963 /// This will select either an SGPR or VGPR operand and will save us from
2964 /// having to write an extra tablegen pattern.
2965 InstructionSelector::ComplexRendererFns
2966 AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
2967   return {{
2968       [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
2969   }};
2970 }
2971 
2972 InstructionSelector::ComplexRendererFns
2973 AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
2974   Register Src;
2975   unsigned Mods;
2976   std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
2977 
2978   return {{
2979       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
2980       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
2981       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },    // clamp
2982       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }     // omod
2983   }};
2984 }
2985 
2986 InstructionSelector::ComplexRendererFns
2987 AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
2988   return {{
2989       [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
2990       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
2991       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // omod
2992   }};
2993 }
2994 
2995 InstructionSelector::ComplexRendererFns
2996 AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
2997   Register Src;
2998   unsigned Mods;
2999   std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3000 
3001   return {{
3002       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3003       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }  // src_mods
3004   }};
3005 }
3006 
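/// Match a VOP3 source with no foldable modifiers; fails if the value is
/// defined (possibly through copies) by a G_FNEG or G_FABS.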
3007 InstructionSelector::ComplexRendererFns
3008 AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
3009   Register Reg = Root.getReg();
3010   const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3011   if (Def && (Def->getOpcode() == AMDGPU::G_FNEG ||
3012               Def->getOpcode() == AMDGPU::G_FABS))
3013     return {};
3014   return {{
3015       [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3016   }};
3017 }
3018 
3019 std::pair<Register, unsigned>
3020 AMDGPUInstructionSelector::selectVOP3PModsImpl(
3021   Register Src, const MachineRegisterInfo &MRI) const {
3022   unsigned Mods = 0;
3023   MachineInstr *MI = MRI.getVRegDef(Src);
3024 
3025   if (MI && MI->getOpcode() == AMDGPU::G_FNEG &&
3026       // It's possible to see an f32 fneg here, but unlikely.
3027       // TODO: Treat f32 fneg as only high bit.
3028       MRI.getType(Src) == LLT::vector(2, 16)) {
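    // A full <2 x s16> fneg negates both halves, so toggle both the low (NEG)
    // and high (NEG_HI) negate bits.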
3029     Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
3030     Src = MI->getOperand(1).getReg();
3031     MI = MRI.getVRegDef(Src);
3032   }
3033 
3034   // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
3035 
3036   // Packed instructions do not have abs modifiers.
3037   Mods |= SISrcMods::OP_SEL_1;
3038 
3039   return std::make_pair(Src, Mods);
3040 }
3041 
3042 InstructionSelector::ComplexRendererFns
3043 AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
3044   MachineRegisterInfo &MRI
3045     = Root.getParent()->getParent()->getParent()->getRegInfo();
3046 
3047   Register Src;
3048   unsigned Mods;
3049   std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI);
3050 
3051   return {{
3052       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3053       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }  // src_mods
3054   }};
3055 }
3056 
3057 InstructionSelector::ComplexRendererFns
3058 AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const {
3059   Register Src;
3060   unsigned Mods;
3061   std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
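  // This form is only selected when the operand is known to be free of NaNs,
  // either globally (NoNaNsFPMath) or provably for this particular value.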
3062   if (!TM.Options.NoNaNsFPMath && !isKnownNeverNaN(Src, *MRI))
3063     return None;
3064 
3065   return {{
3066       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3067       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }  // src_mods
3068   }};
3069 }
3070 
3071 InstructionSelector::ComplexRendererFns
3072 AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
3073   // FIXME: Handle op_sel
3074   return {{
3075       [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
3076       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // src_mods
3077   }};
3078 }
3079 
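/// Match an SMRD address that is a uniform base pointer plus a constant offset
/// that can be encoded in the instruction's immediate offset field.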
3080 InstructionSelector::ComplexRendererFns
3081 AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
3082   SmallVector<GEPInfo, 4> AddrInfo;
3083   getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
3084 
3085   if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3086     return None;
3087 
3088   const GEPInfo &GEPInfo = AddrInfo[0];
3089   Optional<int64_t> EncodedImm =
3090       AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm, false);
3091   if (!EncodedImm)
3092     return None;
3093 
3094   Register PtrReg = GEPInfo.SgprParts[0];
3095   return {{
3096     [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3097     [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
3098   }};
3099 }
3100 
3101 InstructionSelector::ComplexRendererFns
3102 AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
3103   SmallVector<GEPInfo, 4> AddrInfo;
3104   getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
3105 
3106   if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3107     return None;
3108 
3109   const GEPInfo &GEPInfo = AddrInfo[0];
3110   Register PtrReg = GEPInfo.SgprParts[0];
3111   Optional<int64_t> EncodedImm =
3112       AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
3113   if (!EncodedImm)
3114     return None;
3115 
3116   return {{
3117     [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3118     [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
3119   }};
3120 }
3121 
3122 InstructionSelector::ComplexRendererFns
3123 AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
3124   MachineInstr *MI = Root.getParent();
3125   MachineBasicBlock *MBB = MI->getParent();
3126 
3127   SmallVector<GEPInfo, 4> AddrInfo;
3128   getAddrModeInfo(*MI, *MRI, AddrInfo);
3129 
3130   // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits;
3131   // then we can select all ptr + 32-bit offsets, not just immediate offsets.
3132   if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3133     return None;
3134 
3135   const GEPInfo &GEPInfo = AddrInfo[0];
3136   // SGPR offset is unsigned.
3137   if (!GEPInfo.Imm || GEPInfo.Imm < 0 || !isUInt<32>(GEPInfo.Imm))
3138     return None;
3139 
3140   // If we make it this far we have a load with a 32-bit immediate offset.
3141   // It is OK to select this using an SGPR offset, because we have already
3142   // failed trying to select this load into one of the _IMM variants since
3143   // the _IMM patterns are considered before the _SGPR patterns.
3144   Register PtrReg = GEPInfo.SgprParts[0];
3145   Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3146   BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg)
3147           .addImm(GEPInfo.Imm);
3148   return {{
3149     [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3150     [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); }
3151   }};
3152 }
3153 
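/// Match a flat address as base plus immediate offset, folding a constant
/// G_PTR_ADD addend into the offset field when the subtarget supports flat
/// instruction offsets and the value is legal for the access's address space.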
3154 template <bool Signed>
3155 InstructionSelector::ComplexRendererFns
3156 AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const {
3157   MachineInstr *MI = Root.getParent();
3158 
3159   InstructionSelector::ComplexRendererFns Default = {{
3160       [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
3161       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },  // offset
3162       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // slc
3163     }};
3164 
3165   if (!STI.hasFlatInstOffsets())
3166     return Default;
3167 
3168   const MachineInstr *OpDef = MRI->getVRegDef(Root.getReg());
3169   if (!OpDef || OpDef->getOpcode() != AMDGPU::G_PTR_ADD)
3170     return Default;
3171 
3172   Optional<int64_t> Offset =
3173     getConstantVRegVal(OpDef->getOperand(2).getReg(), *MRI);
3174   if (!Offset.hasValue())
3175     return Default;
3176 
3177   unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
3178   if (!TII.isLegalFLATOffset(Offset.getValue(), AddrSpace, Signed))
3179     return Default;
3180 
3181   Register BasePtr = OpDef->getOperand(1).getReg();
3182 
3183   return {{
3184       [=](MachineInstrBuilder &MIB) { MIB.addReg(BasePtr); },
3185       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset.getValue()); },
3186       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // slc
3187     }};
3188 }
3189 
3190 InstructionSelector::ComplexRendererFns
3191 AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
3192   return selectFlatOffsetImpl<false>(Root);
3193 }
3194 
3195 InstructionSelector::ComplexRendererFns
3196 AMDGPUInstructionSelector::selectFlatOffsetSigned(MachineOperand &Root) const {
3197   return selectFlatOffsetImpl<true>(Root);
3198 }
3199 
3200 static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) {
3201   auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>();
3202   return PSV && PSV->isStack();
3203 }
3204 
3205 InstructionSelector::ComplexRendererFns
3206 AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
3207   MachineInstr *MI = Root.getParent();
3208   MachineBasicBlock *MBB = MI->getParent();
3209   MachineFunction *MF = MBB->getParent();
3210   const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3211 
3212   int64_t Offset = 0;
3213   if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
3214       Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) {
3215     Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3216 
3217     // TODO: Should this be inside the render function? The iterator seems to
3218     // move.
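    // The MUBUF immediate offset field only holds 12 bits, so materialize the
    // 4096-aligned high part of the constant in a VGPR (vaddr) and keep the
    // low 12 bits as the immediate offset.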
3219     BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
3220             HighBits)
3221       .addImm(Offset & ~4095);
3222 
3223     return {{[=](MachineInstrBuilder &MIB) { // rsrc
3224                MIB.addReg(Info->getScratchRSrcReg());
3225              },
3226              [=](MachineInstrBuilder &MIB) { // vaddr
3227                MIB.addReg(HighBits);
3228              },
3229              [=](MachineInstrBuilder &MIB) { // soffset
3230                const MachineMemOperand *MMO = *MI->memoperands_begin();
3231                const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();
3232 
3233                if (isStackPtrRelative(PtrInfo))
3234                  MIB.addReg(Info->getStackPtrOffsetReg());
3235                else
3236                  MIB.addImm(0);
3237              },
3238              [=](MachineInstrBuilder &MIB) { // offset
3239                MIB.addImm(Offset & 4095);
3240              }}};
3241   }
3242 
3243   assert(Offset == 0 || Offset == -1);
3244 
3245   // Try to fold a frame index directly into the MUBUF vaddr field, and any
3246   // offsets.
3247   Optional<int> FI;
3248   Register VAddr = Root.getReg();
3249   if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) {
3250     if (isBaseWithConstantOffset(Root, *MRI)) {
3251       const MachineOperand &LHS = RootDef->getOperand(1);
3252       const MachineOperand &RHS = RootDef->getOperand(2);
3253       const MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg());
3254       const MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg());
3255       if (LHSDef && RHSDef) {
3256         int64_t PossibleOffset =
3257             RHSDef->getOperand(1).getCImm()->getSExtValue();
3258         if (SIInstrInfo::isLegalMUBUFImmOffset(PossibleOffset) &&
3259             (!STI.privateMemoryResourceIsRangeChecked() ||
3260              KnownBits->signBitIsZero(LHS.getReg()))) {
3261           if (LHSDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
3262             FI = LHSDef->getOperand(1).getIndex();
3263           else
3264             VAddr = LHS.getReg();
3265           Offset = PossibleOffset;
3266         }
3267       }
3268     } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
3269       FI = RootDef->getOperand(1).getIndex();
3270     }
3271   }
3272 
3273   return {{[=](MachineInstrBuilder &MIB) { // rsrc
3274              MIB.addReg(Info->getScratchRSrcReg());
3275            },
3276            [=](MachineInstrBuilder &MIB) { // vaddr
3277              if (FI.hasValue())
3278                MIB.addFrameIndex(FI.getValue());
3279              else
3280                MIB.addReg(VAddr);
3281            },
3282            [=](MachineInstrBuilder &MIB) { // soffset
3283              // If we don't know this private access is a local stack object, it
3284              // needs to be relative to the entry point's scratch wave offset.
3285              // TODO: Should split large offsets that don't fit like above.
3286              // TODO: Don't use scratch wave offset just because the offset
3287              // didn't fit.
3288              if (!Info->isEntryFunction() && FI.hasValue())
3289                MIB.addReg(Info->getStackPtrOffsetReg());
3290              else
3291                MIB.addImm(0);
3292            },
3293            [=](MachineInstrBuilder &MIB) { // offset
3294              MIB.addImm(Offset);
3295            }}};
3296 }
3297 
3298 bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
3299                                                 int64_t Offset,
3300                                                 unsigned OffsetBits) const {
3301   if ((OffsetBits == 16 && !isUInt<16>(Offset)) ||
3302       (OffsetBits == 8 && !isUInt<8>(Offset)))
3303     return false;
3304 
3305   if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
3306     return true;
3307 
3308   // On Southern Islands, instructions with a negative base value and an
3309   // offset don't seem to work.
3310   return KnownBits->signBitIsZero(Base);
3311 }
3312 
3313 InstructionSelector::ComplexRendererFns
3314 AMDGPUInstructionSelector::selectMUBUFScratchOffset(
3315     MachineOperand &Root) const {
3316   MachineInstr *MI = Root.getParent();
3317   MachineBasicBlock *MBB = MI->getParent();
3318 
3319   int64_t Offset = 0;
3320   if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
3321       !SIInstrInfo::isLegalMUBUFImmOffset(Offset))
3322     return {};
3323 
3324   const MachineFunction *MF = MBB->getParent();
3325   const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3326   const MachineMemOperand *MMO = *MI->memoperands_begin();
3327   const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();
3328 
3329   return {{
3330       [=](MachineInstrBuilder &MIB) { // rsrc
3331         MIB.addReg(Info->getScratchRSrcReg());
3332       },
3333       [=](MachineInstrBuilder &MIB) { // soffset
3334         if (isStackPtrRelative(PtrInfo))
3335           MIB.addReg(Info->getStackPtrOffsetReg());
3336         else
3337           MIB.addImm(0);
3338       },
3339       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
3340   }};
3341 }
3342 
3343 std::pair<Register, unsigned>
3344 AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
3345   const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
3346   if (!RootDef)
3347     return std::make_pair(Root.getReg(), 0);
3348 
3349   int64_t ConstAddr = 0;
3350 
3351   Register PtrBase;
3352   int64_t Offset;
3353   std::tie(PtrBase, Offset) =
3354     getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
3355 
3356   if (Offset) {
3357     if (isDSOffsetLegal(PtrBase, Offset, 16)) {
3358       // (add n0, c0)
3359       return std::make_pair(PtrBase, Offset);
3360     }
3361   } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
3362     // TODO
3363 
3364 
3365   } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
3366     // TODO
3367 
3368   }
3369 
3370   return std::make_pair(Root.getReg(), 0);
3371 }
3372 
3373 InstructionSelector::ComplexRendererFns
3374 AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
3375   Register Reg;
3376   unsigned Offset;
3377   std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
3378   return {{
3379       [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3380       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
3381     }};
3382 }
3383 
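/// Produce the base register and the two dword offsets used by paired 64-bit
/// DS accesses (ds_read2/ds_write2 style), where offset1 addresses the dword
/// immediately after offset0.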
3384 InstructionSelector::ComplexRendererFns
3385 AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
3386   Register Reg;
3387   unsigned Offset;
3388   std::tie(Reg, Offset) = selectDS64Bit4ByteAlignedImpl(Root);
3389   return {{
3390       [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3391       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
3392       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
3393     }};
3394 }
3395 
3396 std::pair<Register, unsigned>
3397 AMDGPUInstructionSelector::selectDS64Bit4ByteAlignedImpl(MachineOperand &Root) const {
3398   const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
3399   if (!RootDef)
3400     return std::make_pair(Root.getReg(), 0);
3401 
3402   int64_t ConstAddr = 0;
3403 
3404   Register PtrBase;
3405   int64_t Offset;
3406   std::tie(PtrBase, Offset) =
3407     getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
3408 
3409   if (Offset) {
3410     int64_t DWordOffset0 = Offset / 4;
3411     int64_t DWordOffset1 = DWordOffset0 + 1;
3412     if (isDSOffsetLegal(PtrBase, DWordOffset1, 8)) {
3413       // (add n0, c0)
3414       return std::make_pair(PtrBase, DWordOffset0);
3415     }
3416   } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
3417     // TODO
3418 
3419   } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
3420     // TODO
3421 
3422   }
3423 
3424   return std::make_pair(Root.getReg(), 0);
3425 }
3426 
3427 /// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
3428 /// the base value with the constant offset. There may be intervening copies
3429 /// between \p Root and the identified constant. Returns \p Root, 0 if this does
3430 /// not match the pattern.
3431 std::pair<Register, int64_t>
3432 AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
3433   Register Root, const MachineRegisterInfo &MRI) const {
3434   MachineInstr *RootI = MRI.getVRegDef(Root);
3435   if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
3436     return {Root, 0};
3437 
3438   MachineOperand &RHS = RootI->getOperand(2);
3439   Optional<ValueAndVReg> MaybeOffset
3440     = getConstantVRegValWithLookThrough(RHS.getReg(), MRI, true);
3441   if (!MaybeOffset)
3442     return {Root, 0};
3443   return {RootI->getOperand(1).getReg(), MaybeOffset->Value};
3444 }
3445 
3446 static void addZeroImm(MachineInstrBuilder &MIB) {
3447   MIB.addImm(0);
3448 }
3449 
3450 /// Return a resource descriptor for use with an arbitrary 64-bit pointer. If
3451 /// \p BasePtr is not valid, a null base pointer will be used.
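/// The descriptor is assembled as sub0_sub1 = base pointer (or zero) and
/// sub2_sub3 = the two supplied format words.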
3452 static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
3453                           uint32_t FormatLo, uint32_t FormatHi,
3454                           Register BasePtr) {
3455   Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
3456   Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
3457   Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3458   Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
3459 
3460   B.buildInstr(AMDGPU::S_MOV_B32)
3461     .addDef(RSrc2)
3462     .addImm(FormatLo);
3463   B.buildInstr(AMDGPU::S_MOV_B32)
3464     .addDef(RSrc3)
3465     .addImm(FormatHi);
3466 
3467   // Build the 64-bit half holding the constants before building the full
3468   // 128-bit register. If we are building multiple resource descriptors, this
3469   // will allow CSEing of the 2-component register.
3470   B.buildInstr(AMDGPU::REG_SEQUENCE)
3471     .addDef(RSrcHi)
3472     .addReg(RSrc2)
3473     .addImm(AMDGPU::sub0)
3474     .addReg(RSrc3)
3475     .addImm(AMDGPU::sub1);
3476 
3477   Register RSrcLo = BasePtr;
3478   if (!BasePtr) {
3479     RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3480     B.buildInstr(AMDGPU::S_MOV_B64)
3481       .addDef(RSrcLo)
3482       .addImm(0);
3483   }
3484 
3485   B.buildInstr(AMDGPU::REG_SEQUENCE)
3486     .addDef(RSrc)
3487     .addReg(RSrcLo)
3488     .addImm(AMDGPU::sub0_sub1)
3489     .addReg(RSrcHi)
3490     .addImm(AMDGPU::sub2_sub3);
3491 
3492   return RSrc;
3493 }
3494 
3495 static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
3496                                 const SIInstrInfo &TII, Register BasePtr) {
3497   uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
3498 
3499   // FIXME: Why are half the "default" bits ignored based on the addressing
3500   // mode?
3501   return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
3502 }
3503 
3504 static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
3505                                const SIInstrInfo &TII, Register BasePtr) {
3506   uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
3507 
3508   // FIXME: Why are half the "default" bits ignored based on the addressing
3509   // mode?
3510   return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
3511 }
3512 
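/// Decompose \p Src into MUBUF addressing components: any 32-bit constant
/// addend is peeled off into Offset, the remaining pointer becomes N0, and if
/// N0 is itself a G_PTR_ADD its two operands become N2 and N3.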
3513 AMDGPUInstructionSelector::MUBUFAddressData
3514 AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
3515   MUBUFAddressData Data;
3516   Data.N0 = Src;
3517 
3518   Register PtrBase;
3519   int64_t Offset;
3520 
3521   std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI);
3522   if (isUInt<32>(Offset)) {
3523     Data.N0 = PtrBase;
3524     Data.Offset = Offset;
3525   }
3526 
3527   if (MachineInstr *InputAdd
3528       = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
3529     Data.N2 = InputAdd->getOperand(1).getReg();
3530     Data.N3 = InputAdd->getOperand(2).getReg();
3531 
3532     // FIXME: Need to fix the extra SGPR->VGPR copies that get inserted.
3533     // FIXME: Don't assume this value was defined by operand 0 of its def.
3534     //
3535     // TODO: Remove this when we have copy folding optimizations after
3536     // RegBankSelect.
3537     Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
3538     Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
3539   }
3540 
3541   return Data;
3542 }
3543 
3544 /// Return whether the addr64 MUBUF mode should be used for the given address.
3545 bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
3546   // (ptr_add N2, N3) -> addr64, or
3547   // (ptr_add (ptr_add N2, N3), C1) -> addr64
3548   if (Addr.N2)
3549     return true;
3550 
3551   const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
3552   return N0Bank->getID() == AMDGPU::VGPRRegBankID;
3553 }
3554 
3555 /// Split an immediate offset \p ImmOffset depending on whether it fits in the
3556 /// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
3557 /// component.
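/// For example, an offset of 0x11000 does not fit in the 12-bit MUBUF
/// immediate, so it is materialized into a fresh SGPR used as \p SOffset and
/// \p ImmOffset is reset to 0.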
3558 void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
3559   MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
3560   if (SIInstrInfo::isLegalMUBUFImmOffset(ImmOffset))
3561     return;
3562 
3563   // Illegal offset, store it in soffset.
3564   SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3565   B.buildInstr(AMDGPU::S_MOV_B32)
3566     .addDef(SOffset)
3567     .addImm(ImmOffset);
3568   ImmOffset = 0;
3569 }
3570 
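/// Select the rsrc/vaddr/soffset/offset operands for a MUBUF access using
/// addr64 mode: the divergent (VGPR) part of the pointer is placed in vaddr
/// and any uniform base pointer is folded into the resource descriptor.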
3571 bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
3572   MachineOperand &Root, Register &VAddr, Register &RSrcReg,
3573   Register &SOffset, int64_t &Offset) const {
3574   // FIXME: Predicates should stop this from reaching here.
3575   // addr64 bit was removed for volcanic islands.
3576   if (!STI.hasAddr64() || STI.useFlatForGlobal())
3577     return false;
3578 
3579   MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
3580   if (!shouldUseAddr64(AddrData))
3581     return false;
3582 
3583   Register N0 = AddrData.N0;
3584   Register N2 = AddrData.N2;
3585   Register N3 = AddrData.N3;
3586   Offset = AddrData.Offset;
3587 
3588   // Base pointer for the SRD.
3589   Register SRDPtr;
3590 
3591   if (N2) {
3592     if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
3593       assert(N3);
3594       if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
3595         // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
3596         // addr64, and construct the default resource from a 0 address.
3597         VAddr = N0;
3598       } else {
3599         SRDPtr = N3;
3600         VAddr = N2;
3601       }
3602     } else {
3603       // N2 is not divergent.
3604       SRDPtr = N2;
3605       VAddr = N3;
3606     }
3607   } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
3608     // Use the default null pointer in the resource
3609     VAddr = N0;
3610   } else {
3611     // N0 -> offset, or
3612     // (N0 + C1) -> offset
3613     SRDPtr = N0;
3614   }
3615 
3616   MachineIRBuilder B(*Root.getParent());
3617   RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
3618   splitIllegalMUBUFOffset(B, SOffset, Offset);
3619   return true;
3620 }
3621 
3622 bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
3623   MachineOperand &Root, Register &RSrcReg, Register &SOffset,
3624   int64_t &Offset) const {
3625   MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
3626   if (shouldUseAddr64(AddrData))
3627     return false;
3628 
3629   // N0 -> offset, or
3630   // (N0 + C1) -> offset
3631   Register SRDPtr = AddrData.N0;
3632   Offset = AddrData.Offset;
3633 
3634   // TODO: Look through extensions for 32-bit soffset.
3635   MachineIRBuilder B(*Root.getParent());
3636 
3637   RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
3638   splitIllegalMUBUFOffset(B, SOffset, Offset);
3639   return true;
3640 }
3641 
3642 InstructionSelector::ComplexRendererFns
3643 AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
3644   Register VAddr;
3645   Register RSrcReg;
3646   Register SOffset;
3647   int64_t Offset = 0;
3648 
3649   if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
3650     return {};
3651 
3652   // FIXME: Use defaulted operands for trailing 0s and remove from the complex
3653   // pattern.
3654   return {{
3655       [=](MachineInstrBuilder &MIB) {  // rsrc
3656         MIB.addReg(RSrcReg);
3657       },
3658       [=](MachineInstrBuilder &MIB) { // vaddr
3659         MIB.addReg(VAddr);
3660       },
3661       [=](MachineInstrBuilder &MIB) { // soffset
3662         if (SOffset)
3663           MIB.addReg(SOffset);
3664         else
3665           MIB.addImm(0);
3666       },
3667       [=](MachineInstrBuilder &MIB) { // offset
3668         MIB.addImm(Offset);
3669       },
3670       addZeroImm, //  glc
3671       addZeroImm, //  slc
3672       addZeroImm, //  tfe
3673       addZeroImm, //  dlc
3674       addZeroImm  //  swz
3675     }};
3676 }
3677 
3678 InstructionSelector::ComplexRendererFns
3679 AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
3680   Register RSrcReg;
3681   Register SOffset;
3682   int64_t Offset = 0;
3683 
3684   if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
3685     return {};
3686 
3687   return {{
3688       [=](MachineInstrBuilder &MIB) {  // rsrc
3689         MIB.addReg(RSrcReg);
3690       },
3691       [=](MachineInstrBuilder &MIB) { // soffset
3692         if (SOffset)
3693           MIB.addReg(SOffset);
3694         else
3695           MIB.addImm(0);
3696       },
3697       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
3698       addZeroImm, //  glc
3699       addZeroImm, //  slc
3700       addZeroImm, //  tfe
3701       addZeroImm, //  dlc
3702       addZeroImm  //  swz
3703     }};
3704 }
3705 
3706 InstructionSelector::ComplexRendererFns
3707 AMDGPUInstructionSelector::selectMUBUFAddr64Atomic(MachineOperand &Root) const {
3708   Register VAddr;
3709   Register RSrcReg;
3710   Register SOffset;
3711   int64_t Offset = 0;
3712 
3713   if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
3714     return {};
3715 
3716   // FIXME: Use defaulted operands for trailing 0s and remove from the complex
3717   // pattern.
3718   return {{
3719       [=](MachineInstrBuilder &MIB) {  // rsrc
3720         MIB.addReg(RSrcReg);
3721       },
3722       [=](MachineInstrBuilder &MIB) { // vaddr
3723         MIB.addReg(VAddr);
3724       },
3725       [=](MachineInstrBuilder &MIB) { // soffset
3726         if (SOffset)
3727           MIB.addReg(SOffset);
3728         else
3729           MIB.addImm(0);
3730       },
3731       [=](MachineInstrBuilder &MIB) { // offset
3732         MIB.addImm(Offset);
3733       },
3734       addZeroImm //  slc
3735     }};
3736 }
3737 
3738 InstructionSelector::ComplexRendererFns
3739 AMDGPUInstructionSelector::selectMUBUFOffsetAtomic(MachineOperand &Root) const {
3740   Register RSrcReg;
3741   Register SOffset;
3742   int64_t Offset = 0;
3743 
3744   if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
3745     return {};
3746 
3747   return {{
3748       [=](MachineInstrBuilder &MIB) {  // rsrc
3749         MIB.addReg(RSrcReg);
3750       },
3751       [=](MachineInstrBuilder &MIB) { // soffset
3752         if (SOffset)
3753           MIB.addReg(SOffset);
3754         else
3755           MIB.addImm(0);
3756       },
3757       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
3758       addZeroImm //  slc
3759     }};
3760 }
3761 
3762 /// Get an immediate that must fit in 32 bits, treated as zero extended.
3763 static Optional<uint64_t> getConstantZext32Val(Register Reg,
3764                                                const MachineRegisterInfo &MRI) {
3765   // getConstantVRegVal sexts any values, so see if that matters.
3766   Optional<int64_t> OffsetVal = getConstantVRegVal(Reg, MRI);
3767   if (!OffsetVal || !isInt<32>(*OffsetVal))
3768     return None;
3769   return Lo_32(*OffsetVal);
3770 }
3771 
3772 InstructionSelector::ComplexRendererFns
3773 AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
3774   Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
3775   if (!OffsetVal)
3776     return {};
3777 
3778   Optional<int64_t> EncodedImm =
3779       AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
3780   if (!EncodedImm)
3781     return {};
3782 
3783   return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }  }};
3784 }
3785 
3786 InstructionSelector::ComplexRendererFns
3787 AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
3788   assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
3789 
3790   Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
3791   if (!OffsetVal)
3792     return {};
3793 
3794   Optional<int64_t> EncodedImm
3795     = AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
3796   if (!EncodedImm)
3797     return {};
3798 
3799   return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }  }};
3800 }
3801 
3802 void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
3803                                                  const MachineInstr &MI,
3804                                                  int OpIdx) const {
3805   assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
3806          "Expected G_CONSTANT");
3807   MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
3808 }
3809 
3810 void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
3811                                                 const MachineInstr &MI,
3812                                                 int OpIdx) const {
3813   assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
3814          "Expected G_CONSTANT");
3815   MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
3816 }
3817 
3818 void AMDGPUInstructionSelector::renderBitcastImm(MachineInstrBuilder &MIB,
3819                                                  const MachineInstr &MI,
3820                                                  int OpIdx) const {
3821   assert(OpIdx == -1);
3822 
3823   const MachineOperand &Op = MI.getOperand(1);
3824   if (MI.getOpcode() == TargetOpcode::G_FCONSTANT)
3825     MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
3826   else {
3827     assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
3828     MIB.addImm(Op.getCImm()->getSExtValue());
3829   }
3830 }
3831 
3832 void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
3833                                                 const MachineInstr &MI,
3834                                                 int OpIdx) const {
3835   assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
3836          "Expected G_CONSTANT");
3837   MIB.addImm(MI.getOperand(1).getCImm()->getValue().countPopulation());
3838 }
3839 
3840 /// This only really exists to satisfy the DAG type checking machinery, so it
3841 /// is a no-op here.
3842 void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
3843                                                 const MachineInstr &MI,
3844                                                 int OpIdx) const {
3845   MIB.addImm(MI.getOperand(OpIdx).getImm());
3846 }
3847 
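// The combined cache-policy style immediate operand packs several bits into
// one value; the renderers below extract the individual glc (bit 0),
// slc (bit 1), dlc (bit 2) and swz (bit 3) fields.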
3848 void AMDGPUInstructionSelector::renderExtractGLC(MachineInstrBuilder &MIB,
3849                                                  const MachineInstr &MI,
3850                                                  int OpIdx) const {
3851   assert(OpIdx >= 0 && "expected to match an immediate operand");
3852   MIB.addImm(MI.getOperand(OpIdx).getImm() & 1);
3853 }
3854 
3855 void AMDGPUInstructionSelector::renderExtractSLC(MachineInstrBuilder &MIB,
3856                                                  const MachineInstr &MI,
3857                                                  int OpIdx) const {
3858   assert(OpIdx >= 0 && "expected to match an immediate operand");
3859   MIB.addImm((MI.getOperand(OpIdx).getImm() >> 1) & 1);
3860 }
3861 
3862 void AMDGPUInstructionSelector::renderExtractDLC(MachineInstrBuilder &MIB,
3863                                                  const MachineInstr &MI,
3864                                                  int OpIdx) const {
3865   assert(OpIdx >= 0 && "expected to match an immediate operand");
3866   MIB.addImm((MI.getOperand(OpIdx).getImm() >> 2) & 1);
3867 }
3868 
3869 void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
3870                                                  const MachineInstr &MI,
3871                                                  int OpIdx) const {
3872   assert(OpIdx >= 0 && "expected to match an immediate operand");
3873   MIB.addImm((MI.getOperand(OpIdx).getImm() >> 3) & 1);
3874 }
3875 
3876 void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
3877                                                  const MachineInstr &MI,
3878                                                  int OpIdx) const {
3879   MIB.addFrameIndex(MI.getOperand(1).getIndex());
3880 }
3881 
3882 bool AMDGPUInstructionSelector::isInlineImmediate16(int64_t Imm) const {
3883   return AMDGPU::isInlinableLiteral16(Imm, STI.hasInv2PiInlineImm());
3884 }
3885 
3886 bool AMDGPUInstructionSelector::isInlineImmediate32(int64_t Imm) const {
3887   return AMDGPU::isInlinableLiteral32(Imm, STI.hasInv2PiInlineImm());
3888 }
3889 
3890 bool AMDGPUInstructionSelector::isInlineImmediate64(int64_t Imm) const {
3891   return AMDGPU::isInlinableLiteral64(Imm, STI.hasInv2PiInlineImm());
3892 }
3893 
3894 bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
3895   return TII.isInlineConstant(Imm);
3896 }
3897