1 //===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the InstructionSelector class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPUInstructionSelector.h"
15 #include "AMDGPUInstrInfo.h"
16 #include "AMDGPUGlobalISelUtils.h"
17 #include "AMDGPURegisterBankInfo.h"
18 #include "AMDGPUSubtarget.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
21 #include "SIMachineFunctionInfo.h"
22 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
23 #include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
24 #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
25 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
26 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
27 #include "llvm/CodeGen/GlobalISel/Utils.h"
28 #include "llvm/CodeGen/MachineBasicBlock.h"
29 #include "llvm/CodeGen/MachineFunction.h"
30 #include "llvm/CodeGen/MachineInstr.h"
31 #include "llvm/CodeGen/MachineInstrBuilder.h"
32 #include "llvm/CodeGen/MachineRegisterInfo.h"
33 #include "llvm/IR/Type.h"
34 #include "llvm/Support/Debug.h"
35 #include "llvm/Support/raw_ostream.h"
36 
37 #define DEBUG_TYPE "amdgpu-isel"
38 
39 using namespace llvm;
40 using namespace MIPatternMatch;
41 
42 static cl::opt<bool> AllowRiskySelect(
43   "amdgpu-global-isel-risky-select",
44   cl::desc("Allow GlobalISel to select cases that are likely to not work yet"),
45   cl::init(false),
46   cl::ReallyHidden);
47 
48 #define GET_GLOBALISEL_IMPL
49 #define AMDGPUSubtarget GCNSubtarget
50 #include "AMDGPUGenGlobalISel.inc"
51 #undef GET_GLOBALISEL_IMPL
52 #undef AMDGPUSubtarget
53 
54 AMDGPUInstructionSelector::AMDGPUInstructionSelector(
55     const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
56     const AMDGPUTargetMachine &TM)
57     : InstructionSelector(), TII(*STI.getInstrInfo()),
58       TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
59       STI(STI),
60       EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG),
61 #define GET_GLOBALISEL_PREDICATES_INIT
62 #include "AMDGPUGenGlobalISel.inc"
63 #undef GET_GLOBALISEL_PREDICATES_INIT
64 #define GET_GLOBALISEL_TEMPORARIES_INIT
65 #include "AMDGPUGenGlobalISel.inc"
66 #undef GET_GLOBALISEL_TEMPORARIES_INIT
67 {
68 }
69 
70 const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
71 
72 void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits &KB,
73                                         CodeGenCoverage &CoverageInfo) {
74   MRI = &MF.getRegInfo();
75   InstructionSelector::setupMF(MF, KB, CoverageInfo);
76 }
77 
78 bool AMDGPUInstructionSelector::isVCC(Register Reg,
79                                       const MachineRegisterInfo &MRI) const {
80   // The verifier is oblivious to s1 being a valid value for wavesize registers.
81   if (Reg.isPhysical())
82     return false;
83 
84   auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
85   const TargetRegisterClass *RC =
86       RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
87   if (RC) {
88     const LLT Ty = MRI.getType(Reg);
89     return RC->hasSuperClassEq(TRI.getBoolRC()) &&
90            Ty.isValid() && Ty.getSizeInBits() == 1;
91   }
92 
93   const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
94   return RB->getID() == AMDGPU::VCCRegBankID;
95 }
96 
97 bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
98                                                         unsigned NewOpc) const {
99   MI.setDesc(TII.get(NewOpc));
100   MI.RemoveOperand(1); // Remove intrinsic ID.
101   MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
102 
103   MachineOperand &Dst = MI.getOperand(0);
104   MachineOperand &Src = MI.getOperand(1);
105 
106   // TODO: This should be legalized to s32 if needed
107   if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
108     return false;
109 
110   const TargetRegisterClass *DstRC
111     = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
112   const TargetRegisterClass *SrcRC
113     = TRI.getConstrainedRegClassForOperand(Src, *MRI);
114   if (!DstRC || DstRC != SrcRC)
115     return false;
116 
117   return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
118          RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
119 }
120 
121 bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
122   const DebugLoc &DL = I.getDebugLoc();
123   MachineBasicBlock *BB = I.getParent();
124   I.setDesc(TII.get(TargetOpcode::COPY));
125 
126   const MachineOperand &Src = I.getOperand(1);
127   MachineOperand &Dst = I.getOperand(0);
128   Register DstReg = Dst.getReg();
129   Register SrcReg = Src.getReg();
130 
131   if (isVCC(DstReg, *MRI)) {
132     if (SrcReg == AMDGPU::SCC) {
133       const TargetRegisterClass *RC
134         = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
135       if (!RC)
136         return true;
137       return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
138     }
139 
140     if (!isVCC(SrcReg, *MRI)) {
141       // TODO: Should probably leave the copy and let copyPhysReg expand it.
142       if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
143         return false;
144 
145       const TargetRegisterClass *SrcRC
146         = TRI.getConstrainedRegClassForOperand(Src, *MRI);
147 
148       Register MaskedReg = MRI->createVirtualRegister(SrcRC);
149 
150       // We can't trust the high bits at this point, so clear them.
151 
152       // TODO: Skip masking high bits if def is known boolean.
153 
154       unsigned AndOpc = TRI.isSGPRClass(SrcRC) ?
155         AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
156       BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
157         .addImm(1)
158         .addReg(SrcReg);
159       BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
160         .addImm(0)
161         .addReg(MaskedReg);
162 
163       if (!MRI->getRegClassOrNull(SrcReg))
164         MRI->setRegClass(SrcReg, SrcRC);
165       I.eraseFromParent();
166       return true;
167     }
168 
169     const TargetRegisterClass *RC =
170       TRI.getConstrainedRegClassForOperand(Dst, *MRI);
171     if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
172       return false;
173 
174     return true;
175   }
176 
177   for (const MachineOperand &MO : I.operands()) {
178     if (Register::isPhysicalRegister(MO.getReg()))
179       continue;
180 
181     const TargetRegisterClass *RC =
182             TRI.getConstrainedRegClassForOperand(MO, *MRI);
183     if (!RC)
184       continue;
185     RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
186   }
187   return true;
188 }
189 
190 bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
191   const Register DefReg = I.getOperand(0).getReg();
192   const LLT DefTy = MRI->getType(DefReg);
193   if (DefTy == LLT::scalar(1)) {
194     if (!AllowRiskySelect) {
195       LLVM_DEBUG(dbgs() << "Skipping risky boolean phi\n");
196       return false;
197     }
198 
199     LLVM_DEBUG(dbgs() << "Selecting risky boolean phi\n");
200   }
201 
202   // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
203 
204   const RegClassOrRegBank &RegClassOrBank =
205     MRI->getRegClassOrRegBank(DefReg);
206 
207   const TargetRegisterClass *DefRC
208     = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
209   if (!DefRC) {
210     if (!DefTy.isValid()) {
211       LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
212       return false;
213     }
214 
215     const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
216     DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, *MRI);
217     if (!DefRC) {
218       LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
219       return false;
220     }
221   }
222 
223   // TODO: Verify that all registers have the same bank
224   I.setDesc(TII.get(TargetOpcode::PHI));
225   return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
226 }
227 
228 MachineOperand
229 AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
230                                            const TargetRegisterClass &SubRC,
231                                            unsigned SubIdx) const {
232 
233   MachineInstr *MI = MO.getParent();
234   MachineBasicBlock *BB = MO.getParent()->getParent();
235   Register DstReg = MRI->createVirtualRegister(&SubRC);
236 
237   if (MO.isReg()) {
238     unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
239     Register Reg = MO.getReg();
240     BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
241             .addReg(Reg, 0, ComposedSubIdx);
242 
243     return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
244                                      MO.isKill(), MO.isDead(), MO.isUndef(),
245                                      MO.isEarlyClobber(), 0, MO.isDebug(),
246                                      MO.isInternalRead());
247   }
248 
249   assert(MO.isImm());
250 
251   APInt Imm(64, MO.getImm());
252 
253   switch (SubIdx) {
254   default:
255     llvm_unreachable("do not know to split immediate with this sub index.");
256   case AMDGPU::sub0:
257     return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
258   case AMDGPU::sub1:
259     return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
260   }
261 }
262 
263 static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
264   switch (Opc) {
265   case AMDGPU::G_AND:
266     return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
267   case AMDGPU::G_OR:
268     return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
269   case AMDGPU::G_XOR:
270     return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
271   default:
272     llvm_unreachable("not a bit op");
273   }
274 }
275 
276 bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
277   Register DstReg = I.getOperand(0).getReg();
278   unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
279 
280   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
281   if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
282       DstRB->getID() != AMDGPU::VCCRegBankID)
283     return false;
284 
285   bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
286                             STI.isWave64());
287   I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));
288 
289   // Dead implicit-def of scc
290   I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
291                                          true, // isImp
292                                          false, // isKill
293                                          true)); // isDead
294   return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
295 }
296 
297 bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
298   MachineBasicBlock *BB = I.getParent();
299   MachineFunction *MF = BB->getParent();
300   Register DstReg = I.getOperand(0).getReg();
301   const DebugLoc &DL = I.getDebugLoc();
302   LLT Ty = MRI->getType(DstReg);
303   if (Ty.isVector())
304     return false;
305 
306   unsigned Size = Ty.getSizeInBits();
307   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
308   const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
309   const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;
310 
311   if (Size == 32) {
312     if (IsSALU) {
313       const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
314       MachineInstr *Add =
315         BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
316         .add(I.getOperand(1))
317         .add(I.getOperand(2));
318       I.eraseFromParent();
319       return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
320     }
321 
322     if (STI.hasAddNoCarry()) {
323       const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
324       I.setDesc(TII.get(Opc));
325       I.addOperand(*MF, MachineOperand::CreateImm(0));
326       I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
327       return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
328     }
329 
330     const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;
331 
332     Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
333     MachineInstr *Add
334       = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
335       .addDef(UnusedCarry, RegState::Dead)
336       .add(I.getOperand(1))
337       .add(I.getOperand(2))
338       .addImm(0);
339     I.eraseFromParent();
340     return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
341   }
342 
343   assert(!Sub && "illegal sub should not reach here");
344 
345   const TargetRegisterClass &RC
346     = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
347   const TargetRegisterClass &HalfRC
348     = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;
349 
350   MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
351   MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
352   MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
353   MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));
354 
355   Register DstLo = MRI->createVirtualRegister(&HalfRC);
356   Register DstHi = MRI->createVirtualRegister(&HalfRC);
357 
358   if (IsSALU) {
359     BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
360       .add(Lo1)
361       .add(Lo2);
362     BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
363       .add(Hi1)
364       .add(Hi2);
365   } else {
366     const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
367     Register CarryReg = MRI->createVirtualRegister(CarryRC);
368     BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
369       .addDef(CarryReg)
370       .add(Lo1)
371       .add(Lo2)
372       .addImm(0);
373     MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
374       .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
375       .add(Hi1)
376       .add(Hi2)
377       .addReg(CarryReg, RegState::Kill)
378       .addImm(0);
379 
380     if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
381       return false;
382   }
383 
384   BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
385     .addReg(DstLo)
386     .addImm(AMDGPU::sub0)
387     .addReg(DstHi)
388     .addImm(AMDGPU::sub1);
389 
390 
391   if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
392     return false;
393 
394   I.eraseFromParent();
395   return true;
396 }
397 
398 bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
399   MachineInstr &I) const {
400   MachineBasicBlock *BB = I.getParent();
401   MachineFunction *MF = BB->getParent();
402   const DebugLoc &DL = I.getDebugLoc();
403   Register Dst0Reg = I.getOperand(0).getReg();
404   Register Dst1Reg = I.getOperand(1).getReg();
405   const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
406                      I.getOpcode() == AMDGPU::G_UADDE;
407   const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
408                           I.getOpcode() == AMDGPU::G_USUBE;
409 
410   if (isVCC(Dst1Reg, *MRI)) {
411     unsigned NoCarryOpc =
412         IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
413     unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
414     I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
415     I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
416     I.addOperand(*MF, MachineOperand::CreateImm(0));
417     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
418   }
419 
420   Register Src0Reg = I.getOperand(2).getReg();
421   Register Src1Reg = I.getOperand(3).getReg();
422 
423   if (HasCarryIn) {
424     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
425       .addReg(I.getOperand(4).getReg());
426   }
427 
428   unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
429   unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
430 
431   BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
432     .add(I.getOperand(2))
433     .add(I.getOperand(3));
434   BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
435     .addReg(AMDGPU::SCC);
436 
437   if (!MRI->getRegClassOrNull(Dst1Reg))
438     MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
439 
440   if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
441       !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
442       !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
443     return false;
444 
445   if (HasCarryIn &&
446       !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
447                                     AMDGPU::SReg_32RegClass, *MRI))
448     return false;
449 
450   I.eraseFromParent();
451   return true;
452 }
453 
454 // TODO: We should probably legalize these to only using 32-bit results.
455 bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
456   MachineBasicBlock *BB = I.getParent();
457   Register DstReg = I.getOperand(0).getReg();
458   Register SrcReg = I.getOperand(1).getReg();
459   LLT DstTy = MRI->getType(DstReg);
460   LLT SrcTy = MRI->getType(SrcReg);
461   const unsigned SrcSize = SrcTy.getSizeInBits();
462   unsigned DstSize = DstTy.getSizeInBits();
463 
464   // TODO: Should handle any multiple of 32 offset.
465   unsigned Offset = I.getOperand(2).getImm();
466   if (Offset % 32 != 0 || DstSize > 128)
467     return false;
468 
469   // 16-bit operations really use 32-bit registers.
470   // FIXME: Probably should not allow 16-bit G_EXTRACT results.
471   if (DstSize == 16)
472     DstSize = 32;
473 
474   const TargetRegisterClass *DstRC =
475     TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
476   if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
477     return false;
478 
479   const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
480   const TargetRegisterClass *SrcRC =
481     TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
482   if (!SrcRC)
483     return false;
484   unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
485                                                          DstSize / 32);
486   SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
487   if (!SrcRC)
488     return false;
489 
490   SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
491                                     *SrcRC, I.getOperand(1));
492   const DebugLoc &DL = I.getDebugLoc();
493   BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
494     .addReg(SrcReg, 0, SubReg);
495 
496   I.eraseFromParent();
497   return true;
498 }
499 
500 bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
501   MachineBasicBlock *BB = MI.getParent();
502   Register DstReg = MI.getOperand(0).getReg();
503   LLT DstTy = MRI->getType(DstReg);
504   LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());
505 
506   const unsigned SrcSize = SrcTy.getSizeInBits();
507   if (SrcSize < 32)
508     return selectImpl(MI, *CoverageInfo);
509 
510   const DebugLoc &DL = MI.getDebugLoc();
511   const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
512   const unsigned DstSize = DstTy.getSizeInBits();
513   const TargetRegisterClass *DstRC =
514     TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
515   if (!DstRC)
516     return false;
517 
518   ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
519   MachineInstrBuilder MIB =
520     BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
521   for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
522     MachineOperand &Src = MI.getOperand(I + 1);
523     MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
524     MIB.addImm(SubRegs[I]);
525 
526     const TargetRegisterClass *SrcRC
527       = TRI.getConstrainedRegClassForOperand(Src, *MRI);
528     if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
529       return false;
530   }
531 
532   if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
533     return false;
534 
535   MI.eraseFromParent();
536   return true;
537 }
538 
539 bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
540   MachineBasicBlock *BB = MI.getParent();
541   const int NumDst = MI.getNumOperands() - 1;
542 
543   MachineOperand &Src = MI.getOperand(NumDst);
544 
545   Register SrcReg = Src.getReg();
546   Register DstReg0 = MI.getOperand(0).getReg();
547   LLT DstTy = MRI->getType(DstReg0);
548   LLT SrcTy = MRI->getType(SrcReg);
549 
550   const unsigned DstSize = DstTy.getSizeInBits();
551   const unsigned SrcSize = SrcTy.getSizeInBits();
552   const DebugLoc &DL = MI.getDebugLoc();
553   const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
554 
555   const TargetRegisterClass *SrcRC =
556     TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
557   if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
558     return false;
559 
560   // Note we could have mixed SGPR and VGPR destination banks for an SGPR
561   // source, and this relies on the fact that the same subregister indices are
562   // used for both.
563   ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
564   for (int I = 0, E = NumDst; I != E; ++I) {
565     MachineOperand &Dst = MI.getOperand(I);
566     BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
567       .addReg(SrcReg, 0, SubRegs[I]);
568 
569     // Make sure the subregister index is valid for the source register.
570     SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
571     if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
572       return false;
573 
574     const TargetRegisterClass *DstRC =
575       TRI.getConstrainedRegClassForOperand(Dst, *MRI);
576     if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
577       return false;
578   }
579 
580   MI.eraseFromParent();
581   return true;
582 }
583 
584 bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC(
585   MachineInstr &MI) const {
586   if (selectImpl(MI, *CoverageInfo))
587     return true;
588 
589   const LLT S32 = LLT::scalar(32);
590   const LLT V2S16 = LLT::vector(2, 16);
591 
592   Register Dst = MI.getOperand(0).getReg();
593   if (MRI->getType(Dst) != V2S16)
594     return false;
595 
596   const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
597   if (DstBank->getID() != AMDGPU::SGPRRegBankID)
598     return false;
599 
600   Register Src0 = MI.getOperand(1).getReg();
601   Register Src1 = MI.getOperand(2).getReg();
602   if (MRI->getType(Src0) != S32)
603     return false;
604 
605   const DebugLoc &DL = MI.getDebugLoc();
606   MachineBasicBlock *BB = MI.getParent();
607 
608   auto ConstSrc1 = getConstantVRegValWithLookThrough(Src1, *MRI, true, true);
609   if (ConstSrc1) {
610     auto ConstSrc0 = getConstantVRegValWithLookThrough(Src0, *MRI, true, true);
611     if (ConstSrc0) {
612       uint32_t Lo16 = static_cast<uint32_t>(ConstSrc0->Value) & 0xffff;
613       uint32_t Hi16 = static_cast<uint32_t>(ConstSrc1->Value) & 0xffff;
614 
615       BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst)
616         .addImm(Lo16 | (Hi16 << 16));
617       MI.eraseFromParent();
618       return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
619     }
620   }
621 
622   // TODO: This should probably be a combine somewhere
623   // (build_vector_trunc $src0, undef -> copy $src0
624   MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
625   if (Src1Def && Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
626     MI.setDesc(TII.get(AMDGPU::COPY));
627     MI.RemoveOperand(2);
628     return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI) &&
629            RBI.constrainGenericRegister(Src0, AMDGPU::SReg_32RegClass, *MRI);
630   }
631 
632   Register ShiftSrc0;
633   Register ShiftSrc1;
634   int64_t ShiftAmt;
635 
636   // With multiple uses of the shift, this will duplicate the shift and
637   // increase register pressure.
638   //
639   // (build_vector_trunc (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16)
640   //  => (S_PACK_HH_B32_B16 $src0, $src1)
641   // (build_vector_trunc $src0, (lshr_oneuse SReg_32:$src1, 16))
642   //  => (S_PACK_LH_B32_B16 $src0, $src1)
643   // (build_vector_trunc $src0, $src1)
644   //  => (S_PACK_LL_B32_B16 $src0, $src1)
645 
646   // FIXME: This is an inconvenient way to check a specific value
647   bool Shift0 = mi_match(
648     Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_ICst(ShiftAmt)))) &&
649     ShiftAmt == 16;
650 
651   bool Shift1 = mi_match(
652     Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_ICst(ShiftAmt)))) &&
653     ShiftAmt == 16;
654 
655   unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
656   if (Shift0 && Shift1) {
657     Opc = AMDGPU::S_PACK_HH_B32_B16;
658     MI.getOperand(1).setReg(ShiftSrc0);
659     MI.getOperand(2).setReg(ShiftSrc1);
660   } else if (Shift1) {
661     Opc = AMDGPU::S_PACK_LH_B32_B16;
662     MI.getOperand(2).setReg(ShiftSrc1);
663   } else if (Shift0 && ConstSrc1 && ConstSrc1->Value == 0) {
664     // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
665     auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
666       .addReg(ShiftSrc0)
667       .addImm(16);
668 
669     MI.eraseFromParent();
670     return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
671   }
672 
673   MI.setDesc(TII.get(Opc));
674   return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
675 }
676 
677 bool AMDGPUInstructionSelector::selectG_PTR_ADD(MachineInstr &I) const {
678   return selectG_ADD_SUB(I);
679 }
680 
681 bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
682   const MachineOperand &MO = I.getOperand(0);
683 
684   // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
685   // regbank check here is to know why getConstrainedRegClassForOperand failed.
686   const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
687   if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
688       (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
689     I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
690     return true;
691   }
692 
693   return false;
694 }
695 
696 bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
697   MachineBasicBlock *BB = I.getParent();
698 
699   Register DstReg = I.getOperand(0).getReg();
700   Register Src0Reg = I.getOperand(1).getReg();
701   Register Src1Reg = I.getOperand(2).getReg();
702   LLT Src1Ty = MRI->getType(Src1Reg);
703 
704   unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
705   unsigned InsSize = Src1Ty.getSizeInBits();
706 
707   int64_t Offset = I.getOperand(3).getImm();
708 
709   // FIXME: These cases should have been illegal and unnecessary to check here.
710   if (Offset % 32 != 0 || InsSize % 32 != 0)
711     return false;
712 
713   // Currently not handled by getSubRegFromChannel.
714   if (InsSize > 128)
715     return false;
716 
717   unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
718   if (SubReg == AMDGPU::NoSubRegister)
719     return false;
720 
721   const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
722   const TargetRegisterClass *DstRC =
723     TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
724   if (!DstRC)
725     return false;
726 
727   const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
728   const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
729   const TargetRegisterClass *Src0RC =
730     TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank, *MRI);
731   const TargetRegisterClass *Src1RC =
732     TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank, *MRI);
733 
734   // Deal with weird cases where the class only partially supports the subreg
735   // index.
736   Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
737   if (!Src0RC || !Src1RC)
738     return false;
739 
740   if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
741       !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
742       !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
743     return false;
744 
745   const DebugLoc &DL = I.getDebugLoc();
746   BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
747     .addReg(Src0Reg)
748     .addReg(Src1Reg)
749     .addImm(SubReg);
750 
751   I.eraseFromParent();
752   return true;
753 }
754 
755 bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
756   if (STI.getLDSBankCount() != 16)
757     return selectImpl(MI, *CoverageInfo);
758 
759   Register Dst = MI.getOperand(0).getReg();
760   Register Src0 = MI.getOperand(2).getReg();
761   Register M0Val = MI.getOperand(6).getReg();
762   if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
763       !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
764       !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
765     return false;
766 
767   // This requires 2 instructions. It is possible to write a pattern to support
768   // this, but the generated isel emitter doesn't correctly deal with multiple
769   // output instructions using the same physical register input. The copy to m0
770   // is incorrectly placed before the second instruction.
771   //
772   // TODO: Match source modifiers.
773 
774   Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
775   const DebugLoc &DL = MI.getDebugLoc();
776   MachineBasicBlock *MBB = MI.getParent();
777 
778   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
779     .addReg(M0Val);
780   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
781     .addImm(2)
782     .addImm(MI.getOperand(4).getImm())  // $attr
783     .addImm(MI.getOperand(3).getImm()); // $attrchan
784 
785   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
786     .addImm(0)                          // $src0_modifiers
787     .addReg(Src0)                       // $src0
788     .addImm(MI.getOperand(4).getImm())  // $attr
789     .addImm(MI.getOperand(3).getImm())  // $attrchan
790     .addImm(0)                          // $src2_modifiers
791     .addReg(InterpMov)                  // $src2 - 2 f16 values selected by high
792     .addImm(MI.getOperand(5).getImm())  // $high
793     .addImm(0)                          // $clamp
794     .addImm(0);                         // $omod
795 
796   MI.eraseFromParent();
797   return true;
798 }
799 
800 // Writelane is special in that it can use SGPR and M0 (which would normally
801 // count as using the constant bus twice - but in this case it is allowed since
802 // the lane selector doesn't count as a use of the constant bus). However, it is
803 // still required to abide by the 1 SGPR rule. Fix this up if we might have
804 // multiple SGPRs.
805 bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
806   // With a constant bus limit of at least 2, there's no issue.
807   if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
808     return selectImpl(MI, *CoverageInfo);
809 
810   MachineBasicBlock *MBB = MI.getParent();
811   const DebugLoc &DL = MI.getDebugLoc();
812   Register VDst = MI.getOperand(0).getReg();
813   Register Val = MI.getOperand(2).getReg();
814   Register LaneSelect = MI.getOperand(3).getReg();
815   Register VDstIn = MI.getOperand(4).getReg();
816 
817   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);
818 
819   Optional<ValueAndVReg> ConstSelect =
820     getConstantVRegValWithLookThrough(LaneSelect, *MRI, true, true);
821   if (ConstSelect) {
822     // The selector has to be an inline immediate, so we can use whatever for
823     // the other operands.
824     MIB.addReg(Val);
825     MIB.addImm(ConstSelect->Value &
826                maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
827   } else {
828     Optional<ValueAndVReg> ConstVal =
829       getConstantVRegValWithLookThrough(Val, *MRI, true, true);
830 
831     // If the value written is an inline immediate, we can get away without a
832     // copy to m0.
833     if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value,
834                                                  STI.hasInv2PiInlineImm())) {
835       MIB.addImm(ConstVal->Value);
836       MIB.addReg(LaneSelect);
837     } else {
838       MIB.addReg(Val);
839 
840       // If the lane selector was originally in a VGPR and copied with
841       // readfirstlane, there's a hazard to read the same SGPR from the
842       // VALU. Constrain to a different SGPR to help avoid needing a nop later.
843       RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);
844 
845       BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
846         .addReg(LaneSelect);
847       MIB.addReg(AMDGPU::M0);
848     }
849   }
850 
851   MIB.addReg(VDstIn);
852 
853   MI.eraseFromParent();
854   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
855 }
856 
857 // We need to handle this here because tablegen doesn't support matching
858 // instructions with multiple outputs.
859 bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
860   Register Dst0 = MI.getOperand(0).getReg();
861   Register Dst1 = MI.getOperand(1).getReg();
862 
863   LLT Ty = MRI->getType(Dst0);
864   unsigned Opc;
865   if (Ty == LLT::scalar(32))
866     Opc = AMDGPU::V_DIV_SCALE_F32;
867   else if (Ty == LLT::scalar(64))
868     Opc = AMDGPU::V_DIV_SCALE_F64;
869   else
870     return false;
871 
872   const DebugLoc &DL = MI.getDebugLoc();
873   MachineBasicBlock *MBB = MI.getParent();
874 
875   Register Numer = MI.getOperand(3).getReg();
876   Register Denom = MI.getOperand(4).getReg();
877   unsigned ChooseDenom = MI.getOperand(5).getImm();
878 
879   Register Src0 = ChooseDenom != 0 ? Numer : Denom;
880 
881   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
882     .addDef(Dst1)
883     .addUse(Src0)
884     .addUse(Denom)
885     .addUse(Numer);
886 
887   MI.eraseFromParent();
888   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
889 }
890 
891 bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
892   unsigned IntrinsicID = I.getIntrinsicID();
893   switch (IntrinsicID) {
894   case Intrinsic::amdgcn_if_break: {
895     MachineBasicBlock *BB = I.getParent();
896 
897     // FIXME: Manually selecting to avoid dealiing with the SReg_1 trick
898     // SelectionDAG uses for wave32 vs wave64.
899     BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
900       .add(I.getOperand(0))
901       .add(I.getOperand(2))
902       .add(I.getOperand(3));
903 
904     Register DstReg = I.getOperand(0).getReg();
905     Register Src0Reg = I.getOperand(2).getReg();
906     Register Src1Reg = I.getOperand(3).getReg();
907 
908     I.eraseFromParent();
909 
910     for (Register Reg : { DstReg, Src0Reg, Src1Reg })
911       MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
912 
913     return true;
914   }
915   case Intrinsic::amdgcn_interp_p1_f16:
916     return selectInterpP1F16(I);
917   case Intrinsic::amdgcn_wqm:
918     return constrainCopyLikeIntrin(I, AMDGPU::WQM);
919   case Intrinsic::amdgcn_softwqm:
920     return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
921   case Intrinsic::amdgcn_wwm:
922     return constrainCopyLikeIntrin(I, AMDGPU::WWM);
923   case Intrinsic::amdgcn_writelane:
924     return selectWritelane(I);
925   case Intrinsic::amdgcn_div_scale:
926     return selectDivScale(I);
927   case Intrinsic::amdgcn_icmp:
928     return selectIntrinsicIcmp(I);
929   case Intrinsic::amdgcn_ballot:
930     return selectBallot(I);
931   case Intrinsic::amdgcn_reloc_constant:
932     return selectRelocConstant(I);
933   case Intrinsic::returnaddress:
934     return selectReturnAddress(I);
935   default:
936     return selectImpl(I, *CoverageInfo);
937   }
938 }
939 
940 static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size) {
941   if (Size != 32 && Size != 64)
942     return -1;
943   switch (P) {
944   default:
945     llvm_unreachable("Unknown condition code!");
946   case CmpInst::ICMP_NE:
947     return Size == 32 ? AMDGPU::V_CMP_NE_U32_e64 : AMDGPU::V_CMP_NE_U64_e64;
948   case CmpInst::ICMP_EQ:
949     return Size == 32 ? AMDGPU::V_CMP_EQ_U32_e64 : AMDGPU::V_CMP_EQ_U64_e64;
950   case CmpInst::ICMP_SGT:
951     return Size == 32 ? AMDGPU::V_CMP_GT_I32_e64 : AMDGPU::V_CMP_GT_I64_e64;
952   case CmpInst::ICMP_SGE:
953     return Size == 32 ? AMDGPU::V_CMP_GE_I32_e64 : AMDGPU::V_CMP_GE_I64_e64;
954   case CmpInst::ICMP_SLT:
955     return Size == 32 ? AMDGPU::V_CMP_LT_I32_e64 : AMDGPU::V_CMP_LT_I64_e64;
956   case CmpInst::ICMP_SLE:
957     return Size == 32 ? AMDGPU::V_CMP_LE_I32_e64 : AMDGPU::V_CMP_LE_I64_e64;
958   case CmpInst::ICMP_UGT:
959     return Size == 32 ? AMDGPU::V_CMP_GT_U32_e64 : AMDGPU::V_CMP_GT_U64_e64;
960   case CmpInst::ICMP_UGE:
961     return Size == 32 ? AMDGPU::V_CMP_GE_U32_e64 : AMDGPU::V_CMP_GE_U64_e64;
962   case CmpInst::ICMP_ULT:
963     return Size == 32 ? AMDGPU::V_CMP_LT_U32_e64 : AMDGPU::V_CMP_LT_U64_e64;
964   case CmpInst::ICMP_ULE:
965     return Size == 32 ? AMDGPU::V_CMP_LE_U32_e64 : AMDGPU::V_CMP_LE_U64_e64;
966   }
967 }
968 
969 int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
970                                               unsigned Size) const {
971   if (Size == 64) {
972     if (!STI.hasScalarCompareEq64())
973       return -1;
974 
975     switch (P) {
976     case CmpInst::ICMP_NE:
977       return AMDGPU::S_CMP_LG_U64;
978     case CmpInst::ICMP_EQ:
979       return AMDGPU::S_CMP_EQ_U64;
980     default:
981       return -1;
982     }
983   }
984 
985   if (Size != 32)
986     return -1;
987 
988   switch (P) {
989   case CmpInst::ICMP_NE:
990     return AMDGPU::S_CMP_LG_U32;
991   case CmpInst::ICMP_EQ:
992     return AMDGPU::S_CMP_EQ_U32;
993   case CmpInst::ICMP_SGT:
994     return AMDGPU::S_CMP_GT_I32;
995   case CmpInst::ICMP_SGE:
996     return AMDGPU::S_CMP_GE_I32;
997   case CmpInst::ICMP_SLT:
998     return AMDGPU::S_CMP_LT_I32;
999   case CmpInst::ICMP_SLE:
1000     return AMDGPU::S_CMP_LE_I32;
1001   case CmpInst::ICMP_UGT:
1002     return AMDGPU::S_CMP_GT_U32;
1003   case CmpInst::ICMP_UGE:
1004     return AMDGPU::S_CMP_GE_U32;
1005   case CmpInst::ICMP_ULT:
1006     return AMDGPU::S_CMP_LT_U32;
1007   case CmpInst::ICMP_ULE:
1008     return AMDGPU::S_CMP_LE_U32;
1009   default:
1010     llvm_unreachable("Unknown condition code!");
1011   }
1012 }
1013 
1014 bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const {
1015   MachineBasicBlock *BB = I.getParent();
1016   const DebugLoc &DL = I.getDebugLoc();
1017 
1018   Register SrcReg = I.getOperand(2).getReg();
1019   unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1020 
1021   auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
1022 
1023   Register CCReg = I.getOperand(0).getReg();
1024   if (!isVCC(CCReg, *MRI)) {
1025     int Opcode = getS_CMPOpcode(Pred, Size);
1026     if (Opcode == -1)
1027       return false;
1028     MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
1029             .add(I.getOperand(2))
1030             .add(I.getOperand(3));
1031     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
1032       .addReg(AMDGPU::SCC);
1033     bool Ret =
1034         constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
1035         RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
1036     I.eraseFromParent();
1037     return Ret;
1038   }
1039 
1040   int Opcode = getV_CMPOpcode(Pred, Size);
1041   if (Opcode == -1)
1042     return false;
1043 
1044   MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode),
1045             I.getOperand(0).getReg())
1046             .add(I.getOperand(2))
1047             .add(I.getOperand(3));
1048   RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
1049                                *TRI.getBoolRC(), *MRI);
1050   bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1051   I.eraseFromParent();
1052   return Ret;
1053 }
1054 
1055 bool AMDGPUInstructionSelector::selectIntrinsicIcmp(MachineInstr &I) const {
1056   Register Dst = I.getOperand(0).getReg();
1057   if (isVCC(Dst, *MRI))
1058     return false;
1059 
1060   if (MRI->getType(Dst).getSizeInBits() != STI.getWavefrontSize())
1061     return false;
1062 
1063   MachineBasicBlock *BB = I.getParent();
1064   const DebugLoc &DL = I.getDebugLoc();
1065   Register SrcReg = I.getOperand(2).getReg();
1066   unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1067   auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
1068 
1069   int Opcode = getV_CMPOpcode(Pred, Size);
1070   if (Opcode == -1)
1071     return false;
1072 
1073   MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst)
1074                            .add(I.getOperand(2))
1075                            .add(I.getOperand(3));
1076   RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(), *TRI.getBoolRC(),
1077                                *MRI);
1078   bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1079   I.eraseFromParent();
1080   return Ret;
1081 }
1082 
1083 bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1084   MachineBasicBlock *BB = I.getParent();
1085   const DebugLoc &DL = I.getDebugLoc();
1086   Register DstReg = I.getOperand(0).getReg();
1087   const unsigned Size = MRI->getType(DstReg).getSizeInBits();
1088   const bool Is64 = Size == 64;
1089 
1090   if (Size != STI.getWavefrontSize())
1091     return false;
1092 
1093   Optional<ValueAndVReg> Arg =
1094       getConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI, true);
1095 
1096   if (Arg.hasValue()) {
1097     const int64_t Value = Arg.getValue().Value;
1098     if (Value == 0) {
1099       unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1100       BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
1101     } else if (Value == -1) { // all ones
1102       Register SrcReg = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
1103       BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
1104     } else
1105       return false;
1106   } else {
1107     Register SrcReg = I.getOperand(2).getReg();
1108     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
1109   }
1110 
1111   I.eraseFromParent();
1112   return true;
1113 }
1114 
1115 bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1116   Register DstReg = I.getOperand(0).getReg();
1117   const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1118   const TargetRegisterClass *DstRC =
1119     TRI.getRegClassForSizeOnBank(32, *DstBank, *MRI);
1120   if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
1121     return false;
1122 
1123   const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1124 
1125   Module *M = MF->getFunction().getParent();
1126   const MDNode *Metadata = I.getOperand(2).getMetadata();
1127   auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
1128   auto RelocSymbol = cast<GlobalVariable>(
1129     M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
1130 
1131   MachineBasicBlock *BB = I.getParent();
1132   BuildMI(*BB, &I, I.getDebugLoc(),
1133           TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
1134     .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);
1135 
1136   I.eraseFromParent();
1137   return true;
1138 }
1139 
1140 bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
1141   MachineBasicBlock *MBB = I.getParent();
1142   MachineFunction &MF = *MBB->getParent();
1143   const DebugLoc &DL = I.getDebugLoc();
1144 
1145   MachineOperand &Dst = I.getOperand(0);
1146   Register DstReg = Dst.getReg();
1147   unsigned Depth = I.getOperand(2).getImm();
1148 
1149   const TargetRegisterClass *RC
1150     = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
1151   if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
1152       !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
1153     return false;
1154 
1155   // Check for kernel and shader functions
1156   if (Depth != 0 ||
1157       MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1158     BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
1159       .addImm(0);
1160     I.eraseFromParent();
1161     return true;
1162   }
1163 
1164   MachineFrameInfo &MFI = MF.getFrameInfo();
1165   // There is a call to @llvm.returnaddress in this function
1166   MFI.setReturnAddressIsTaken(true);
1167 
1168   // Get the return address reg and mark it as an implicit live-in
1169   Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
1170   Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
1171                                              AMDGPU::SReg_64RegClass);
1172   BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1173     .addReg(LiveIn);
1174   I.eraseFromParent();
1175   return true;
1176 }
1177 
1178 bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1179   // FIXME: Manually selecting to avoid dealiing with the SReg_1 trick
1180   // SelectionDAG uses for wave32 vs wave64.
1181   MachineBasicBlock *BB = MI.getParent();
1182   BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1183       .add(MI.getOperand(1));
1184 
1185   Register Reg = MI.getOperand(1).getReg();
1186   MI.eraseFromParent();
1187 
1188   if (!MRI->getRegClassOrNull(Reg))
1189     MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1190   return true;
1191 }
1192 
1193 bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1194   MachineInstr &MI, Intrinsic::ID IntrID) const {
1195   MachineBasicBlock *MBB = MI.getParent();
1196   MachineFunction *MF = MBB->getParent();
1197   const DebugLoc &DL = MI.getDebugLoc();
1198 
1199   unsigned IndexOperand = MI.getOperand(7).getImm();
1200   bool WaveRelease = MI.getOperand(8).getImm() != 0;
1201   bool WaveDone = MI.getOperand(9).getImm() != 0;
1202 
1203   if (WaveDone && !WaveRelease)
1204     report_fatal_error("ds_ordered_count: wave_done requires wave_release");
1205 
1206   unsigned OrderedCountIndex = IndexOperand & 0x3f;
1207   IndexOperand &= ~0x3f;
1208   unsigned CountDw = 0;
1209 
1210   if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
1211     CountDw = (IndexOperand >> 24) & 0xf;
1212     IndexOperand &= ~(0xf << 24);
1213 
1214     if (CountDw < 1 || CountDw > 4) {
1215       report_fatal_error(
1216         "ds_ordered_count: dword count must be between 1 and 4");
1217     }
1218   }
1219 
1220   if (IndexOperand)
1221     report_fatal_error("ds_ordered_count: bad index operand");
1222 
1223   unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1224   unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);
1225 
1226   unsigned Offset0 = OrderedCountIndex << 2;
1227   unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) |
1228                      (Instruction << 4);
1229 
1230   if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
1231     Offset1 |= (CountDw - 1) << 6;
1232 
1233   unsigned Offset = Offset0 | (Offset1 << 8);
1234 
1235   Register M0Val = MI.getOperand(2).getReg();
1236   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1237     .addReg(M0Val);
1238 
1239   Register DstReg = MI.getOperand(0).getReg();
1240   Register ValReg = MI.getOperand(3).getReg();
1241   MachineInstrBuilder DS =
1242     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
1243       .addReg(ValReg)
1244       .addImm(Offset)
1245       .cloneMemRefs(MI);
1246 
1247   if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
1248     return false;
1249 
1250   bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
1251   MI.eraseFromParent();
1252   return Ret;
1253 }
1254 
1255 static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1256   switch (IntrID) {
1257   case Intrinsic::amdgcn_ds_gws_init:
1258     return AMDGPU::DS_GWS_INIT;
1259   case Intrinsic::amdgcn_ds_gws_barrier:
1260     return AMDGPU::DS_GWS_BARRIER;
1261   case Intrinsic::amdgcn_ds_gws_sema_v:
1262     return AMDGPU::DS_GWS_SEMA_V;
1263   case Intrinsic::amdgcn_ds_gws_sema_br:
1264     return AMDGPU::DS_GWS_SEMA_BR;
1265   case Intrinsic::amdgcn_ds_gws_sema_p:
1266     return AMDGPU::DS_GWS_SEMA_P;
1267   case Intrinsic::amdgcn_ds_gws_sema_release_all:
1268     return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1269   default:
1270     llvm_unreachable("not a gws intrinsic");
1271   }
1272 }
1273 
1274 bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
1275                                                      Intrinsic::ID IID) const {
1276   if (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1277       !STI.hasGWSSemaReleaseAll())
1278     return false;
1279 
1280   // intrinsic ID, vsrc, offset
1281   const bool HasVSrc = MI.getNumOperands() == 3;
1282   assert(HasVSrc || MI.getNumOperands() == 2);
1283 
1284   Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
1285   const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
1286   if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
1287     return false;
1288 
1289   MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1290   assert(OffsetDef);
1291 
1292   unsigned ImmOffset;
1293 
1294   MachineBasicBlock *MBB = MI.getParent();
1295   const DebugLoc &DL = MI.getDebugLoc();
1296 
1297   MachineInstr *Readfirstlane = nullptr;
1298 
1299   // If we legalized the VGPR input, strip out the readfirstlane to analyze the
1300   // incoming offset, in case there's an add of a constant. We'll have to put it
1301   // back later.
1302   if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
1303     Readfirstlane = OffsetDef;
1304     BaseOffset = OffsetDef->getOperand(1).getReg();
1305     OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1306   }
1307 
1308   if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
1309     // If we have a constant offset, try to use the 0 in m0 as the base.
1310     // TODO: Look into changing the default m0 initialization value. If the
1311     // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
1312     // the immediate offset.
1313 
1314     ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
1315     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1316       .addImm(0);
1317   } else {
1318     std::tie(BaseOffset, ImmOffset, OffsetDef)
1319       = AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset);
1320 
1321     if (Readfirstlane) {
1322       // We have the constant offset now, so put the readfirstlane back on the
1323       // variable component.
1324       if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
1325         return false;
1326 
1327       Readfirstlane->getOperand(1).setReg(BaseOffset);
1328       BaseOffset = Readfirstlane->getOperand(0).getReg();
1329     } else {
1330       if (!RBI.constrainGenericRegister(BaseOffset,
1331                                         AMDGPU::SReg_32RegClass, *MRI))
1332         return false;
1333     }
1334 
1335     Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1336     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
1337       .addReg(BaseOffset)
1338       .addImm(16);
1339 
1340     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1341       .addReg(M0Base);
1342   }
1343 
1344   // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
1345   // offset field) % 64. Some versions of the programming guide omit the m0
1346   // part, or claim it's from offset 0.
1347   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));
1348 
1349   if (HasVSrc) {
1350     Register VSrc = MI.getOperand(1).getReg();
1351     MIB.addReg(VSrc);
1352     if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
1353       return false;
1354   }
1355 
1356   MIB.addImm(ImmOffset)
1357      .addImm(-1) // $gds
1358      .cloneMemRefs(MI);
1359 
1360   MI.eraseFromParent();
1361   return true;
1362 }
1363 
1364 bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
1365                                                       bool IsAppend) const {
1366   Register PtrBase = MI.getOperand(2).getReg();
1367   LLT PtrTy = MRI->getType(PtrBase);
1368   bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
1369 
1370   unsigned Offset;
1371   std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
1372 
1373   // TODO: Should this try to look through readfirstlane like GWS?
1374   if (!isDSOffsetLegal(PtrBase, Offset, 16)) {
1375     PtrBase = MI.getOperand(2).getReg();
1376     Offset = 0;
1377   }
1378 
1379   MachineBasicBlock *MBB = MI.getParent();
1380   const DebugLoc &DL = MI.getDebugLoc();
1381   const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
1382 
1383   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1384     .addReg(PtrBase);
1385   if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
1386     return false;
1387 
1388   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
1389     .addImm(Offset)
1390     .addImm(IsGDS ? -1 : 0)
1391     .cloneMemRefs(MI);
1392   MI.eraseFromParent();
1393   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1394 }
1395 
1396 static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
1397                          bool &IsTexFail) {
1398   if (TexFailCtrl)
1399     IsTexFail = true;
1400 
1401   TFE = (TexFailCtrl & 0x1) ? 1 : 0;
1402   TexFailCtrl &= ~(uint64_t)0x1;
1403   LWE = (TexFailCtrl & 0x2) ? 1 : 0;
1404   TexFailCtrl &= ~(uint64_t)0x2;
1405 
1406   return TexFailCtrl == 0;
1407 }
1408 
1409 static bool parseCachePolicy(uint64_t Value,
1410                              bool *GLC, bool *SLC, bool *DLC) {
1411   if (GLC) {
1412     *GLC = (Value & 0x1) ? 1 : 0;
1413     Value &= ~(uint64_t)0x1;
1414   }
1415   if (SLC) {
1416     *SLC = (Value & 0x2) ? 1 : 0;
1417     Value &= ~(uint64_t)0x2;
1418   }
1419   if (DLC) {
1420     *DLC = (Value & 0x4) ? 1 : 0;
1421     Value &= ~(uint64_t)0x4;
1422   }
1423 
1424   return Value == 0;
1425 }
1426 
1427 bool AMDGPUInstructionSelector::selectImageIntrinsic(
1428   MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
1429   MachineBasicBlock *MBB = MI.getParent();
1430   const DebugLoc &DL = MI.getDebugLoc();
1431 
1432   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
1433     AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1434 
1435   const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
1436   const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
1437       AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
1438   const AMDGPU::MIMGMIPMappingInfo *MIPMappingInfo =
1439       AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode);
1440   unsigned IntrOpcode = Intr->BaseOpcode;
1441   const bool IsGFX10 = STI.getGeneration() >= AMDGPUSubtarget::GFX10;
1442 
1443   const int VAddrIdx = getImageVAddrIdxBegin(BaseOpcode,
1444                                              MI.getNumExplicitDefs());
1445   int NumVAddr, NumGradients;
1446   std::tie(NumVAddr, NumGradients) = getImageNumVAddr(Intr, BaseOpcode);
1447 
1448   Register VDataIn, VDataOut;
1449   LLT VDataTy;
1450   int NumVDataDwords = -1;
1451   bool IsD16 = false;
1452 
1453   // XXX - Can we just get the second to last argument for ctrl?
1454   unsigned CtrlIdx; // Index of texfailctrl argument
1455   bool Unorm;
1456   if (!BaseOpcode->Sampler) {
1457     Unorm = true;
1458     CtrlIdx = VAddrIdx + NumVAddr + 1;
1459   } else {
1460     Unorm = MI.getOperand(VAddrIdx + NumVAddr + 2).getImm() != 0;
1461     CtrlIdx = VAddrIdx + NumVAddr + 3;
1462   }
1463 
1464   bool TFE;
1465   bool LWE;
1466   bool IsTexFail = false;
1467   if (!parseTexFail(MI.getOperand(CtrlIdx).getImm(), TFE, LWE, IsTexFail))
1468     return false;
1469 
1470   const int Flags = MI.getOperand(CtrlIdx + 2).getImm();
1471   const bool IsA16 = (Flags & 1) != 0;
1472   const bool IsG16 = (Flags & 2) != 0;
1473 
1474   // A16 implies 16 bit gradients
1475   if (IsA16 && !IsG16)
1476     return false;
1477 
1478   unsigned DMask = 0;
1479   unsigned DMaskLanes = 0;
1480 
1481   if (BaseOpcode->Atomic) {
1482     VDataOut = MI.getOperand(0).getReg();
1483     VDataIn = MI.getOperand(2).getReg();
1484     LLT Ty = MRI->getType(VDataIn);
1485 
1486     // Be careful to allow atomic swap on 16-bit element vectors.
1487     const bool Is64Bit = BaseOpcode->AtomicX2 ?
1488       Ty.getSizeInBits() == 128 :
1489       Ty.getSizeInBits() == 64;
1490 
1491     if (BaseOpcode->AtomicX2) {
1492       assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
1493 
1494       DMask = Is64Bit ? 0xf : 0x3;
1495       NumVDataDwords = Is64Bit ? 4 : 2;
1496     } else {
1497       DMask = Is64Bit ? 0x3 : 0x1;
1498       NumVDataDwords = Is64Bit ? 2 : 1;
1499     }
1500   } else {
1501     const int DMaskIdx = 2; // Input/output + intrinsic ID.
1502 
1503     DMask = MI.getOperand(DMaskIdx).getImm();
1504     DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);
1505 
1506     if (BaseOpcode->Store) {
1507       VDataIn = MI.getOperand(1).getReg();
1508       VDataTy = MRI->getType(VDataIn);
1509       NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
1510     } else {
1511       VDataOut = MI.getOperand(0).getReg();
1512       VDataTy = MRI->getType(VDataOut);
1513       NumVDataDwords = DMaskLanes;
1514 
1515       // One memoperand is mandatory, except for getresinfo.
1516       // FIXME: Check this in verifier.
1517       if (!MI.memoperands_empty()) {
1518         const MachineMemOperand *MMO = *MI.memoperands_begin();
1519 
1520         // Infer d16 from the memory size, as the register type will be mangled by
1521         // unpacked subtargets, or by TFE.
1522         IsD16 = ((8 * MMO->getSize()) / DMaskLanes) < 32;
1523 
1524         if (IsD16 && !STI.hasUnpackedD16VMem())
1525           NumVDataDwords = (DMaskLanes + 1) / 2;
1526       }
1527     }
1528   }
1529 
1530   // Optimize _L to _LZ when _L is zero
1531   if (LZMappingInfo) {
1532     // The legalizer replaced the register with an immediate 0 if we need to
1533     // change the opcode.
1534     const MachineOperand &Lod = MI.getOperand(VAddrIdx + NumVAddr - 1);
1535     if (Lod.isImm()) {
1536       assert(Lod.getImm() == 0);
1537       IntrOpcode = LZMappingInfo->LZ;  // set new opcode to _lz variant of _l
1538     }
1539   }
1540 
1541   // Optimize _mip away, when 'lod' is zero
1542   if (MIPMappingInfo) {
1543     const MachineOperand &Lod = MI.getOperand(VAddrIdx + NumVAddr - 1);
1544     if (Lod.isImm()) {
1545       assert(Lod.getImm() == 0);
1546       IntrOpcode = MIPMappingInfo->NONMIP;  // set new opcode to variant without _mip
1547     }
1548   }
1549 
1550   // Set G16 opcode
1551   if (IsG16 && !IsA16) {
1552     const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
1553         AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
1554     assert(G16MappingInfo);
1555     IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
1556   }
1557 
1558   // TODO: Check this in verifier.
1559   assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
1560 
1561   bool GLC = false;
1562   bool SLC = false;
1563   bool DLC = false;
1564   if (BaseOpcode->Atomic) {
1565     GLC = true; // TODO no-return optimization
1566     if (!parseCachePolicy(MI.getOperand(CtrlIdx + 1).getImm(), nullptr, &SLC,
1567                           IsGFX10 ? &DLC : nullptr))
1568       return false;
1569   } else {
1570     if (!parseCachePolicy(MI.getOperand(CtrlIdx + 1).getImm(), &GLC, &SLC,
1571                           IsGFX10 ? &DLC : nullptr))
1572       return false;
1573   }
1574 
1575   int NumVAddrRegs = 0;
1576   int NumVAddrDwords = 0;
1577   for (int I = 0; I < NumVAddr; ++I) {
1578     // Skip the $noregs and 0s inserted during legalization.
1579     MachineOperand &AddrOp = MI.getOperand(VAddrIdx + I);
1580     if (!AddrOp.isReg())
1581       continue; // XXX - Break?
1582 
1583     Register Addr = AddrOp.getReg();
1584     if (!Addr)
1585       break;
1586 
1587     ++NumVAddrRegs;
1588     NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
1589   }
1590 
1591   // The legalizer preprocessed the intrinsic arguments. If we aren't using
1592   // NSA, these should have beeen packed into a single value in the first
1593   // address register
1594   const bool UseNSA = NumVAddrRegs != 1 && NumVAddrDwords == NumVAddrRegs;
1595   if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
1596     LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
1597     return false;
1598   }
1599 
1600   if (IsTexFail)
1601     ++NumVDataDwords;
1602 
1603   int Opcode = -1;
1604   if (IsGFX10) {
1605     Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
1606                                    UseNSA ? AMDGPU::MIMGEncGfx10NSA
1607                                           : AMDGPU::MIMGEncGfx10Default,
1608                                    NumVDataDwords, NumVAddrDwords);
1609   } else {
1610     if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
1611       Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
1612                                      NumVDataDwords, NumVAddrDwords);
1613     if (Opcode == -1)
1614       Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
1615                                      NumVDataDwords, NumVAddrDwords);
1616   }
1617   assert(Opcode != -1);
1618 
1619   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
1620     .cloneMemRefs(MI);
1621 
1622   if (VDataOut) {
1623     if (BaseOpcode->AtomicX2) {
1624       const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
1625 
1626       Register TmpReg = MRI->createVirtualRegister(
1627         Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
1628       unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
1629 
1630       MIB.addDef(TmpReg);
1631       BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
1632         .addReg(TmpReg, RegState::Kill, SubReg);
1633 
1634     } else {
1635       MIB.addDef(VDataOut); // vdata output
1636     }
1637   }
1638 
1639   if (VDataIn)
1640     MIB.addReg(VDataIn); // vdata input
1641 
1642   for (int i = 0; i != NumVAddrRegs; ++i) {
1643     MachineOperand &SrcOp = MI.getOperand(VAddrIdx + i);
1644     if (SrcOp.isReg()) {
1645       assert(SrcOp.getReg() != 0);
1646       MIB.addReg(SrcOp.getReg());
1647     }
1648   }
1649 
1650   MIB.addReg(MI.getOperand(VAddrIdx + NumVAddr).getReg()); // rsrc
1651   if (BaseOpcode->Sampler)
1652     MIB.addReg(MI.getOperand(VAddrIdx + NumVAddr + 1).getReg()); // sampler
1653 
1654   MIB.addImm(DMask); // dmask
1655 
1656   if (IsGFX10)
1657     MIB.addImm(DimInfo->Encoding);
1658   MIB.addImm(Unorm);
1659   if (IsGFX10)
1660     MIB.addImm(DLC);
1661 
1662   MIB.addImm(GLC);
1663   MIB.addImm(SLC);
1664   MIB.addImm(IsA16 &&  // a16 or r128
1665              STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
1666   if (IsGFX10)
1667     MIB.addImm(IsA16 ? -1 : 0);
1668 
1669   MIB.addImm(TFE); // tfe
1670   MIB.addImm(LWE); // lwe
1671   if (!IsGFX10)
1672     MIB.addImm(DimInfo->DA ? -1 : 0);
1673   if (BaseOpcode->HasD16)
1674     MIB.addImm(IsD16 ? -1 : 0);
1675 
1676   MI.eraseFromParent();
1677   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1678 }
1679 
1680 bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
1681     MachineInstr &I) const {
1682   unsigned IntrinsicID = I.getIntrinsicID();
1683   switch (IntrinsicID) {
1684   case Intrinsic::amdgcn_end_cf:
1685     return selectEndCfIntrinsic(I);
1686   case Intrinsic::amdgcn_ds_ordered_add:
1687   case Intrinsic::amdgcn_ds_ordered_swap:
1688     return selectDSOrderedIntrinsic(I, IntrinsicID);
1689   case Intrinsic::amdgcn_ds_gws_init:
1690   case Intrinsic::amdgcn_ds_gws_barrier:
1691   case Intrinsic::amdgcn_ds_gws_sema_v:
1692   case Intrinsic::amdgcn_ds_gws_sema_br:
1693   case Intrinsic::amdgcn_ds_gws_sema_p:
1694   case Intrinsic::amdgcn_ds_gws_sema_release_all:
1695     return selectDSGWSIntrinsic(I, IntrinsicID);
1696   case Intrinsic::amdgcn_ds_append:
1697     return selectDSAppendConsume(I, true);
1698   case Intrinsic::amdgcn_ds_consume:
1699     return selectDSAppendConsume(I, false);
1700   default: {
1701     return selectImpl(I, *CoverageInfo);
1702   }
1703   }
1704 }
1705 
1706 bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
1707   if (selectImpl(I, *CoverageInfo))
1708     return true;
1709 
1710   MachineBasicBlock *BB = I.getParent();
1711   const DebugLoc &DL = I.getDebugLoc();
1712 
1713   Register DstReg = I.getOperand(0).getReg();
1714   unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
1715   assert(Size <= 32 || Size == 64);
1716   const MachineOperand &CCOp = I.getOperand(1);
1717   Register CCReg = CCOp.getReg();
1718   if (!isVCC(CCReg, *MRI)) {
1719     unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
1720                                          AMDGPU::S_CSELECT_B32;
1721     MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
1722             .addReg(CCReg);
1723 
1724     // The generic constrainSelectedInstRegOperands doesn't work for the scc register
1725     // bank, because it does not cover the register class that we used to represent
1726     // for it.  So we need to manually set the register class here.
1727     if (!MRI->getRegClassOrNull(CCReg))
1728         MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
1729     MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
1730             .add(I.getOperand(2))
1731             .add(I.getOperand(3));
1732 
1733     bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI) |
1734                constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
1735     I.eraseFromParent();
1736     return Ret;
1737   }
1738 
1739   // Wide VGPR select should have been split in RegBankSelect.
1740   if (Size > 32)
1741     return false;
1742 
1743   MachineInstr *Select =
1744       BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1745               .addImm(0)
1746               .add(I.getOperand(3))
1747               .addImm(0)
1748               .add(I.getOperand(2))
1749               .add(I.getOperand(1));
1750 
1751   bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
1752   I.eraseFromParent();
1753   return Ret;
1754 }
1755 
1756 static int sizeToSubRegIndex(unsigned Size) {
1757   switch (Size) {
1758   case 32:
1759     return AMDGPU::sub0;
1760   case 64:
1761     return AMDGPU::sub0_sub1;
1762   case 96:
1763     return AMDGPU::sub0_sub1_sub2;
1764   case 128:
1765     return AMDGPU::sub0_sub1_sub2_sub3;
1766   case 256:
1767     return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
1768   default:
1769     if (Size < 32)
1770       return AMDGPU::sub0;
1771     if (Size > 256)
1772       return -1;
1773     return sizeToSubRegIndex(PowerOf2Ceil(Size));
1774   }
1775 }
1776 
1777 bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
1778   Register DstReg = I.getOperand(0).getReg();
1779   Register SrcReg = I.getOperand(1).getReg();
1780   const LLT DstTy = MRI->getType(DstReg);
1781   const LLT SrcTy = MRI->getType(SrcReg);
1782   const LLT S1 = LLT::scalar(1);
1783 
1784   const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
1785   const RegisterBank *DstRB;
1786   if (DstTy == S1) {
1787     // This is a special case. We don't treat s1 for legalization artifacts as
1788     // vcc booleans.
1789     DstRB = SrcRB;
1790   } else {
1791     DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1792     if (SrcRB != DstRB)
1793       return false;
1794   }
1795 
1796   const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
1797 
1798   unsigned DstSize = DstTy.getSizeInBits();
1799   unsigned SrcSize = SrcTy.getSizeInBits();
1800 
1801   const TargetRegisterClass *SrcRC
1802     = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, *MRI);
1803   const TargetRegisterClass *DstRC
1804     = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, *MRI);
1805   if (!SrcRC || !DstRC)
1806     return false;
1807 
1808   if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
1809       !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
1810     LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
1811     return false;
1812   }
1813 
1814   if (DstTy == LLT::vector(2, 16) && SrcTy == LLT::vector(2, 32)) {
1815     MachineBasicBlock *MBB = I.getParent();
1816     const DebugLoc &DL = I.getDebugLoc();
1817 
1818     Register LoReg = MRI->createVirtualRegister(DstRC);
1819     Register HiReg = MRI->createVirtualRegister(DstRC);
1820     BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
1821       .addReg(SrcReg, 0, AMDGPU::sub0);
1822     BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
1823       .addReg(SrcReg, 0, AMDGPU::sub1);
1824 
1825     if (IsVALU && STI.hasSDWA()) {
1826       // Write the low 16-bits of the high element into the high 16-bits of the
1827       // low element.
1828       MachineInstr *MovSDWA =
1829         BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
1830         .addImm(0)                             // $src0_modifiers
1831         .addReg(HiReg)                         // $src0
1832         .addImm(0)                             // $clamp
1833         .addImm(AMDGPU::SDWA::WORD_1)          // $dst_sel
1834         .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
1835         .addImm(AMDGPU::SDWA::WORD_0)          // $src0_sel
1836         .addReg(LoReg, RegState::Implicit);
1837       MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
1838     } else {
1839       Register TmpReg0 = MRI->createVirtualRegister(DstRC);
1840       Register TmpReg1 = MRI->createVirtualRegister(DstRC);
1841       Register ImmReg = MRI->createVirtualRegister(DstRC);
1842       if (IsVALU) {
1843         BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
1844           .addImm(16)
1845           .addReg(HiReg);
1846       } else {
1847         BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
1848           .addReg(HiReg)
1849           .addImm(16);
1850       }
1851 
1852       unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
1853       unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
1854       unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
1855 
1856       BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
1857         .addImm(0xffff);
1858       BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
1859         .addReg(LoReg)
1860         .addReg(ImmReg);
1861       BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
1862         .addReg(TmpReg0)
1863         .addReg(TmpReg1);
1864     }
1865 
1866     I.eraseFromParent();
1867     return true;
1868   }
1869 
1870   if (!DstTy.isScalar())
1871     return false;
1872 
1873   if (SrcSize > 32) {
1874     int SubRegIdx = sizeToSubRegIndex(DstSize);
1875     if (SubRegIdx == -1)
1876       return false;
1877 
1878     // Deal with weird cases where the class only partially supports the subreg
1879     // index.
1880     const TargetRegisterClass *SrcWithSubRC
1881       = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
1882     if (!SrcWithSubRC)
1883       return false;
1884 
1885     if (SrcWithSubRC != SrcRC) {
1886       if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
1887         return false;
1888     }
1889 
1890     I.getOperand(1).setSubReg(SubRegIdx);
1891   }
1892 
1893   I.setDesc(TII.get(TargetOpcode::COPY));
1894   return true;
1895 }
1896 
1897 /// \returns true if a bitmask for \p Size bits will be an inline immediate.
1898 static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
1899   Mask = maskTrailingOnes<unsigned>(Size);
1900   int SignedMask = static_cast<int>(Mask);
1901   return SignedMask >= -16 && SignedMask <= 64;
1902 }
1903 
1904 // Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
1905 const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
1906   Register Reg, const MachineRegisterInfo &MRI,
1907   const TargetRegisterInfo &TRI) const {
1908   const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
1909   if (auto *RB = RegClassOrBank.dyn_cast<const RegisterBank *>())
1910     return RB;
1911 
1912   // Ignore the type, since we don't use vcc in artifacts.
1913   if (auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>())
1914     return &RBI.getRegBankFromRegClass(*RC, LLT());
1915   return nullptr;
1916 }
1917 
1918 bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
1919   bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
1920   bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
1921   const DebugLoc &DL = I.getDebugLoc();
1922   MachineBasicBlock &MBB = *I.getParent();
1923   const Register DstReg = I.getOperand(0).getReg();
1924   const Register SrcReg = I.getOperand(1).getReg();
1925 
1926   const LLT DstTy = MRI->getType(DstReg);
1927   const LLT SrcTy = MRI->getType(SrcReg);
1928   const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
1929     I.getOperand(2).getImm() : SrcTy.getSizeInBits();
1930   const unsigned DstSize = DstTy.getSizeInBits();
1931   if (!DstTy.isScalar())
1932     return false;
1933 
1934   // Artifact casts should never use vcc.
1935   const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
1936 
1937   // FIXME: This should probably be illegal and split earlier.
1938   if (I.getOpcode() == AMDGPU::G_ANYEXT) {
1939     if (DstSize <= 32)
1940       return selectCOPY(I);
1941 
1942     const TargetRegisterClass *SrcRC =
1943         TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank, *MRI);
1944     const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1945     const TargetRegisterClass *DstRC =
1946         TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
1947 
1948     Register UndefReg = MRI->createVirtualRegister(SrcRC);
1949     BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
1950     BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
1951       .addReg(SrcReg)
1952       .addImm(AMDGPU::sub0)
1953       .addReg(UndefReg)
1954       .addImm(AMDGPU::sub1);
1955     I.eraseFromParent();
1956 
1957     return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
1958            RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
1959   }
1960 
1961   if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
1962     // 64-bit should have been split up in RegBankSelect
1963 
1964     // Try to use an and with a mask if it will save code size.
1965     unsigned Mask;
1966     if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
1967       MachineInstr *ExtI =
1968       BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
1969         .addImm(Mask)
1970         .addReg(SrcReg);
1971       I.eraseFromParent();
1972       return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
1973     }
1974 
1975     const unsigned BFE = Signed ? AMDGPU::V_BFE_I32 : AMDGPU::V_BFE_U32;
1976     MachineInstr *ExtI =
1977       BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
1978       .addReg(SrcReg)
1979       .addImm(0) // Offset
1980       .addImm(SrcSize); // Width
1981     I.eraseFromParent();
1982     return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
1983   }
1984 
1985   if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
1986     const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
1987       AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
1988     if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
1989       return false;
1990 
1991     if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
1992       const unsigned SextOpc = SrcSize == 8 ?
1993         AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
1994       BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
1995         .addReg(SrcReg);
1996       I.eraseFromParent();
1997       return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
1998     }
1999 
2000     const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
2001     const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2002 
2003     // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width.
2004     if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
2005       // We need a 64-bit register source, but the high bits don't matter.
2006       Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
2007       Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2008       unsigned SubReg = InReg ? AMDGPU::sub0 : 0;
2009 
2010       BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2011       BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
2012         .addReg(SrcReg, 0, SubReg)
2013         .addImm(AMDGPU::sub0)
2014         .addReg(UndefReg)
2015         .addImm(AMDGPU::sub1);
2016 
2017       BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
2018         .addReg(ExtReg)
2019         .addImm(SrcSize << 16);
2020 
2021       I.eraseFromParent();
2022       return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
2023     }
2024 
2025     unsigned Mask;
2026     if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2027       BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
2028         .addReg(SrcReg)
2029         .addImm(Mask);
2030     } else {
2031       BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
2032         .addReg(SrcReg)
2033         .addImm(SrcSize << 16);
2034     }
2035 
2036     I.eraseFromParent();
2037     return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2038   }
2039 
2040   return false;
2041 }
2042 
2043 bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
2044   MachineBasicBlock *BB = I.getParent();
2045   MachineOperand &ImmOp = I.getOperand(1);
2046 
2047   // The AMDGPU backend only supports Imm operands and not CImm or FPImm.
2048   if (ImmOp.isFPImm()) {
2049     const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt();
2050     ImmOp.ChangeToImmediate(Imm.getZExtValue());
2051   } else if (ImmOp.isCImm()) {
2052     ImmOp.ChangeToImmediate(ImmOp.getCImm()->getSExtValue());
2053   }
2054 
2055   Register DstReg = I.getOperand(0).getReg();
2056   unsigned Size;
2057   bool IsSgpr;
2058   const RegisterBank *RB = MRI->getRegBankOrNull(I.getOperand(0).getReg());
2059   if (RB) {
2060     IsSgpr = RB->getID() == AMDGPU::SGPRRegBankID;
2061     Size = MRI->getType(DstReg).getSizeInBits();
2062   } else {
2063     const TargetRegisterClass *RC = TRI.getRegClassForReg(*MRI, DstReg);
2064     IsSgpr = TRI.isSGPRClass(RC);
2065     Size = TRI.getRegSizeInBits(*RC);
2066   }
2067 
2068   if (Size != 32 && Size != 64)
2069     return false;
2070 
2071   unsigned Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
2072   if (Size == 32) {
2073     I.setDesc(TII.get(Opcode));
2074     I.addImplicitDefUseOperands(*MF);
2075     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2076   }
2077 
2078   const DebugLoc &DL = I.getDebugLoc();
2079 
2080   APInt Imm(Size, I.getOperand(1).getImm());
2081 
2082   MachineInstr *ResInst;
2083   if (IsSgpr && TII.isInlineConstant(Imm)) {
2084     ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
2085       .addImm(I.getOperand(1).getImm());
2086   } else {
2087     const TargetRegisterClass *RC = IsSgpr ?
2088       &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass;
2089     Register LoReg = MRI->createVirtualRegister(RC);
2090     Register HiReg = MRI->createVirtualRegister(RC);
2091 
2092     BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg)
2093       .addImm(Imm.trunc(32).getZExtValue());
2094 
2095     BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg)
2096       .addImm(Imm.ashr(32).getZExtValue());
2097 
2098     ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2099       .addReg(LoReg)
2100       .addImm(AMDGPU::sub0)
2101       .addReg(HiReg)
2102       .addImm(AMDGPU::sub1);
2103   }
2104 
2105   // We can't call constrainSelectedInstRegOperands here, because it doesn't
2106   // work for target independent opcodes
2107   I.eraseFromParent();
2108   const TargetRegisterClass *DstRC =
2109     TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI);
2110   if (!DstRC)
2111     return true;
2112   return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI);
2113 }
2114 
2115 bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2116   // Only manually handle the f64 SGPR case.
2117   //
2118   // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2119   // the bit ops theoretically have a second result due to the implicit def of
2120   // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2121   // that is easy by disabling the check. The result works, but uses a
2122   // nonsensical sreg32orlds_and_sreg_1 regclass.
2123   //
2124   // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to
2125   // the variadic REG_SEQUENCE operands.
2126 
2127   Register Dst = MI.getOperand(0).getReg();
2128   const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2129   if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2130       MRI->getType(Dst) != LLT::scalar(64))
2131     return false;
2132 
2133   Register Src = MI.getOperand(1).getReg();
2134   MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
2135   if (Fabs)
2136     Src = Fabs->getOperand(1).getReg();
2137 
2138   if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2139       !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2140     return false;
2141 
2142   MachineBasicBlock *BB = MI.getParent();
2143   const DebugLoc &DL = MI.getDebugLoc();
2144   Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2145   Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2146   Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2147   Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2148 
2149   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2150     .addReg(Src, 0, AMDGPU::sub0);
2151   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2152     .addReg(Src, 0, AMDGPU::sub1);
2153   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2154     .addImm(0x80000000);
2155 
2156   // Set or toggle sign bit.
2157   unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2158   BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
2159     .addReg(HiReg)
2160     .addReg(ConstReg);
2161   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2162     .addReg(LoReg)
2163     .addImm(AMDGPU::sub0)
2164     .addReg(OpReg)
2165     .addImm(AMDGPU::sub1);
2166   MI.eraseFromParent();
2167   return true;
2168 }
2169 
2170 // FIXME: This is a workaround for the same tablegen problems as G_FNEG
2171 bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2172   Register Dst = MI.getOperand(0).getReg();
2173   const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2174   if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2175       MRI->getType(Dst) != LLT::scalar(64))
2176     return false;
2177 
2178   Register Src = MI.getOperand(1).getReg();
2179   MachineBasicBlock *BB = MI.getParent();
2180   const DebugLoc &DL = MI.getDebugLoc();
2181   Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2182   Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2183   Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2184   Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2185 
2186   if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2187       !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2188     return false;
2189 
2190   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2191     .addReg(Src, 0, AMDGPU::sub0);
2192   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2193     .addReg(Src, 0, AMDGPU::sub1);
2194   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2195     .addImm(0x7fffffff);
2196 
2197   // Clear sign bit.
2198   // TODO: Should this used S_BITSET0_*?
2199   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
2200     .addReg(HiReg)
2201     .addReg(ConstReg);
2202   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2203     .addReg(LoReg)
2204     .addImm(AMDGPU::sub0)
2205     .addReg(OpReg)
2206     .addImm(AMDGPU::sub1);
2207 
2208   MI.eraseFromParent();
2209   return true;
2210 }
2211 
2212 static bool isConstant(const MachineInstr &MI) {
2213   return MI.getOpcode() == TargetOpcode::G_CONSTANT;
2214 }
2215 
2216 void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
2217     const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
2218 
2219   const MachineInstr *PtrMI = MRI.getUniqueVRegDef(Load.getOperand(1).getReg());
2220 
2221   assert(PtrMI);
2222 
2223   if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
2224     return;
2225 
2226   GEPInfo GEPInfo(*PtrMI);
2227 
2228   for (unsigned i = 1; i != 3; ++i) {
2229     const MachineOperand &GEPOp = PtrMI->getOperand(i);
2230     const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
2231     assert(OpDef);
2232     if (i == 2 && isConstant(*OpDef)) {
2233       // TODO: Could handle constant base + variable offset, but a combine
2234       // probably should have commuted it.
2235       assert(GEPInfo.Imm == 0);
2236       GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
2237       continue;
2238     }
2239     const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
2240     if (OpBank->getID() == AMDGPU::SGPRRegBankID)
2241       GEPInfo.SgprParts.push_back(GEPOp.getReg());
2242     else
2243       GEPInfo.VgprParts.push_back(GEPOp.getReg());
2244   }
2245 
2246   AddrInfo.push_back(GEPInfo);
2247   getAddrModeInfo(*PtrMI, MRI, AddrInfo);
2248 }
2249 
2250 bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
2251   if (!MI.hasOneMemOperand())
2252     return false;
2253 
2254   const MachineMemOperand *MMO = *MI.memoperands_begin();
2255   const Value *Ptr = MMO->getValue();
2256 
2257   // UndefValue means this is a load of a kernel input.  These are uniform.
2258   // Sometimes LDS instructions have constant pointers.
2259   // If Ptr is null, then that means this mem operand contains a
2260   // PseudoSourceValue like GOT.
2261   if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
2262       isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
2263     return true;
2264 
2265   if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
2266     return true;
2267 
2268   const Instruction *I = dyn_cast<Instruction>(Ptr);
2269   return I && I->getMetadata("amdgpu.uniform");
2270 }
2271 
2272 bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
2273   for (const GEPInfo &GEPInfo : AddrInfo) {
2274     if (!GEPInfo.VgprParts.empty())
2275       return true;
2276   }
2277   return false;
2278 }
2279 
2280 void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
2281   const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
2282   unsigned AS = PtrTy.getAddressSpace();
2283   if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
2284       STI.ldsRequiresM0Init()) {
2285     MachineBasicBlock *BB = I.getParent();
2286 
2287     // If DS instructions require M0 initializtion, insert it before selecting.
2288     BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2289       .addImm(-1);
2290   }
2291 }
2292 
2293 bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
2294   MachineInstr &I) const {
2295   initM0(I);
2296   return selectImpl(I, *CoverageInfo);
2297 }
2298 
2299 // TODO: No rtn optimization.
2300 bool AMDGPUInstructionSelector::selectG_AMDGPU_ATOMIC_CMPXCHG(
2301   MachineInstr &MI) const {
2302   Register PtrReg = MI.getOperand(1).getReg();
2303   const LLT PtrTy = MRI->getType(PtrReg);
2304   if (PtrTy.getAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
2305       STI.useFlatForGlobal())
2306     return selectImpl(MI, *CoverageInfo);
2307 
2308   Register DstReg = MI.getOperand(0).getReg();
2309   const LLT Ty = MRI->getType(DstReg);
2310   const bool Is64 = Ty.getSizeInBits() == 64;
2311   const unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
2312   Register TmpReg = MRI->createVirtualRegister(
2313     Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
2314 
2315   const DebugLoc &DL = MI.getDebugLoc();
2316   MachineBasicBlock *BB = MI.getParent();
2317 
2318   Register VAddr, RSrcReg, SOffset;
2319   int64_t Offset = 0;
2320 
2321   unsigned Opcode;
2322   if (selectMUBUFOffsetImpl(MI.getOperand(1), RSrcReg, SOffset, Offset)) {
2323     Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN :
2324                              AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN;
2325   } else if (selectMUBUFAddr64Impl(MI.getOperand(1), VAddr,
2326                                    RSrcReg, SOffset, Offset)) {
2327     Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN :
2328                     AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN;
2329   } else
2330     return selectImpl(MI, *CoverageInfo);
2331 
2332   auto MIB = BuildMI(*BB, &MI, DL, TII.get(Opcode), TmpReg)
2333     .addReg(MI.getOperand(2).getReg());
2334 
2335   if (VAddr)
2336     MIB.addReg(VAddr);
2337 
2338   MIB.addReg(RSrcReg);
2339   if (SOffset)
2340     MIB.addReg(SOffset);
2341   else
2342     MIB.addImm(0);
2343 
2344   MIB.addImm(Offset);
2345   MIB.addImm(0); // slc
2346   MIB.cloneMemRefs(MI);
2347 
2348   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), DstReg)
2349     .addReg(TmpReg, RegState::Kill, SubReg);
2350 
2351   MI.eraseFromParent();
2352 
2353   MRI->setRegClass(
2354     DstReg, Is64 ? &AMDGPU::VReg_64RegClass : &AMDGPU::VGPR_32RegClass);
2355   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2356 }
2357 
2358 bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
2359   MachineBasicBlock *BB = I.getParent();
2360   MachineOperand &CondOp = I.getOperand(0);
2361   Register CondReg = CondOp.getReg();
2362   const DebugLoc &DL = I.getDebugLoc();
2363 
2364   unsigned BrOpcode;
2365   Register CondPhysReg;
2366   const TargetRegisterClass *ConstrainRC;
2367 
2368   // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
2369   // whether the branch is uniform when selecting the instruction. In
2370   // GlobalISel, we should push that decision into RegBankSelect. Assume for now
2371   // RegBankSelect knows what it's doing if the branch condition is scc, even
2372   // though it currently does not.
2373   if (!isVCC(CondReg, *MRI)) {
2374     if (MRI->getType(CondReg) != LLT::scalar(32))
2375       return false;
2376 
2377     CondPhysReg = AMDGPU::SCC;
2378     BrOpcode = AMDGPU::S_CBRANCH_SCC1;
2379     ConstrainRC = &AMDGPU::SReg_32RegClass;
2380   } else {
2381     // FIXME: Do we have to insert an and with exec here, like in SelectionDAG?
2382     // We sort of know that a VCC producer based on the register bank, that ands
2383     // inactive lanes with 0. What if there was a logical operation with vcc
2384     // producers in different blocks/with different exec masks?
2385     // FIXME: Should scc->vcc copies and with exec?
2386     CondPhysReg = TRI.getVCC();
2387     BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
2388     ConstrainRC = TRI.getBoolRC();
2389   }
2390 
2391   if (!MRI->getRegClassOrNull(CondReg))
2392     MRI->setRegClass(CondReg, ConstrainRC);
2393 
2394   BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
2395     .addReg(CondReg);
2396   BuildMI(*BB, &I, DL, TII.get(BrOpcode))
2397     .addMBB(I.getOperand(1).getMBB());
2398 
2399   I.eraseFromParent();
2400   return true;
2401 }
2402 
2403 bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
2404   MachineInstr &I) const {
2405   Register DstReg = I.getOperand(0).getReg();
2406   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2407   const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2408   I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
2409   if (IsVGPR)
2410     I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
2411 
2412   return RBI.constrainGenericRegister(
2413     DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
2414 }
2415 
2416 bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
2417   Register DstReg = I.getOperand(0).getReg();
2418   Register SrcReg = I.getOperand(1).getReg();
2419   Register MaskReg = I.getOperand(2).getReg();
2420   LLT Ty = MRI->getType(DstReg);
2421   LLT MaskTy = MRI->getType(MaskReg);
2422 
2423   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2424   const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2425   const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
2426   const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2427   if (DstRB != SrcRB) // Should only happen for hand written MIR.
2428     return false;
2429 
2430   unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2431   const TargetRegisterClass &RegRC
2432     = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
2433 
2434   const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB,
2435                                                                   *MRI);
2436   const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB,
2437                                                                   *MRI);
2438   const TargetRegisterClass *MaskRC =
2439       TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB, *MRI);
2440 
2441   if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
2442       !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2443       !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
2444     return false;
2445 
2446   MachineBasicBlock *BB = I.getParent();
2447   const DebugLoc &DL = I.getDebugLoc();
2448   if (Ty.getSizeInBits() == 32) {
2449     assert(MaskTy.getSizeInBits() == 32 &&
2450            "ptrmask should have been narrowed during legalize");
2451 
2452     BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
2453       .addReg(SrcReg)
2454       .addReg(MaskReg);
2455     I.eraseFromParent();
2456     return true;
2457   }
2458 
2459   Register HiReg = MRI->createVirtualRegister(&RegRC);
2460   Register LoReg = MRI->createVirtualRegister(&RegRC);
2461 
2462   // Extract the subregisters from the source pointer.
2463   BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
2464     .addReg(SrcReg, 0, AMDGPU::sub0);
2465   BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
2466     .addReg(SrcReg, 0, AMDGPU::sub1);
2467 
2468   Register MaskedLo, MaskedHi;
2469 
2470   // Try to avoid emitting a bit operation when we only need to touch half of
2471   // the 64-bit pointer.
2472   APInt MaskOnes = KnownBits->getKnownOnes(MaskReg).zextOrSelf(64);
2473 
2474   const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
2475   const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
2476   if ((MaskOnes & MaskLo32) == MaskLo32) {
2477     // If all the bits in the low half are 1, we only need a copy for it.
2478     MaskedLo = LoReg;
2479   } else {
2480     // Extract the mask subregister and apply the and.
2481     Register MaskLo = MRI->createVirtualRegister(&RegRC);
2482     MaskedLo = MRI->createVirtualRegister(&RegRC);
2483 
2484     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
2485       .addReg(MaskReg, 0, AMDGPU::sub0);
2486     BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
2487       .addReg(LoReg)
2488       .addReg(MaskLo);
2489   }
2490 
2491   if ((MaskOnes & MaskHi32) == MaskHi32) {
2492     // If all the bits in the high half are 1, we only need a copy for it.
2493     MaskedHi = HiReg;
2494   } else {
2495     Register MaskHi = MRI->createVirtualRegister(&RegRC);
2496     MaskedHi = MRI->createVirtualRegister(&RegRC);
2497 
2498     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
2499       .addReg(MaskReg, 0, AMDGPU::sub1);
2500     BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
2501       .addReg(HiReg)
2502       .addReg(MaskHi);
2503   }
2504 
2505   BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2506     .addReg(MaskedLo)
2507     .addImm(AMDGPU::sub0)
2508     .addReg(MaskedHi)
2509     .addImm(AMDGPU::sub1);
2510   I.eraseFromParent();
2511   return true;
2512 }
2513 
2514 /// Return the register to use for the index value, and the subregister to use
2515 /// for the indirectly accessed register.
2516 static std::pair<Register, unsigned>
2517 computeIndirectRegIndex(MachineRegisterInfo &MRI,
2518                         const SIRegisterInfo &TRI,
2519                         const TargetRegisterClass *SuperRC,
2520                         Register IdxReg,
2521                         unsigned EltSize) {
2522   Register IdxBaseReg;
2523   int Offset;
2524   MachineInstr *Unused;
2525 
2526   std::tie(IdxBaseReg, Offset, Unused)
2527     = AMDGPU::getBaseWithConstantOffset(MRI, IdxReg);
2528   if (IdxBaseReg == AMDGPU::NoRegister) {
2529     // This will happen if the index is a known constant. This should ordinarily
2530     // be legalized out, but handle it as a register just in case.
2531     assert(Offset == 0);
2532     IdxBaseReg = IdxReg;
2533   }
2534 
2535   ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
2536 
2537   // Skip out of bounds offsets, or else we would end up using an undefined
2538   // register.
2539   if (static_cast<unsigned>(Offset) >= SubRegs.size())
2540     return std::make_pair(IdxReg, SubRegs[0]);
2541   return std::make_pair(IdxBaseReg, SubRegs[Offset]);
2542 }
2543 
2544 bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
2545   MachineInstr &MI) const {
2546   Register DstReg = MI.getOperand(0).getReg();
2547   Register SrcReg = MI.getOperand(1).getReg();
2548   Register IdxReg = MI.getOperand(2).getReg();
2549 
2550   LLT DstTy = MRI->getType(DstReg);
2551   LLT SrcTy = MRI->getType(SrcReg);
2552 
2553   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2554   const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2555   const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
2556 
2557   // The index must be scalar. If it wasn't RegBankSelect should have moved this
2558   // into a waterfall loop.
2559   if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
2560     return false;
2561 
2562   const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB,
2563                                                                   *MRI);
2564   const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(DstTy, *DstRB,
2565                                                                   *MRI);
2566   if (!SrcRC || !DstRC)
2567     return false;
2568   if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2569       !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
2570       !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
2571     return false;
2572 
2573   MachineBasicBlock *BB = MI.getParent();
2574   const DebugLoc &DL = MI.getDebugLoc();
2575   const bool Is64 = DstTy.getSizeInBits() == 64;
2576 
2577   unsigned SubReg;
2578   std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, SrcRC, IdxReg,
2579                                                      DstTy.getSizeInBits() / 8);
2580 
2581   if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
2582     if (DstTy.getSizeInBits() != 32 && !Is64)
2583       return false;
2584 
2585     BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2586       .addReg(IdxReg);
2587 
2588     unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
2589     BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
2590       .addReg(SrcReg, 0, SubReg)
2591       .addReg(SrcReg, RegState::Implicit);
2592     MI.eraseFromParent();
2593     return true;
2594   }
2595 
2596   if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
2597     return false;
2598 
2599   if (!STI.useVGPRIndexMode()) {
2600     BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2601       .addReg(IdxReg);
2602     BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
2603       .addReg(SrcReg, 0, SubReg)
2604       .addReg(SrcReg, RegState::Implicit);
2605     MI.eraseFromParent();
2606     return true;
2607   }
2608 
2609   BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_ON))
2610     .addReg(IdxReg)
2611     .addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE);
2612   BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), DstReg)
2613     .addReg(SrcReg, 0, SubReg)
2614     .addReg(SrcReg, RegState::Implicit)
2615     .addReg(AMDGPU::M0, RegState::Implicit);
2616   BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_OFF));
2617 
2618   MI.eraseFromParent();
2619   return true;
2620 }
2621 
2622 // TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
2623 bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
2624   MachineInstr &MI) const {
2625   Register DstReg = MI.getOperand(0).getReg();
2626   Register VecReg = MI.getOperand(1).getReg();
2627   Register ValReg = MI.getOperand(2).getReg();
2628   Register IdxReg = MI.getOperand(3).getReg();
2629 
2630   LLT VecTy = MRI->getType(DstReg);
2631   LLT ValTy = MRI->getType(ValReg);
2632   unsigned VecSize = VecTy.getSizeInBits();
2633   unsigned ValSize = ValTy.getSizeInBits();
2634 
2635   const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
2636   const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
2637   const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
2638 
2639   assert(VecTy.getElementType() == ValTy);
2640 
2641   // The index must be scalar. If it wasn't RegBankSelect should have moved this
2642   // into a waterfall loop.
2643   if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
2644     return false;
2645 
2646   const TargetRegisterClass *VecRC = TRI.getRegClassForTypeOnBank(VecTy, *VecRB,
2647                                                                   *MRI);
2648   const TargetRegisterClass *ValRC = TRI.getRegClassForTypeOnBank(ValTy, *ValRB,
2649                                                                   *MRI);
2650 
2651   if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
2652       !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
2653       !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
2654       !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
2655     return false;
2656 
2657   if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
2658     return false;
2659 
2660   unsigned SubReg;
2661   std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg,
2662                                                      ValSize / 8);
2663 
2664   const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
2665                          STI.useVGPRIndexMode();
2666 
2667   MachineBasicBlock *BB = MI.getParent();
2668   const DebugLoc &DL = MI.getDebugLoc();
2669 
2670   if (IndexMode) {
2671     BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_ON))
2672       .addReg(IdxReg)
2673       .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE);
2674   } else {
2675     BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2676       .addReg(IdxReg);
2677   }
2678 
2679   const MCInstrDesc &RegWriteOp
2680     = TII.getIndirectRegWritePseudo(VecSize, ValSize,
2681                                     VecRB->getID() == AMDGPU::SGPRRegBankID);
2682   BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
2683     .addReg(VecReg)
2684     .addReg(ValReg)
2685     .addImm(SubReg);
2686 
2687   if (IndexMode)
2688     BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_OFF));
2689 
2690   MI.eraseFromParent();
2691   return true;
2692 }
2693 
2694 static bool isZeroOrUndef(int X) {
2695   return X == 0 || X == -1;
2696 }
2697 
2698 static bool isOneOrUndef(int X) {
2699   return X == 1 || X == -1;
2700 }
2701 
2702 static bool isZeroOrOneOrUndef(int X) {
2703   return X == 0 || X == 1 || X == -1;
2704 }
2705 
2706 // Normalize a VOP3P shuffle mask to refer to the low/high half of a single
2707 // 32-bit register.
2708 static Register normalizeVOP3PMask(int NewMask[2], Register Src0, Register Src1,
2709                                    ArrayRef<int> Mask) {
2710   NewMask[0] = Mask[0];
2711   NewMask[1] = Mask[1];
2712   if (isZeroOrOneOrUndef(Mask[0]) && isZeroOrOneOrUndef(Mask[1]))
2713     return Src0;
2714 
2715   assert(NewMask[0] == 2 || NewMask[0] == 3 || NewMask[0] == -1);
2716   assert(NewMask[1] == 2 || NewMask[1] == 3 || NewMask[1] == -1);
2717 
2718   // Shift the mask inputs to be 0/1;
2719   NewMask[0] = NewMask[0] == -1 ? -1 : NewMask[0] - 2;
2720   NewMask[1] = NewMask[1] == -1 ? -1 : NewMask[1] - 2;
2721   return Src1;
2722 }
2723 
2724 // This is only legal with VOP3P instructions as an aid to op_sel matching.
2725 bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR(
2726   MachineInstr &MI) const {
2727   Register DstReg = MI.getOperand(0).getReg();
2728   Register Src0Reg = MI.getOperand(1).getReg();
2729   Register Src1Reg = MI.getOperand(2).getReg();
2730   ArrayRef<int> ShufMask = MI.getOperand(3).getShuffleMask();
2731 
2732   const LLT V2S16 = LLT::vector(2, 16);
2733   if (MRI->getType(DstReg) != V2S16 || MRI->getType(Src0Reg) != V2S16)
2734     return false;
2735 
2736   if (!AMDGPU::isLegalVOP3PShuffleMask(ShufMask))
2737     return false;
2738 
2739   assert(ShufMask.size() == 2);
2740   assert(STI.hasSDWA() && "no target has VOP3P but not SDWA");
2741 
2742   MachineBasicBlock *MBB = MI.getParent();
2743   const DebugLoc &DL = MI.getDebugLoc();
2744 
2745   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2746   const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2747   const TargetRegisterClass &RC = IsVALU ?
2748     AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
2749 
2750   // Handle the degenerate case which should have folded out.
2751   if (ShufMask[0] == -1 && ShufMask[1] == -1) {
2752     BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), DstReg);
2753 
2754     MI.eraseFromParent();
2755     return RBI.constrainGenericRegister(DstReg, RC, *MRI);
2756   }
2757 
2758   // A legal VOP3P mask only reads one of the sources.
2759   int Mask[2];
2760   Register SrcVec = normalizeVOP3PMask(Mask, Src0Reg, Src1Reg, ShufMask);
2761 
2762   if (!RBI.constrainGenericRegister(DstReg, RC, *MRI) ||
2763       !RBI.constrainGenericRegister(SrcVec, RC, *MRI))
2764     return false;
2765 
2766   // TODO: This also should have been folded out
2767   if (isZeroOrUndef(Mask[0]) && isOneOrUndef(Mask[1])) {
2768     BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), DstReg)
2769       .addReg(SrcVec);
2770 
2771     MI.eraseFromParent();
2772     return true;
2773   }
2774 
2775   if (Mask[0] == 1 && Mask[1] == -1) {
2776     if (IsVALU) {
2777       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
2778         .addImm(16)
2779         .addReg(SrcVec);
2780     } else {
2781       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
2782         .addReg(SrcVec)
2783         .addImm(16);
2784     }
2785   } else if (Mask[0] == -1 && Mask[1] == 0) {
2786     if (IsVALU) {
2787       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), DstReg)
2788         .addImm(16)
2789         .addReg(SrcVec);
2790     } else {
2791       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHL_B32), DstReg)
2792         .addReg(SrcVec)
2793         .addImm(16);
2794     }
2795   } else if (Mask[0] == 0 && Mask[1] == 0) {
2796     if (IsVALU) {
2797       // Write low half of the register into the high half.
2798       MachineInstr *MovSDWA =
2799         BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2800         .addImm(0)                             // $src0_modifiers
2801         .addReg(SrcVec)                        // $src0
2802         .addImm(0)                             // $clamp
2803         .addImm(AMDGPU::SDWA::WORD_1)          // $dst_sel
2804         .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2805         .addImm(AMDGPU::SDWA::WORD_0)          // $src0_sel
2806         .addReg(SrcVec, RegState::Implicit);
2807       MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2808     } else {
2809       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg)
2810         .addReg(SrcVec)
2811         .addReg(SrcVec);
2812     }
2813   } else if (Mask[0] == 1 && Mask[1] == 1) {
2814     if (IsVALU) {
2815       // Write high half of the register into the low half.
2816       MachineInstr *MovSDWA =
2817         BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2818         .addImm(0)                             // $src0_modifiers
2819         .addReg(SrcVec)                        // $src0
2820         .addImm(0)                             // $clamp
2821         .addImm(AMDGPU::SDWA::WORD_0)          // $dst_sel
2822         .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2823         .addImm(AMDGPU::SDWA::WORD_1)          // $src0_sel
2824         .addReg(SrcVec, RegState::Implicit);
2825       MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2826     } else {
2827       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_HH_B32_B16), DstReg)
2828         .addReg(SrcVec)
2829         .addReg(SrcVec);
2830     }
2831   } else if (Mask[0] == 1 && Mask[1] == 0) {
2832     if (IsVALU) {
2833       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_ALIGNBIT_B32), DstReg)
2834         .addReg(SrcVec)
2835         .addReg(SrcVec)
2836         .addImm(16);
2837     } else {
2838       Register TmpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2839       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg)
2840         .addReg(SrcVec)
2841         .addImm(16);
2842       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg)
2843         .addReg(TmpReg)
2844         .addReg(SrcVec);
2845     }
2846   } else
2847     llvm_unreachable("all shuffle masks should be handled");
2848 
2849   MI.eraseFromParent();
2850   return true;
2851 }
2852 
2853 bool AMDGPUInstructionSelector::select(MachineInstr &I) {
2854   if (I.isPHI())
2855     return selectPHI(I);
2856 
2857   if (!I.isPreISelOpcode()) {
2858     if (I.isCopy())
2859       return selectCOPY(I);
2860     return true;
2861   }
2862 
2863   switch (I.getOpcode()) {
2864   case TargetOpcode::G_AND:
2865   case TargetOpcode::G_OR:
2866   case TargetOpcode::G_XOR:
2867     if (selectImpl(I, *CoverageInfo))
2868       return true;
2869     return selectG_AND_OR_XOR(I);
2870   case TargetOpcode::G_ADD:
2871   case TargetOpcode::G_SUB:
2872     if (selectImpl(I, *CoverageInfo))
2873       return true;
2874     return selectG_ADD_SUB(I);
2875   case TargetOpcode::G_UADDO:
2876   case TargetOpcode::G_USUBO:
2877   case TargetOpcode::G_UADDE:
2878   case TargetOpcode::G_USUBE:
2879     return selectG_UADDO_USUBO_UADDE_USUBE(I);
2880   case TargetOpcode::G_INTTOPTR:
2881   case TargetOpcode::G_BITCAST:
2882   case TargetOpcode::G_PTRTOINT:
2883     return selectCOPY(I);
2884   case TargetOpcode::G_CONSTANT:
2885   case TargetOpcode::G_FCONSTANT:
2886     return selectG_CONSTANT(I);
2887   case TargetOpcode::G_FNEG:
2888     if (selectImpl(I, *CoverageInfo))
2889       return true;
2890     return selectG_FNEG(I);
2891   case TargetOpcode::G_FABS:
2892     if (selectImpl(I, *CoverageInfo))
2893       return true;
2894     return selectG_FABS(I);
2895   case TargetOpcode::G_EXTRACT:
2896     return selectG_EXTRACT(I);
2897   case TargetOpcode::G_MERGE_VALUES:
2898   case TargetOpcode::G_BUILD_VECTOR:
2899   case TargetOpcode::G_CONCAT_VECTORS:
2900     return selectG_MERGE_VALUES(I);
2901   case TargetOpcode::G_UNMERGE_VALUES:
2902     return selectG_UNMERGE_VALUES(I);
2903   case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2904     return selectG_BUILD_VECTOR_TRUNC(I);
2905   case TargetOpcode::G_PTR_ADD:
2906     return selectG_PTR_ADD(I);
2907   case TargetOpcode::G_IMPLICIT_DEF:
2908     return selectG_IMPLICIT_DEF(I);
2909   case TargetOpcode::G_FREEZE:
2910     return selectCOPY(I);
2911   case TargetOpcode::G_INSERT:
2912     return selectG_INSERT(I);
2913   case TargetOpcode::G_INTRINSIC:
2914     return selectG_INTRINSIC(I);
2915   case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
2916     return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
2917   case TargetOpcode::G_ICMP:
2918     if (selectG_ICMP(I))
2919       return true;
2920     return selectImpl(I, *CoverageInfo);
2921   case TargetOpcode::G_LOAD:
2922   case TargetOpcode::G_STORE:
2923   case TargetOpcode::G_ATOMIC_CMPXCHG:
2924   case TargetOpcode::G_ATOMICRMW_XCHG:
2925   case TargetOpcode::G_ATOMICRMW_ADD:
2926   case TargetOpcode::G_ATOMICRMW_SUB:
2927   case TargetOpcode::G_ATOMICRMW_AND:
2928   case TargetOpcode::G_ATOMICRMW_OR:
2929   case TargetOpcode::G_ATOMICRMW_XOR:
2930   case TargetOpcode::G_ATOMICRMW_MIN:
2931   case TargetOpcode::G_ATOMICRMW_MAX:
2932   case TargetOpcode::G_ATOMICRMW_UMIN:
2933   case TargetOpcode::G_ATOMICRMW_UMAX:
2934   case TargetOpcode::G_ATOMICRMW_FADD:
2935   case AMDGPU::G_AMDGPU_ATOMIC_INC:
2936   case AMDGPU::G_AMDGPU_ATOMIC_DEC:
2937   case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
2938   case AMDGPU::G_AMDGPU_ATOMIC_FMAX:
2939     return selectG_LOAD_STORE_ATOMICRMW(I);
2940   case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
2941     return selectG_AMDGPU_ATOMIC_CMPXCHG(I);
2942   case TargetOpcode::G_SELECT:
2943     return selectG_SELECT(I);
2944   case TargetOpcode::G_TRUNC:
2945     return selectG_TRUNC(I);
2946   case TargetOpcode::G_SEXT:
2947   case TargetOpcode::G_ZEXT:
2948   case TargetOpcode::G_ANYEXT:
2949   case TargetOpcode::G_SEXT_INREG:
2950     if (selectImpl(I, *CoverageInfo))
2951       return true;
2952     return selectG_SZA_EXT(I);
2953   case TargetOpcode::G_BRCOND:
2954     return selectG_BRCOND(I);
2955   case TargetOpcode::G_GLOBAL_VALUE:
2956     return selectG_GLOBAL_VALUE(I);
2957   case TargetOpcode::G_PTRMASK:
2958     return selectG_PTRMASK(I);
2959   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2960     return selectG_EXTRACT_VECTOR_ELT(I);
2961   case TargetOpcode::G_INSERT_VECTOR_ELT:
2962     return selectG_INSERT_VECTOR_ELT(I);
2963   case TargetOpcode::G_SHUFFLE_VECTOR:
2964     return selectG_SHUFFLE_VECTOR(I);
2965   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
2966   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
2967     const AMDGPU::ImageDimIntrinsicInfo *Intr
2968       = AMDGPU::getImageDimIntrinsicInfo(I.getIntrinsicID());
2969     assert(Intr && "not an image intrinsic with image pseudo");
2970     return selectImageIntrinsic(I, Intr);
2971   }
2972   default:
2973     return selectImpl(I, *CoverageInfo);
2974   }
2975   return false;
2976 }
2977 
2978 InstructionSelector::ComplexRendererFns
2979 AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
2980   return {{
2981       [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
2982   }};
2983 
2984 }
2985 
2986 std::pair<Register, unsigned>
2987 AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root) const {
2988   Register Src = Root.getReg();
2989   Register OrigSrc = Src;
2990   unsigned Mods = 0;
2991   MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
2992 
2993   if (MI && MI->getOpcode() == AMDGPU::G_FNEG) {
2994     Src = MI->getOperand(1).getReg();
2995     Mods |= SISrcMods::NEG;
2996     MI = getDefIgnoringCopies(Src, *MRI);
2997   }
2998 
2999   if (MI && MI->getOpcode() == AMDGPU::G_FABS) {
3000     Src = MI->getOperand(1).getReg();
3001     Mods |= SISrcMods::ABS;
3002   }
3003 
3004   if (Mods != 0 &&
3005       RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
3006     MachineInstr *UseMI = Root.getParent();
3007 
3008     // If we looked through copies to find source modifiers on an SGPR operand,
3009     // we now have an SGPR register source. To avoid potentially violating the
3010     // constant bus restriction, we need to insert a copy to a VGPR.
3011     Register VGPRSrc = MRI->cloneVirtualRegister(OrigSrc);
3012     BuildMI(*UseMI->getParent(), UseMI, UseMI->getDebugLoc(),
3013             TII.get(AMDGPU::COPY), VGPRSrc)
3014       .addReg(Src);
3015     Src = VGPRSrc;
3016   }
3017 
3018   return std::make_pair(Src, Mods);
3019 }
3020 
3021 ///
3022 /// This will select either an SGPR or VGPR operand and will save us from
3023 /// having to write an extra tablegen pattern.
3024 InstructionSelector::ComplexRendererFns
3025 AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
3026   return {{
3027       [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
3028   }};
3029 }
3030 
3031 InstructionSelector::ComplexRendererFns
3032 AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
3033   Register Src;
3034   unsigned Mods;
3035   std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3036 
3037   return {{
3038       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3039       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
3040       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },    // clamp
3041       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }     // omod
3042   }};
3043 }
3044 
3045 InstructionSelector::ComplexRendererFns
3046 AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
3047   return {{
3048       [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
3049       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3050       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // omod
3051   }};
3052 }
3053 
3054 InstructionSelector::ComplexRendererFns
3055 AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
3056   Register Src;
3057   unsigned Mods;
3058   std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3059 
3060   return {{
3061       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3062       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }  // src_mods
3063   }};
3064 }
3065 
3066 InstructionSelector::ComplexRendererFns
3067 AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
3068   Register Reg = Root.getReg();
3069   const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3070   if (Def && (Def->getOpcode() == AMDGPU::G_FNEG ||
3071               Def->getOpcode() == AMDGPU::G_FABS))
3072     return {};
3073   return {{
3074       [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3075   }};
3076 }
3077 
3078 std::pair<Register, unsigned>
3079 AMDGPUInstructionSelector::selectVOP3PModsImpl(
3080   Register Src, const MachineRegisterInfo &MRI) const {
3081   unsigned Mods = 0;
3082   MachineInstr *MI = MRI.getVRegDef(Src);
3083 
3084   if (MI && MI->getOpcode() == AMDGPU::G_FNEG &&
3085       // It's possible to see an f32 fneg here, but unlikely.
3086       // TODO: Treat f32 fneg as only high bit.
3087       MRI.getType(Src) == LLT::vector(2, 16)) {
3088     Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
3089     Src = MI->getOperand(1).getReg();
3090     MI = MRI.getVRegDef(Src);
3091   }
3092 
3093   // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
3094 
3095   // Packed instructions do not have abs modifiers.
3096   Mods |= SISrcMods::OP_SEL_1;
3097 
3098   return std::make_pair(Src, Mods);
3099 }
3100 
3101 InstructionSelector::ComplexRendererFns
3102 AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
3103   MachineRegisterInfo &MRI
3104     = Root.getParent()->getParent()->getParent()->getRegInfo();
3105 
3106   Register Src;
3107   unsigned Mods;
3108   std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI);
3109 
3110   return {{
3111       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3112       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }  // src_mods
3113   }};
3114 }
3115 
3116 InstructionSelector::ComplexRendererFns
3117 AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const {
3118   Register Src;
3119   unsigned Mods;
3120   std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3121   if (!TM.Options.NoNaNsFPMath && !isKnownNeverNaN(Src, *MRI))
3122     return None;
3123 
3124   return {{
3125       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3126       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }  // src_mods
3127   }};
3128 }
3129 
3130 InstructionSelector::ComplexRendererFns
3131 AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
3132   // FIXME: Handle op_sel
3133   return {{
3134       [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
3135       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // src_mods
3136   }};
3137 }
3138 
3139 InstructionSelector::ComplexRendererFns
3140 AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
3141   SmallVector<GEPInfo, 4> AddrInfo;
3142   getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
3143 
3144   if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3145     return None;
3146 
3147   const GEPInfo &GEPInfo = AddrInfo[0];
3148   Optional<int64_t> EncodedImm =
3149       AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm, false);
3150   if (!EncodedImm)
3151     return None;
3152 
3153   unsigned PtrReg = GEPInfo.SgprParts[0];
3154   return {{
3155     [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3156     [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
3157   }};
3158 }
3159 
3160 InstructionSelector::ComplexRendererFns
3161 AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
3162   SmallVector<GEPInfo, 4> AddrInfo;
3163   getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
3164 
3165   if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3166     return None;
3167 
3168   const GEPInfo &GEPInfo = AddrInfo[0];
3169   Register PtrReg = GEPInfo.SgprParts[0];
3170   Optional<int64_t> EncodedImm =
3171       AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
3172   if (!EncodedImm)
3173     return None;
3174 
3175   return {{
3176     [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3177     [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
3178   }};
3179 }
3180 
3181 InstructionSelector::ComplexRendererFns
3182 AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
3183   MachineInstr *MI = Root.getParent();
3184   MachineBasicBlock *MBB = MI->getParent();
3185 
3186   SmallVector<GEPInfo, 4> AddrInfo;
3187   getAddrModeInfo(*MI, *MRI, AddrInfo);
3188 
3189   // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
3190   // then we can select all ptr + 32-bit offsets not just immediate offsets.
3191   if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3192     return None;
3193 
3194   const GEPInfo &GEPInfo = AddrInfo[0];
3195   // SGPR offset is unsigned.
3196   if (!GEPInfo.Imm || GEPInfo.Imm < 0 || !isUInt<32>(GEPInfo.Imm))
3197     return None;
3198 
3199   // If we make it this far we have a load with an 32-bit immediate offset.
3200   // It is OK to select this using a sgpr offset, because we have already
3201   // failed trying to select this load into one of the _IMM variants since
3202   // the _IMM Patterns are considered before the _SGPR patterns.
3203   Register PtrReg = GEPInfo.SgprParts[0];
3204   Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3205   BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg)
3206           .addImm(GEPInfo.Imm);
3207   return {{
3208     [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3209     [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); }
3210   }};
3211 }
3212 
3213 template <bool Signed>
3214 InstructionSelector::ComplexRendererFns
3215 AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const {
3216   MachineInstr *MI = Root.getParent();
3217 
3218   InstructionSelector::ComplexRendererFns Default = {{
3219       [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
3220       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },  // offset
3221       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // slc
3222     }};
3223 
3224   if (!STI.hasFlatInstOffsets())
3225     return Default;
3226 
3227   const MachineInstr *OpDef = MRI->getVRegDef(Root.getReg());
3228   if (!OpDef || OpDef->getOpcode() != AMDGPU::G_PTR_ADD)
3229     return Default;
3230 
3231   Optional<int64_t> Offset =
3232     getConstantVRegVal(OpDef->getOperand(2).getReg(), *MRI);
3233   if (!Offset.hasValue())
3234     return Default;
3235 
3236   unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
3237   if (!TII.isLegalFLATOffset(Offset.getValue(), AddrSpace, Signed))
3238     return Default;
3239 
3240   Register BasePtr = OpDef->getOperand(1).getReg();
3241 
3242   return {{
3243       [=](MachineInstrBuilder &MIB) { MIB.addReg(BasePtr); },
3244       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset.getValue()); },
3245       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // slc
3246     }};
3247 }
3248 
3249 InstructionSelector::ComplexRendererFns
3250 AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
3251   return selectFlatOffsetImpl<false>(Root);
3252 }
3253 
3254 InstructionSelector::ComplexRendererFns
3255 AMDGPUInstructionSelector::selectFlatOffsetSigned(MachineOperand &Root) const {
3256   return selectFlatOffsetImpl<true>(Root);
3257 }
3258 
3259 static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) {
3260   auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>();
3261   return PSV && PSV->isStack();
3262 }
3263 
3264 InstructionSelector::ComplexRendererFns
3265 AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
3266   MachineInstr *MI = Root.getParent();
3267   MachineBasicBlock *MBB = MI->getParent();
3268   MachineFunction *MF = MBB->getParent();
3269   const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3270 
3271   int64_t Offset = 0;
3272   if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
3273       Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) {
3274     Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3275 
3276     // TODO: Should this be inside the render function? The iterator seems to
3277     // move.
3278     BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
3279             HighBits)
3280       .addImm(Offset & ~4095);
3281 
3282     return {{[=](MachineInstrBuilder &MIB) { // rsrc
3283                MIB.addReg(Info->getScratchRSrcReg());
3284              },
3285              [=](MachineInstrBuilder &MIB) { // vaddr
3286                MIB.addReg(HighBits);
3287              },
3288              [=](MachineInstrBuilder &MIB) { // soffset
3289                const MachineMemOperand *MMO = *MI->memoperands_begin();
3290                const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();
3291 
3292                if (isStackPtrRelative(PtrInfo))
3293                  MIB.addReg(Info->getStackPtrOffsetReg());
3294                else
3295                  MIB.addImm(0);
3296              },
3297              [=](MachineInstrBuilder &MIB) { // offset
3298                MIB.addImm(Offset & 4095);
3299              }}};
3300   }
3301 
3302   assert(Offset == 0 || Offset == -1);
3303 
3304   // Try to fold a frame index directly into the MUBUF vaddr field, and any
3305   // offsets.
3306   Optional<int> FI;
3307   Register VAddr = Root.getReg();
3308   if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) {
3309     if (isBaseWithConstantOffset(Root, *MRI)) {
3310       const MachineOperand &LHS = RootDef->getOperand(1);
3311       const MachineOperand &RHS = RootDef->getOperand(2);
3312       const MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg());
3313       const MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg());
3314       if (LHSDef && RHSDef) {
3315         int64_t PossibleOffset =
3316             RHSDef->getOperand(1).getCImm()->getSExtValue();
3317         if (SIInstrInfo::isLegalMUBUFImmOffset(PossibleOffset) &&
3318             (!STI.privateMemoryResourceIsRangeChecked() ||
3319              KnownBits->signBitIsZero(LHS.getReg()))) {
3320           if (LHSDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
3321             FI = LHSDef->getOperand(1).getIndex();
3322           else
3323             VAddr = LHS.getReg();
3324           Offset = PossibleOffset;
3325         }
3326       }
3327     } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
3328       FI = RootDef->getOperand(1).getIndex();
3329     }
3330   }
3331 
3332   return {{[=](MachineInstrBuilder &MIB) { // rsrc
3333              MIB.addReg(Info->getScratchRSrcReg());
3334            },
3335            [=](MachineInstrBuilder &MIB) { // vaddr
3336              if (FI.hasValue())
3337                MIB.addFrameIndex(FI.getValue());
3338              else
3339                MIB.addReg(VAddr);
3340            },
3341            [=](MachineInstrBuilder &MIB) { // soffset
3342              // If we don't know this private access is a local stack object, it
3343              // needs to be relative to the entry point's scratch wave offset.
3344              // TODO: Should split large offsets that don't fit like above.
3345              // TODO: Don't use scratch wave offset just because the offset
3346              // didn't fit.
3347              if (!Info->isEntryFunction() && FI.hasValue())
3348                MIB.addReg(Info->getStackPtrOffsetReg());
3349              else
3350                MIB.addImm(0);
3351            },
3352            [=](MachineInstrBuilder &MIB) { // offset
3353              MIB.addImm(Offset);
3354            }}};
3355 }
3356 
3357 bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
3358                                                 int64_t Offset,
3359                                                 unsigned OffsetBits) const {
3360   if ((OffsetBits == 16 && !isUInt<16>(Offset)) ||
3361       (OffsetBits == 8 && !isUInt<8>(Offset)))
3362     return false;
3363 
3364   if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
3365     return true;
3366 
3367   // On Southern Islands instruction with a negative base value and an offset
3368   // don't seem to work.
3369   return KnownBits->signBitIsZero(Base);
3370 }
3371 
3372 InstructionSelector::ComplexRendererFns
3373 AMDGPUInstructionSelector::selectMUBUFScratchOffset(
3374     MachineOperand &Root) const {
3375   MachineInstr *MI = Root.getParent();
3376   MachineBasicBlock *MBB = MI->getParent();
3377 
3378   int64_t Offset = 0;
3379   if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
3380       !SIInstrInfo::isLegalMUBUFImmOffset(Offset))
3381     return {};
3382 
3383   const MachineFunction *MF = MBB->getParent();
3384   const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3385   const MachineMemOperand *MMO = *MI->memoperands_begin();
3386   const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();
3387 
3388   return {{
3389       [=](MachineInstrBuilder &MIB) { // rsrc
3390         MIB.addReg(Info->getScratchRSrcReg());
3391       },
3392       [=](MachineInstrBuilder &MIB) { // soffset
3393         if (isStackPtrRelative(PtrInfo))
3394           MIB.addReg(Info->getStackPtrOffsetReg());
3395         else
3396           MIB.addImm(0);
3397       },
3398       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
3399   }};
3400 }
3401 
3402 std::pair<Register, unsigned>
3403 AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
3404   const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
3405   if (!RootDef)
3406     return std::make_pair(Root.getReg(), 0);
3407 
3408   int64_t ConstAddr = 0;
3409 
3410   Register PtrBase;
3411   int64_t Offset;
3412   std::tie(PtrBase, Offset) =
3413     getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
3414 
3415   if (Offset) {
3416     if (isDSOffsetLegal(PtrBase, Offset, 16)) {
3417       // (add n0, c0)
3418       return std::make_pair(PtrBase, Offset);
3419     }
3420   } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
3421     // TODO
3422 
3423 
3424   } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
3425     // TODO
3426 
3427   }
3428 
3429   return std::make_pair(Root.getReg(), 0);
3430 }
3431 
3432 InstructionSelector::ComplexRendererFns
3433 AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
3434   Register Reg;
3435   unsigned Offset;
3436   std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
3437   return {{
3438       [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3439       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
3440     }};
3441 }
3442 
3443 InstructionSelector::ComplexRendererFns
3444 AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
3445   Register Reg;
3446   unsigned Offset;
3447   std::tie(Reg, Offset) = selectDS64Bit4ByteAlignedImpl(Root);
3448   return {{
3449       [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3450       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
3451       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
3452     }};
3453 }
3454 
3455 std::pair<Register, unsigned>
3456 AMDGPUInstructionSelector::selectDS64Bit4ByteAlignedImpl(MachineOperand &Root) const {
3457   const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
3458   if (!RootDef)
3459     return std::make_pair(Root.getReg(), 0);
3460 
3461   int64_t ConstAddr = 0;
3462 
3463   Register PtrBase;
3464   int64_t Offset;
3465   std::tie(PtrBase, Offset) =
3466     getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
3467 
3468   if (Offset) {
3469     int64_t DWordOffset0 = Offset / 4;
3470     int64_t DWordOffset1 = DWordOffset0 + 1;
3471     if (isDSOffsetLegal(PtrBase, DWordOffset1, 8)) {
3472       // (add n0, c0)
3473       return std::make_pair(PtrBase, DWordOffset0);
3474     }
3475   } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
3476     // TODO
3477 
3478   } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
3479     // TODO
3480 
3481   }
3482 
3483   return std::make_pair(Root.getReg(), 0);
3484 }
3485 
3486 /// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
3487 /// the base value with the constant offset. There may be intervening copies
3488 /// between \p Root and the identified constant. Returns \p Root, 0 if this does
3489 /// not match the pattern.
3490 std::pair<Register, int64_t>
3491 AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
3492   Register Root, const MachineRegisterInfo &MRI) const {
3493   MachineInstr *RootI = MRI.getVRegDef(Root);
3494   if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
3495     return {Root, 0};
3496 
3497   MachineOperand &RHS = RootI->getOperand(2);
3498   Optional<ValueAndVReg> MaybeOffset
3499     = getConstantVRegValWithLookThrough(RHS.getReg(), MRI, true);
3500   if (!MaybeOffset)
3501     return {Root, 0};
3502   return {RootI->getOperand(1).getReg(), MaybeOffset->Value};
3503 }
3504 
3505 static void addZeroImm(MachineInstrBuilder &MIB) {
3506   MIB.addImm(0);
3507 }
3508 
3509 /// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
3510 /// BasePtr is not valid, a null base pointer will be used.
3511 static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
3512                           uint32_t FormatLo, uint32_t FormatHi,
3513                           Register BasePtr) {
3514   Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
3515   Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
3516   Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3517   Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
3518 
3519   B.buildInstr(AMDGPU::S_MOV_B32)
3520     .addDef(RSrc2)
3521     .addImm(FormatLo);
3522   B.buildInstr(AMDGPU::S_MOV_B32)
3523     .addDef(RSrc3)
3524     .addImm(FormatHi);
3525 
3526   // Build the half of the subregister with the constants before building the
3527   // full 128-bit register. If we are building multiple resource descriptors,
3528   // this will allow CSEing of the 2-component register.
3529   B.buildInstr(AMDGPU::REG_SEQUENCE)
3530     .addDef(RSrcHi)
3531     .addReg(RSrc2)
3532     .addImm(AMDGPU::sub0)
3533     .addReg(RSrc3)
3534     .addImm(AMDGPU::sub1);
3535 
3536   Register RSrcLo = BasePtr;
3537   if (!BasePtr) {
3538     RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3539     B.buildInstr(AMDGPU::S_MOV_B64)
3540       .addDef(RSrcLo)
3541       .addImm(0);
3542   }
3543 
3544   B.buildInstr(AMDGPU::REG_SEQUENCE)
3545     .addDef(RSrc)
3546     .addReg(RSrcLo)
3547     .addImm(AMDGPU::sub0_sub1)
3548     .addReg(RSrcHi)
3549     .addImm(AMDGPU::sub2_sub3);
3550 
3551   return RSrc;
3552 }
3553 
3554 static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
3555                                 const SIInstrInfo &TII, Register BasePtr) {
3556   uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
3557 
3558   // FIXME: Why are half the "default" bits ignored based on the addressing
3559   // mode?
3560   return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
3561 }
3562 
3563 static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
3564                                const SIInstrInfo &TII, Register BasePtr) {
3565   uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
3566 
3567   // FIXME: Why are half the "default" bits ignored based on the addressing
3568   // mode?
3569   return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
3570 }
3571 
3572 AMDGPUInstructionSelector::MUBUFAddressData
3573 AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
3574   MUBUFAddressData Data;
3575   Data.N0 = Src;
3576 
3577   Register PtrBase;
3578   int64_t Offset;
3579 
3580   std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI);
3581   if (isUInt<32>(Offset)) {
3582     Data.N0 = PtrBase;
3583     Data.Offset = Offset;
3584   }
3585 
3586   if (MachineInstr *InputAdd
3587       = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
3588     Data.N2 = InputAdd->getOperand(1).getReg();
3589     Data.N3 = InputAdd->getOperand(2).getReg();
3590 
3591     // FIXME: Need to fix extra SGPR->VGPRcopies inserted
3592     // FIXME: Don't know this was defined by operand 0
3593     //
3594     // TODO: Remove this when we have copy folding optimizations after
3595     // RegBankSelect.
3596     Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
3597     Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
3598   }
3599 
3600   return Data;
3601 }
3602 
3603 /// Return if the addr64 mubuf mode should be used for the given address.
3604 bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
3605   // (ptr_add N2, N3) -> addr64, or
3606   // (ptr_add (ptr_add N2, N3), C1) -> addr64
3607   if (Addr.N2)
3608     return true;
3609 
3610   const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
3611   return N0Bank->getID() == AMDGPU::VGPRRegBankID;
3612 }
3613 
3614 /// Split an immediate offset \p ImmOffset depending on whether it fits in the
3615 /// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
3616 /// component.
3617 void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
3618   MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
3619   if (SIInstrInfo::isLegalMUBUFImmOffset(ImmOffset))
3620     return;
3621 
3622   // Illegal offset, store it in soffset.
3623   SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3624   B.buildInstr(AMDGPU::S_MOV_B32)
3625     .addDef(SOffset)
3626     .addImm(ImmOffset);
3627   ImmOffset = 0;
3628 }
3629 
3630 bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
3631   MachineOperand &Root, Register &VAddr, Register &RSrcReg,
3632   Register &SOffset, int64_t &Offset) const {
3633   // FIXME: Predicates should stop this from reaching here.
3634   // addr64 bit was removed for volcanic islands.
3635   if (!STI.hasAddr64() || STI.useFlatForGlobal())
3636     return false;
3637 
3638   MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
3639   if (!shouldUseAddr64(AddrData))
3640     return false;
3641 
3642   Register N0 = AddrData.N0;
3643   Register N2 = AddrData.N2;
3644   Register N3 = AddrData.N3;
3645   Offset = AddrData.Offset;
3646 
3647   // Base pointer for the SRD.
3648   Register SRDPtr;
3649 
3650   if (N2) {
3651     if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
3652       assert(N3);
3653       if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
3654         // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
3655         // addr64, and construct the default resource from a 0 address.
3656         VAddr = N0;
3657       } else {
3658         SRDPtr = N3;
3659         VAddr = N2;
3660       }
3661     } else {
3662       // N2 is not divergent.
3663       SRDPtr = N2;
3664       VAddr = N3;
3665     }
3666   } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
3667     // Use the default null pointer in the resource
3668     VAddr = N0;
3669   } else {
3670     // N0 -> offset, or
3671     // (N0 + C1) -> offset
3672     SRDPtr = N0;
3673   }
3674 
3675   MachineIRBuilder B(*Root.getParent());
3676   RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
3677   splitIllegalMUBUFOffset(B, SOffset, Offset);
3678   return true;
3679 }
3680 
3681 bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
3682   MachineOperand &Root, Register &RSrcReg, Register &SOffset,
3683   int64_t &Offset) const {
3684   MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
3685   if (shouldUseAddr64(AddrData))
3686     return false;
3687 
3688   // N0 -> offset, or
3689   // (N0 + C1) -> offset
3690   Register SRDPtr = AddrData.N0;
3691   Offset = AddrData.Offset;
3692 
3693   // TODO: Look through extensions for 32-bit soffset.
3694   MachineIRBuilder B(*Root.getParent());
3695 
3696   RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
3697   splitIllegalMUBUFOffset(B, SOffset, Offset);
3698   return true;
3699 }
3700 
3701 InstructionSelector::ComplexRendererFns
3702 AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
3703   Register VAddr;
3704   Register RSrcReg;
3705   Register SOffset;
3706   int64_t Offset = 0;
3707 
3708   if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
3709     return {};
3710 
3711   // FIXME: Use defaulted operands for trailing 0s and remove from the complex
3712   // pattern.
3713   return {{
3714       [=](MachineInstrBuilder &MIB) {  // rsrc
3715         MIB.addReg(RSrcReg);
3716       },
3717       [=](MachineInstrBuilder &MIB) { // vaddr
3718         MIB.addReg(VAddr);
3719       },
3720       [=](MachineInstrBuilder &MIB) { // soffset
3721         if (SOffset)
3722           MIB.addReg(SOffset);
3723         else
3724           MIB.addImm(0);
3725       },
3726       [=](MachineInstrBuilder &MIB) { // offset
3727         MIB.addImm(Offset);
3728       },
3729       addZeroImm, //  glc
3730       addZeroImm, //  slc
3731       addZeroImm, //  tfe
3732       addZeroImm, //  dlc
3733       addZeroImm  //  swz
3734     }};
3735 }
3736 
3737 InstructionSelector::ComplexRendererFns
3738 AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
3739   Register RSrcReg;
3740   Register SOffset;
3741   int64_t Offset = 0;
3742 
3743   if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
3744     return {};
3745 
3746   return {{
3747       [=](MachineInstrBuilder &MIB) {  // rsrc
3748         MIB.addReg(RSrcReg);
3749       },
3750       [=](MachineInstrBuilder &MIB) { // soffset
3751         if (SOffset)
3752           MIB.addReg(SOffset);
3753         else
3754           MIB.addImm(0);
3755       },
3756       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
3757       addZeroImm, //  glc
3758       addZeroImm, //  slc
3759       addZeroImm, //  tfe
3760       addZeroImm, //  dlc
3761       addZeroImm  //  swz
3762     }};
3763 }
3764 
3765 InstructionSelector::ComplexRendererFns
3766 AMDGPUInstructionSelector::selectMUBUFAddr64Atomic(MachineOperand &Root) const {
3767   Register VAddr;
3768   Register RSrcReg;
3769   Register SOffset;
3770   int64_t Offset = 0;
3771 
3772   if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
3773     return {};
3774 
3775   // FIXME: Use defaulted operands for trailing 0s and remove from the complex
3776   // pattern.
3777   return {{
3778       [=](MachineInstrBuilder &MIB) {  // rsrc
3779         MIB.addReg(RSrcReg);
3780       },
3781       [=](MachineInstrBuilder &MIB) { // vaddr
3782         MIB.addReg(VAddr);
3783       },
3784       [=](MachineInstrBuilder &MIB) { // soffset
3785         if (SOffset)
3786           MIB.addReg(SOffset);
3787         else
3788           MIB.addImm(0);
3789       },
3790       [=](MachineInstrBuilder &MIB) { // offset
3791         MIB.addImm(Offset);
3792       },
3793       addZeroImm //  slc
3794     }};
3795 }
3796 
3797 InstructionSelector::ComplexRendererFns
3798 AMDGPUInstructionSelector::selectMUBUFOffsetAtomic(MachineOperand &Root) const {
3799   Register RSrcReg;
3800   Register SOffset;
3801   int64_t Offset = 0;
3802 
3803   if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
3804     return {};
3805 
3806   return {{
3807       [=](MachineInstrBuilder &MIB) {  // rsrc
3808         MIB.addReg(RSrcReg);
3809       },
3810       [=](MachineInstrBuilder &MIB) { // soffset
3811         if (SOffset)
3812           MIB.addReg(SOffset);
3813         else
3814           MIB.addImm(0);
3815       },
3816       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
3817       addZeroImm //  slc
3818     }};
3819 }
3820 
3821 /// Get an immediate that must be 32-bits, and treated as zero extended.
3822 static Optional<uint64_t> getConstantZext32Val(Register Reg,
3823                                                const MachineRegisterInfo &MRI) {
3824   // getConstantVRegVal sexts any values, so see if that matters.
3825   Optional<int64_t> OffsetVal = getConstantVRegVal(Reg, MRI);
3826   if (!OffsetVal || !isInt<32>(*OffsetVal))
3827     return None;
3828   return Lo_32(*OffsetVal);
3829 }
3830 
3831 InstructionSelector::ComplexRendererFns
3832 AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
3833   Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
3834   if (!OffsetVal)
3835     return {};
3836 
3837   Optional<int64_t> EncodedImm =
3838       AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
3839   if (!EncodedImm)
3840     return {};
3841 
3842   return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }  }};
3843 }
3844 
3845 InstructionSelector::ComplexRendererFns
3846 AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
3847   assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
3848 
3849   Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
3850   if (!OffsetVal)
3851     return {};
3852 
3853   Optional<int64_t> EncodedImm
3854     = AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
3855   if (!EncodedImm)
3856     return {};
3857 
3858   return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }  }};
3859 }
3860 
3861 void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
3862                                                  const MachineInstr &MI,
3863                                                  int OpIdx) const {
3864   assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
3865          "Expected G_CONSTANT");
3866   MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
3867 }
3868 
3869 void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
3870                                                 const MachineInstr &MI,
3871                                                 int OpIdx) const {
3872   assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
3873          "Expected G_CONSTANT");
3874   MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
3875 }
3876 
3877 void AMDGPUInstructionSelector::renderBitcastImm(MachineInstrBuilder &MIB,
3878                                                  const MachineInstr &MI,
3879                                                  int OpIdx) const {
3880   assert(OpIdx == -1);
3881 
3882   const MachineOperand &Op = MI.getOperand(1);
3883   if (MI.getOpcode() == TargetOpcode::G_FCONSTANT)
3884     MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
3885   else {
3886     assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
3887     MIB.addImm(Op.getCImm()->getSExtValue());
3888   }
3889 }
3890 
3891 void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
3892                                                 const MachineInstr &MI,
3893                                                 int OpIdx) const {
3894   assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
3895          "Expected G_CONSTANT");
3896   MIB.addImm(MI.getOperand(1).getCImm()->getValue().countPopulation());
3897 }
3898 
3899 /// This only really exists to satisfy DAG type checking machinery, so is a
3900 /// no-op here.
3901 void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
3902                                                 const MachineInstr &MI,
3903                                                 int OpIdx) const {
3904   MIB.addImm(MI.getOperand(OpIdx).getImm());
3905 }
3906 
3907 void AMDGPUInstructionSelector::renderExtractGLC(MachineInstrBuilder &MIB,
3908                                                  const MachineInstr &MI,
3909                                                  int OpIdx) const {
3910   assert(OpIdx >= 0 && "expected to match an immediate operand");
3911   MIB.addImm(MI.getOperand(OpIdx).getImm() & 1);
3912 }
3913 
3914 void AMDGPUInstructionSelector::renderExtractSLC(MachineInstrBuilder &MIB,
3915                                                  const MachineInstr &MI,
3916                                                  int OpIdx) const {
3917   assert(OpIdx >= 0 && "expected to match an immediate operand");
3918   MIB.addImm((MI.getOperand(OpIdx).getImm() >> 1) & 1);
3919 }
3920 
3921 void AMDGPUInstructionSelector::renderExtractDLC(MachineInstrBuilder &MIB,
3922                                                  const MachineInstr &MI,
3923                                                  int OpIdx) const {
3924   assert(OpIdx >= 0 && "expected to match an immediate operand");
3925   MIB.addImm((MI.getOperand(OpIdx).getImm() >> 2) & 1);
3926 }
3927 
3928 void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
3929                                                  const MachineInstr &MI,
3930                                                  int OpIdx) const {
3931   assert(OpIdx >= 0 && "expected to match an immediate operand");
3932   MIB.addImm((MI.getOperand(OpIdx).getImm() >> 3) & 1);
3933 }
3934 
3935 void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
3936                                                  const MachineInstr &MI,
3937                                                  int OpIdx) const {
3938   MIB.addFrameIndex((MI.getOperand(1).getIndex()));
3939 }
3940 
3941 bool AMDGPUInstructionSelector::isInlineImmediate16(int64_t Imm) const {
3942   return AMDGPU::isInlinableLiteral16(Imm, STI.hasInv2PiInlineImm());
3943 }
3944 
3945 bool AMDGPUInstructionSelector::isInlineImmediate32(int64_t Imm) const {
3946   return AMDGPU::isInlinableLiteral32(Imm, STI.hasInv2PiInlineImm());
3947 }
3948 
3949 bool AMDGPUInstructionSelector::isInlineImmediate64(int64_t Imm) const {
3950   return AMDGPU::isInlinableLiteral64(Imm, STI.hasInv2PiInlineImm());
3951 }
3952 
3953 bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
3954   return TII.isInlineConstant(Imm);
3955 }
3956