1 //===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the InstructionSelector class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPUInstructionSelector.h"
15 #include "AMDGPUInstrInfo.h"
16 #include "AMDGPUGlobalISelUtils.h"
17 #include "AMDGPURegisterBankInfo.h"
18 #include "AMDGPUSubtarget.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
21 #include "SIMachineFunctionInfo.h"
22 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
23 #include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
24 #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
25 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
26 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
27 #include "llvm/CodeGen/GlobalISel/Utils.h"
28 #include "llvm/CodeGen/MachineBasicBlock.h"
29 #include "llvm/CodeGen/MachineFunction.h"
30 #include "llvm/CodeGen/MachineInstr.h"
31 #include "llvm/CodeGen/MachineInstrBuilder.h"
32 #include "llvm/CodeGen/MachineRegisterInfo.h"
33 #include "llvm/IR/Type.h"
34 #include "llvm/Support/Debug.h"
35 #include "llvm/Support/raw_ostream.h"
36 
37 #define DEBUG_TYPE "amdgpu-isel"
38 
39 using namespace llvm;
40 using namespace MIPatternMatch;
41 
42 static cl::opt<bool> AllowRiskySelect(
43   "amdgpu-global-isel-risky-select",
44   cl::desc("Allow GlobalISel to select cases that are likely to not work yet"),
45   cl::init(false),
46   cl::ReallyHidden);
47 
48 #define GET_GLOBALISEL_IMPL
49 #define AMDGPUSubtarget GCNSubtarget
50 #include "AMDGPUGenGlobalISel.inc"
51 #undef GET_GLOBALISEL_IMPL
52 #undef AMDGPUSubtarget
53 
54 AMDGPUInstructionSelector::AMDGPUInstructionSelector(
55     const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
56     const AMDGPUTargetMachine &TM)
57     : InstructionSelector(), TII(*STI.getInstrInfo()),
58       TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
59       STI(STI),
60       EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG),
61 #define GET_GLOBALISEL_PREDICATES_INIT
62 #include "AMDGPUGenGlobalISel.inc"
63 #undef GET_GLOBALISEL_PREDICATES_INIT
64 #define GET_GLOBALISEL_TEMPORARIES_INIT
65 #include "AMDGPUGenGlobalISel.inc"
66 #undef GET_GLOBALISEL_TEMPORARIES_INIT
67 {
68 }
69 
70 const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
71 
72 void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits &KB,
73                                         CodeGenCoverage &CoverageInfo) {
74   MRI = &MF.getRegInfo();
75   InstructionSelector::setupMF(MF, KB, CoverageInfo);
76 }
77 
78 bool AMDGPUInstructionSelector::isVCC(Register Reg,
79                                       const MachineRegisterInfo &MRI) const {
80   // The verifier is oblivious to s1 being a valid value for wavesize registers.
81   if (Reg.isPhysical())
82     return false;
83 
84   auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
85   const TargetRegisterClass *RC =
86       RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
87   if (RC) {
88     const LLT Ty = MRI.getType(Reg);
89     return RC->hasSuperClassEq(TRI.getBoolRC()) &&
90            Ty.isValid() && Ty.getSizeInBits() == 1;
91   }
92 
93   const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
94   return RB->getID() == AMDGPU::VCCRegBankID;
95 }
96 
97 bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
98                                                         unsigned NewOpc) const {
99   MI.setDesc(TII.get(NewOpc));
100   MI.RemoveOperand(1); // Remove intrinsic ID.
101   MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
102 
103   MachineOperand &Dst = MI.getOperand(0);
104   MachineOperand &Src = MI.getOperand(1);
105 
106   // TODO: This should be legalized to s32 if needed
107   if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
108     return false;
109 
110   const TargetRegisterClass *DstRC
111     = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
112   const TargetRegisterClass *SrcRC
113     = TRI.getConstrainedRegClassForOperand(Src, *MRI);
114   if (!DstRC || DstRC != SrcRC)
115     return false;
116 
117   return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
118          RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
119 }
120 
121 bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
122   const DebugLoc &DL = I.getDebugLoc();
123   MachineBasicBlock *BB = I.getParent();
124   I.setDesc(TII.get(TargetOpcode::COPY));
125 
126   const MachineOperand &Src = I.getOperand(1);
127   MachineOperand &Dst = I.getOperand(0);
128   Register DstReg = Dst.getReg();
129   Register SrcReg = Src.getReg();
130 
131   if (isVCC(DstReg, *MRI)) {
132     if (SrcReg == AMDGPU::SCC) {
133       const TargetRegisterClass *RC
134         = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
135       if (!RC)
136         return true;
137       return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
138     }
139 
140     if (!isVCC(SrcReg, *MRI)) {
141       // TODO: Should probably leave the copy and let copyPhysReg expand it.
142       if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
143         return false;
144 
145       const TargetRegisterClass *SrcRC
146         = TRI.getConstrainedRegClassForOperand(Src, *MRI);
147 
148       Register MaskedReg = MRI->createVirtualRegister(SrcRC);
149 
150       // We can't trust the high bits at this point, so clear them.
151 
152       // TODO: Skip masking high bits if def is known boolean.
153 
154       unsigned AndOpc = TRI.isSGPRClass(SrcRC) ?
155         AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
156       BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
157         .addImm(1)
158         .addReg(SrcReg);
159       BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
160         .addImm(0)
161         .addReg(MaskedReg);
162 
163       if (!MRI->getRegClassOrNull(SrcReg))
164         MRI->setRegClass(SrcReg, SrcRC);
165       I.eraseFromParent();
166       return true;
167     }
168 
169     const TargetRegisterClass *RC =
170       TRI.getConstrainedRegClassForOperand(Dst, *MRI);
171     if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
172       return false;
173 
174     return true;
175   }
176 
177   for (const MachineOperand &MO : I.operands()) {
178     if (MO.getReg().isPhysical())
179       continue;
180 
181     const TargetRegisterClass *RC =
182             TRI.getConstrainedRegClassForOperand(MO, *MRI);
183     if (!RC)
184       continue;
185     RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
186   }
187   return true;
188 }
189 
190 bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
191   const Register DefReg = I.getOperand(0).getReg();
192   const LLT DefTy = MRI->getType(DefReg);
193   if (DefTy == LLT::scalar(1)) {
194     if (!AllowRiskySelect) {
195       LLVM_DEBUG(dbgs() << "Skipping risky boolean phi\n");
196       return false;
197     }
198 
199     LLVM_DEBUG(dbgs() << "Selecting risky boolean phi\n");
200   }
201 
202   // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
203 
204   const RegClassOrRegBank &RegClassOrBank =
205     MRI->getRegClassOrRegBank(DefReg);
206 
207   const TargetRegisterClass *DefRC
208     = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
209   if (!DefRC) {
210     if (!DefTy.isValid()) {
211       LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
212       return false;
213     }
214 
215     const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
216     DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, *MRI);
217     if (!DefRC) {
218       LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
219       return false;
220     }
221   }
222 
223   // TODO: Verify that all registers have the same bank
224   I.setDesc(TII.get(TargetOpcode::PHI));
225   return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
226 }
227 
228 MachineOperand
229 AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
230                                            const TargetRegisterClass &SubRC,
231                                            unsigned SubIdx) const {
232 
233   MachineInstr *MI = MO.getParent();
234   MachineBasicBlock *BB = MO.getParent()->getParent();
235   Register DstReg = MRI->createVirtualRegister(&SubRC);
236 
237   if (MO.isReg()) {
238     unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
239     Register Reg = MO.getReg();
240     BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
241             .addReg(Reg, 0, ComposedSubIdx);
242 
243     return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
244                                      MO.isKill(), MO.isDead(), MO.isUndef(),
245                                      MO.isEarlyClobber(), 0, MO.isDebug(),
246                                      MO.isInternalRead());
247   }
248 
249   assert(MO.isImm());
250 
251   APInt Imm(64, MO.getImm());
252 
253   switch (SubIdx) {
254   default:
255     llvm_unreachable("do not know to split immediate with this sub index.");
256   case AMDGPU::sub0:
257     return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
258   case AMDGPU::sub1:
259     return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
260   }
261 }
262 
263 static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
264   switch (Opc) {
265   case AMDGPU::G_AND:
266     return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
267   case AMDGPU::G_OR:
268     return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
269   case AMDGPU::G_XOR:
270     return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
271   default:
272     llvm_unreachable("not a bit op");
273   }
274 }
275 
276 bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
277   Register DstReg = I.getOperand(0).getReg();
278   unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
279 
280   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
281   if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
282       DstRB->getID() != AMDGPU::VCCRegBankID)
283     return false;
284 
285   bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
286                             STI.isWave64());
287   I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));
288 
289   // Dead implicit-def of scc
290   I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
291                                          true, // isImp
292                                          false, // isKill
293                                          true)); // isDead
294   return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
295 }
296 
297 bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
298   MachineBasicBlock *BB = I.getParent();
299   MachineFunction *MF = BB->getParent();
300   Register DstReg = I.getOperand(0).getReg();
301   const DebugLoc &DL = I.getDebugLoc();
302   LLT Ty = MRI->getType(DstReg);
303   if (Ty.isVector())
304     return false;
305 
306   unsigned Size = Ty.getSizeInBits();
307   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
308   const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
309   const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;
310 
311   if (Size == 32) {
312     if (IsSALU) {
313       const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
314       MachineInstr *Add =
315         BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
316         .add(I.getOperand(1))
317         .add(I.getOperand(2));
318       I.eraseFromParent();
319       return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
320     }
321 
322     if (STI.hasAddNoCarry()) {
323       const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
324       I.setDesc(TII.get(Opc));
325       I.addOperand(*MF, MachineOperand::CreateImm(0));
326       I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
327       return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
328     }
329 
330     const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;
331 
332     Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
333     MachineInstr *Add
334       = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
335       .addDef(UnusedCarry, RegState::Dead)
336       .add(I.getOperand(1))
337       .add(I.getOperand(2))
338       .addImm(0);
339     I.eraseFromParent();
340     return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
341   }
342 
343   assert(!Sub && "illegal sub should not reach here");
344 
345   const TargetRegisterClass &RC
346     = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
347   const TargetRegisterClass &HalfRC
348     = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;
349 
350   MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
351   MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
352   MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
353   MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));
354 
355   Register DstLo = MRI->createVirtualRegister(&HalfRC);
356   Register DstHi = MRI->createVirtualRegister(&HalfRC);
357 
358   if (IsSALU) {
359     BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
360       .add(Lo1)
361       .add(Lo2);
362     BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
363       .add(Hi1)
364       .add(Hi2);
365   } else {
366     const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
367     Register CarryReg = MRI->createVirtualRegister(CarryRC);
368     BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
369       .addDef(CarryReg)
370       .add(Lo1)
371       .add(Lo2)
372       .addImm(0);
373     MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
374       .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
375       .add(Hi1)
376       .add(Hi2)
377       .addReg(CarryReg, RegState::Kill)
378       .addImm(0);
379 
380     if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
381       return false;
382   }
383 
384   BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
385     .addReg(DstLo)
386     .addImm(AMDGPU::sub0)
387     .addReg(DstHi)
388     .addImm(AMDGPU::sub1);
389 
390 
391   if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
392     return false;
393 
394   I.eraseFromParent();
395   return true;
396 }
397 
398 bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
399   MachineInstr &I) const {
400   MachineBasicBlock *BB = I.getParent();
401   MachineFunction *MF = BB->getParent();
402   const DebugLoc &DL = I.getDebugLoc();
403   Register Dst0Reg = I.getOperand(0).getReg();
404   Register Dst1Reg = I.getOperand(1).getReg();
405   const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
406                      I.getOpcode() == AMDGPU::G_UADDE;
407   const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
408                           I.getOpcode() == AMDGPU::G_USUBE;
409 
410   if (isVCC(Dst1Reg, *MRI)) {
411     unsigned NoCarryOpc =
412         IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
413     unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
414     I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
415     I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
416     I.addOperand(*MF, MachineOperand::CreateImm(0));
417     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
418   }
419 
420   Register Src0Reg = I.getOperand(2).getReg();
421   Register Src1Reg = I.getOperand(3).getReg();
422 
423   if (HasCarryIn) {
424     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
425       .addReg(I.getOperand(4).getReg());
426   }
427 
428   unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
429   unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
430 
431   BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
432     .add(I.getOperand(2))
433     .add(I.getOperand(3));
434   BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
435     .addReg(AMDGPU::SCC);
436 
437   if (!MRI->getRegClassOrNull(Dst1Reg))
438     MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
439 
440   if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
441       !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
442       !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
443     return false;
444 
445   if (HasCarryIn &&
446       !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
447                                     AMDGPU::SReg_32RegClass, *MRI))
448     return false;
449 
450   I.eraseFromParent();
451   return true;
452 }
453 
454 // TODO: We should probably legalize these to only using 32-bit results.
455 bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
456   MachineBasicBlock *BB = I.getParent();
457   Register DstReg = I.getOperand(0).getReg();
458   Register SrcReg = I.getOperand(1).getReg();
459   LLT DstTy = MRI->getType(DstReg);
460   LLT SrcTy = MRI->getType(SrcReg);
461   const unsigned SrcSize = SrcTy.getSizeInBits();
462   unsigned DstSize = DstTy.getSizeInBits();
463 
464   // TODO: Should handle any multiple of 32 offset.
465   unsigned Offset = I.getOperand(2).getImm();
466   if (Offset % 32 != 0 || DstSize > 128)
467     return false;
468 
469   // 16-bit operations really use 32-bit registers.
470   // FIXME: Probably should not allow 16-bit G_EXTRACT results.
471   if (DstSize == 16)
472     DstSize = 32;
473 
474   const TargetRegisterClass *DstRC =
475     TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
476   if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
477     return false;
478 
479   const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
480   const TargetRegisterClass *SrcRC =
481     TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
482   if (!SrcRC)
483     return false;
484   unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
485                                                          DstSize / 32);
486   SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
487   if (!SrcRC)
488     return false;
489 
490   SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
491                                     *SrcRC, I.getOperand(1));
492   const DebugLoc &DL = I.getDebugLoc();
493   BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
494     .addReg(SrcReg, 0, SubReg);
495 
496   I.eraseFromParent();
497   return true;
498 }
499 
500 bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
501   MachineBasicBlock *BB = MI.getParent();
502   Register DstReg = MI.getOperand(0).getReg();
503   LLT DstTy = MRI->getType(DstReg);
504   LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());
505 
506   const unsigned SrcSize = SrcTy.getSizeInBits();
507   if (SrcSize < 32)
508     return selectImpl(MI, *CoverageInfo);
509 
510   const DebugLoc &DL = MI.getDebugLoc();
511   const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
512   const unsigned DstSize = DstTy.getSizeInBits();
513   const TargetRegisterClass *DstRC =
514     TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
515   if (!DstRC)
516     return false;
517 
518   ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
519   MachineInstrBuilder MIB =
520     BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
521   for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
522     MachineOperand &Src = MI.getOperand(I + 1);
523     MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
524     MIB.addImm(SubRegs[I]);
525 
526     const TargetRegisterClass *SrcRC
527       = TRI.getConstrainedRegClassForOperand(Src, *MRI);
528     if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
529       return false;
530   }
531 
532   if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
533     return false;
534 
535   MI.eraseFromParent();
536   return true;
537 }
538 
539 bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
540   MachineBasicBlock *BB = MI.getParent();
541   const int NumDst = MI.getNumOperands() - 1;
542 
543   MachineOperand &Src = MI.getOperand(NumDst);
544 
545   Register SrcReg = Src.getReg();
546   Register DstReg0 = MI.getOperand(0).getReg();
547   LLT DstTy = MRI->getType(DstReg0);
548   LLT SrcTy = MRI->getType(SrcReg);
549 
550   const unsigned DstSize = DstTy.getSizeInBits();
551   const unsigned SrcSize = SrcTy.getSizeInBits();
552   const DebugLoc &DL = MI.getDebugLoc();
553   const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
554 
555   const TargetRegisterClass *SrcRC =
556     TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
557   if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
558     return false;
559 
560   // Note we could have mixed SGPR and VGPR destination banks for an SGPR
561   // source, and this relies on the fact that the same subregister indices are
562   // used for both.
563   ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
564   for (int I = 0, E = NumDst; I != E; ++I) {
565     MachineOperand &Dst = MI.getOperand(I);
566     BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
567       .addReg(SrcReg, 0, SubRegs[I]);
568 
569     // Make sure the subregister index is valid for the source register.
570     SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
571     if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
572       return false;
573 
574     const TargetRegisterClass *DstRC =
575       TRI.getConstrainedRegClassForOperand(Dst, *MRI);
576     if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
577       return false;
578   }
579 
580   MI.eraseFromParent();
581   return true;
582 }
583 
584 bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC(
585   MachineInstr &MI) const {
586   if (selectImpl(MI, *CoverageInfo))
587     return true;
588 
589   const LLT S32 = LLT::scalar(32);
590   const LLT V2S16 = LLT::vector(2, 16);
591 
592   Register Dst = MI.getOperand(0).getReg();
593   if (MRI->getType(Dst) != V2S16)
594     return false;
595 
596   const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
597   if (DstBank->getID() != AMDGPU::SGPRRegBankID)
598     return false;
599 
600   Register Src0 = MI.getOperand(1).getReg();
601   Register Src1 = MI.getOperand(2).getReg();
602   if (MRI->getType(Src0) != S32)
603     return false;
604 
605   const DebugLoc &DL = MI.getDebugLoc();
606   MachineBasicBlock *BB = MI.getParent();
607 
608   auto ConstSrc1 = getConstantVRegValWithLookThrough(Src1, *MRI, true, true);
609   if (ConstSrc1) {
610     auto ConstSrc0 = getConstantVRegValWithLookThrough(Src0, *MRI, true, true);
611     if (ConstSrc0) {
612       uint32_t Lo16 = static_cast<uint32_t>(ConstSrc0->Value) & 0xffff;
613       uint32_t Hi16 = static_cast<uint32_t>(ConstSrc1->Value) & 0xffff;
614 
615       BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst)
616         .addImm(Lo16 | (Hi16 << 16));
617       MI.eraseFromParent();
618       return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
619     }
620   }
621 
622   // TODO: This should probably be a combine somewhere
623   // (build_vector_trunc $src0, undef -> copy $src0
624   MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
625   if (Src1Def && Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
626     MI.setDesc(TII.get(AMDGPU::COPY));
627     MI.RemoveOperand(2);
628     return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI) &&
629            RBI.constrainGenericRegister(Src0, AMDGPU::SReg_32RegClass, *MRI);
630   }
631 
632   Register ShiftSrc0;
633   Register ShiftSrc1;
634   int64_t ShiftAmt;
635 
636   // With multiple uses of the shift, this will duplicate the shift and
637   // increase register pressure.
638   //
639   // (build_vector_trunc (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16)
640   //  => (S_PACK_HH_B32_B16 $src0, $src1)
641   // (build_vector_trunc $src0, (lshr_oneuse SReg_32:$src1, 16))
642   //  => (S_PACK_LH_B32_B16 $src0, $src1)
643   // (build_vector_trunc $src0, $src1)
644   //  => (S_PACK_LL_B32_B16 $src0, $src1)
645 
646   // FIXME: This is an inconvenient way to check a specific value
647   bool Shift0 = mi_match(
648     Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_ICst(ShiftAmt)))) &&
649     ShiftAmt == 16;
650 
651   bool Shift1 = mi_match(
652     Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_ICst(ShiftAmt)))) &&
653     ShiftAmt == 16;
654 
655   unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
656   if (Shift0 && Shift1) {
657     Opc = AMDGPU::S_PACK_HH_B32_B16;
658     MI.getOperand(1).setReg(ShiftSrc0);
659     MI.getOperand(2).setReg(ShiftSrc1);
660   } else if (Shift1) {
661     Opc = AMDGPU::S_PACK_LH_B32_B16;
662     MI.getOperand(2).setReg(ShiftSrc1);
663   } else if (Shift0 && ConstSrc1 && ConstSrc1->Value == 0) {
664     // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
665     auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
666       .addReg(ShiftSrc0)
667       .addImm(16);
668 
669     MI.eraseFromParent();
670     return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
671   }
672 
673   MI.setDesc(TII.get(Opc));
674   return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
675 }
676 
677 bool AMDGPUInstructionSelector::selectG_PTR_ADD(MachineInstr &I) const {
678   return selectG_ADD_SUB(I);
679 }
680 
681 bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
682   const MachineOperand &MO = I.getOperand(0);
683 
684   // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
685   // regbank check here is to know why getConstrainedRegClassForOperand failed.
686   const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
687   if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
688       (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
689     I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
690     return true;
691   }
692 
693   return false;
694 }
695 
696 bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
697   MachineBasicBlock *BB = I.getParent();
698 
699   Register DstReg = I.getOperand(0).getReg();
700   Register Src0Reg = I.getOperand(1).getReg();
701   Register Src1Reg = I.getOperand(2).getReg();
702   LLT Src1Ty = MRI->getType(Src1Reg);
703 
704   unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
705   unsigned InsSize = Src1Ty.getSizeInBits();
706 
707   int64_t Offset = I.getOperand(3).getImm();
708 
709   // FIXME: These cases should have been illegal and unnecessary to check here.
710   if (Offset % 32 != 0 || InsSize % 32 != 0)
711     return false;
712 
713   // Currently not handled by getSubRegFromChannel.
714   if (InsSize > 128)
715     return false;
716 
717   unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
718   if (SubReg == AMDGPU::NoSubRegister)
719     return false;
720 
721   const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
722   const TargetRegisterClass *DstRC =
723     TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
724   if (!DstRC)
725     return false;
726 
727   const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
728   const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
729   const TargetRegisterClass *Src0RC =
730     TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank, *MRI);
731   const TargetRegisterClass *Src1RC =
732     TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank, *MRI);
733 
734   // Deal with weird cases where the class only partially supports the subreg
735   // index.
736   Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
737   if (!Src0RC || !Src1RC)
738     return false;
739 
740   if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
741       !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
742       !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
743     return false;
744 
745   const DebugLoc &DL = I.getDebugLoc();
746   BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
747     .addReg(Src0Reg)
748     .addReg(Src1Reg)
749     .addImm(SubReg);
750 
751   I.eraseFromParent();
752   return true;
753 }
754 
755 bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
756   if (STI.getLDSBankCount() != 16)
757     return selectImpl(MI, *CoverageInfo);
758 
759   Register Dst = MI.getOperand(0).getReg();
760   Register Src0 = MI.getOperand(2).getReg();
761   Register M0Val = MI.getOperand(6).getReg();
762   if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
763       !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
764       !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
765     return false;
766 
767   // This requires 2 instructions. It is possible to write a pattern to support
768   // this, but the generated isel emitter doesn't correctly deal with multiple
769   // output instructions using the same physical register input. The copy to m0
770   // is incorrectly placed before the second instruction.
771   //
772   // TODO: Match source modifiers.
773 
774   Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
775   const DebugLoc &DL = MI.getDebugLoc();
776   MachineBasicBlock *MBB = MI.getParent();
777 
778   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
779     .addReg(M0Val);
780   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
781     .addImm(2)
782     .addImm(MI.getOperand(4).getImm())  // $attr
783     .addImm(MI.getOperand(3).getImm()); // $attrchan
784 
785   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
786     .addImm(0)                          // $src0_modifiers
787     .addReg(Src0)                       // $src0
788     .addImm(MI.getOperand(4).getImm())  // $attr
789     .addImm(MI.getOperand(3).getImm())  // $attrchan
790     .addImm(0)                          // $src2_modifiers
791     .addReg(InterpMov)                  // $src2 - 2 f16 values selected by high
792     .addImm(MI.getOperand(5).getImm())  // $high
793     .addImm(0)                          // $clamp
794     .addImm(0);                         // $omod
795 
796   MI.eraseFromParent();
797   return true;
798 }
799 
800 // Writelane is special in that it can use SGPR and M0 (which would normally
801 // count as using the constant bus twice - but in this case it is allowed since
802 // the lane selector doesn't count as a use of the constant bus). However, it is
803 // still required to abide by the 1 SGPR rule. Fix this up if we might have
804 // multiple SGPRs.
805 bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
806   // With a constant bus limit of at least 2, there's no issue.
807   if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
808     return selectImpl(MI, *CoverageInfo);
809 
810   MachineBasicBlock *MBB = MI.getParent();
811   const DebugLoc &DL = MI.getDebugLoc();
812   Register VDst = MI.getOperand(0).getReg();
813   Register Val = MI.getOperand(2).getReg();
814   Register LaneSelect = MI.getOperand(3).getReg();
815   Register VDstIn = MI.getOperand(4).getReg();
816 
817   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);
818 
819   Optional<ValueAndVReg> ConstSelect =
820     getConstantVRegValWithLookThrough(LaneSelect, *MRI, true, true);
821   if (ConstSelect) {
822     // The selector has to be an inline immediate, so we can use whatever for
823     // the other operands.
824     MIB.addReg(Val);
825     MIB.addImm(ConstSelect->Value &
826                maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
827   } else {
828     Optional<ValueAndVReg> ConstVal =
829       getConstantVRegValWithLookThrough(Val, *MRI, true, true);
830 
831     // If the value written is an inline immediate, we can get away without a
832     // copy to m0.
833     if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value,
834                                                  STI.hasInv2PiInlineImm())) {
835       MIB.addImm(ConstVal->Value);
836       MIB.addReg(LaneSelect);
837     } else {
838       MIB.addReg(Val);
839 
840       // If the lane selector was originally in a VGPR and copied with
841       // readfirstlane, there's a hazard to read the same SGPR from the
842       // VALU. Constrain to a different SGPR to help avoid needing a nop later.
843       RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);
844 
845       BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
846         .addReg(LaneSelect);
847       MIB.addReg(AMDGPU::M0);
848     }
849   }
850 
851   MIB.addReg(VDstIn);
852 
853   MI.eraseFromParent();
854   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
855 }
856 
857 // We need to handle this here because tablegen doesn't support matching
858 // instructions with multiple outputs.
859 bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
860   Register Dst0 = MI.getOperand(0).getReg();
861   Register Dst1 = MI.getOperand(1).getReg();
862 
863   LLT Ty = MRI->getType(Dst0);
864   unsigned Opc;
865   if (Ty == LLT::scalar(32))
866     Opc = AMDGPU::V_DIV_SCALE_F32;
867   else if (Ty == LLT::scalar(64))
868     Opc = AMDGPU::V_DIV_SCALE_F64;
869   else
870     return false;
871 
872   const DebugLoc &DL = MI.getDebugLoc();
873   MachineBasicBlock *MBB = MI.getParent();
874 
875   Register Numer = MI.getOperand(3).getReg();
876   Register Denom = MI.getOperand(4).getReg();
877   unsigned ChooseDenom = MI.getOperand(5).getImm();
878 
879   Register Src0 = ChooseDenom != 0 ? Numer : Denom;
880 
881   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
882     .addDef(Dst1)
883     .addUse(Src0)
884     .addUse(Denom)
885     .addUse(Numer);
886 
887   MI.eraseFromParent();
888   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
889 }
890 
891 bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
892   unsigned IntrinsicID = I.getIntrinsicID();
893   switch (IntrinsicID) {
894   case Intrinsic::amdgcn_if_break: {
895     MachineBasicBlock *BB = I.getParent();
896 
897     // FIXME: Manually selecting to avoid dealiing with the SReg_1 trick
898     // SelectionDAG uses for wave32 vs wave64.
899     BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
900       .add(I.getOperand(0))
901       .add(I.getOperand(2))
902       .add(I.getOperand(3));
903 
904     Register DstReg = I.getOperand(0).getReg();
905     Register Src0Reg = I.getOperand(2).getReg();
906     Register Src1Reg = I.getOperand(3).getReg();
907 
908     I.eraseFromParent();
909 
910     for (Register Reg : { DstReg, Src0Reg, Src1Reg })
911       MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
912 
913     return true;
914   }
915   case Intrinsic::amdgcn_interp_p1_f16:
916     return selectInterpP1F16(I);
917   case Intrinsic::amdgcn_wqm:
918     return constrainCopyLikeIntrin(I, AMDGPU::WQM);
919   case Intrinsic::amdgcn_softwqm:
920     return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
921   case Intrinsic::amdgcn_wwm:
922     return constrainCopyLikeIntrin(I, AMDGPU::WWM);
923   case Intrinsic::amdgcn_writelane:
924     return selectWritelane(I);
925   case Intrinsic::amdgcn_div_scale:
926     return selectDivScale(I);
927   case Intrinsic::amdgcn_icmp:
928     return selectIntrinsicIcmp(I);
929   case Intrinsic::amdgcn_ballot:
930     return selectBallot(I);
931   case Intrinsic::amdgcn_reloc_constant:
932     return selectRelocConstant(I);
933   case Intrinsic::amdgcn_groupstaticsize:
934     return selectGroupStaticSize(I);
935   case Intrinsic::returnaddress:
936     return selectReturnAddress(I);
937   default:
938     return selectImpl(I, *CoverageInfo);
939   }
940 }
941 
942 static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size) {
943   if (Size != 32 && Size != 64)
944     return -1;
945   switch (P) {
946   default:
947     llvm_unreachable("Unknown condition code!");
948   case CmpInst::ICMP_NE:
949     return Size == 32 ? AMDGPU::V_CMP_NE_U32_e64 : AMDGPU::V_CMP_NE_U64_e64;
950   case CmpInst::ICMP_EQ:
951     return Size == 32 ? AMDGPU::V_CMP_EQ_U32_e64 : AMDGPU::V_CMP_EQ_U64_e64;
952   case CmpInst::ICMP_SGT:
953     return Size == 32 ? AMDGPU::V_CMP_GT_I32_e64 : AMDGPU::V_CMP_GT_I64_e64;
954   case CmpInst::ICMP_SGE:
955     return Size == 32 ? AMDGPU::V_CMP_GE_I32_e64 : AMDGPU::V_CMP_GE_I64_e64;
956   case CmpInst::ICMP_SLT:
957     return Size == 32 ? AMDGPU::V_CMP_LT_I32_e64 : AMDGPU::V_CMP_LT_I64_e64;
958   case CmpInst::ICMP_SLE:
959     return Size == 32 ? AMDGPU::V_CMP_LE_I32_e64 : AMDGPU::V_CMP_LE_I64_e64;
960   case CmpInst::ICMP_UGT:
961     return Size == 32 ? AMDGPU::V_CMP_GT_U32_e64 : AMDGPU::V_CMP_GT_U64_e64;
962   case CmpInst::ICMP_UGE:
963     return Size == 32 ? AMDGPU::V_CMP_GE_U32_e64 : AMDGPU::V_CMP_GE_U64_e64;
964   case CmpInst::ICMP_ULT:
965     return Size == 32 ? AMDGPU::V_CMP_LT_U32_e64 : AMDGPU::V_CMP_LT_U64_e64;
966   case CmpInst::ICMP_ULE:
967     return Size == 32 ? AMDGPU::V_CMP_LE_U32_e64 : AMDGPU::V_CMP_LE_U64_e64;
968   }
969 }
970 
971 int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
972                                               unsigned Size) const {
973   if (Size == 64) {
974     if (!STI.hasScalarCompareEq64())
975       return -1;
976 
977     switch (P) {
978     case CmpInst::ICMP_NE:
979       return AMDGPU::S_CMP_LG_U64;
980     case CmpInst::ICMP_EQ:
981       return AMDGPU::S_CMP_EQ_U64;
982     default:
983       return -1;
984     }
985   }
986 
987   if (Size != 32)
988     return -1;
989 
990   switch (P) {
991   case CmpInst::ICMP_NE:
992     return AMDGPU::S_CMP_LG_U32;
993   case CmpInst::ICMP_EQ:
994     return AMDGPU::S_CMP_EQ_U32;
995   case CmpInst::ICMP_SGT:
996     return AMDGPU::S_CMP_GT_I32;
997   case CmpInst::ICMP_SGE:
998     return AMDGPU::S_CMP_GE_I32;
999   case CmpInst::ICMP_SLT:
1000     return AMDGPU::S_CMP_LT_I32;
1001   case CmpInst::ICMP_SLE:
1002     return AMDGPU::S_CMP_LE_I32;
1003   case CmpInst::ICMP_UGT:
1004     return AMDGPU::S_CMP_GT_U32;
1005   case CmpInst::ICMP_UGE:
1006     return AMDGPU::S_CMP_GE_U32;
1007   case CmpInst::ICMP_ULT:
1008     return AMDGPU::S_CMP_LT_U32;
1009   case CmpInst::ICMP_ULE:
1010     return AMDGPU::S_CMP_LE_U32;
1011   default:
1012     llvm_unreachable("Unknown condition code!");
1013   }
1014 }
1015 
1016 bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const {
1017   MachineBasicBlock *BB = I.getParent();
1018   const DebugLoc &DL = I.getDebugLoc();
1019 
1020   Register SrcReg = I.getOperand(2).getReg();
1021   unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1022 
1023   auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
1024 
1025   Register CCReg = I.getOperand(0).getReg();
1026   if (!isVCC(CCReg, *MRI)) {
1027     int Opcode = getS_CMPOpcode(Pred, Size);
1028     if (Opcode == -1)
1029       return false;
1030     MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
1031             .add(I.getOperand(2))
1032             .add(I.getOperand(3));
1033     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
1034       .addReg(AMDGPU::SCC);
1035     bool Ret =
1036         constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
1037         RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
1038     I.eraseFromParent();
1039     return Ret;
1040   }
1041 
1042   int Opcode = getV_CMPOpcode(Pred, Size);
1043   if (Opcode == -1)
1044     return false;
1045 
1046   MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode),
1047             I.getOperand(0).getReg())
1048             .add(I.getOperand(2))
1049             .add(I.getOperand(3));
1050   RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
1051                                *TRI.getBoolRC(), *MRI);
1052   bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1053   I.eraseFromParent();
1054   return Ret;
1055 }
1056 
1057 bool AMDGPUInstructionSelector::selectIntrinsicIcmp(MachineInstr &I) const {
1058   Register Dst = I.getOperand(0).getReg();
1059   if (isVCC(Dst, *MRI))
1060     return false;
1061 
1062   if (MRI->getType(Dst).getSizeInBits() != STI.getWavefrontSize())
1063     return false;
1064 
1065   MachineBasicBlock *BB = I.getParent();
1066   const DebugLoc &DL = I.getDebugLoc();
1067   Register SrcReg = I.getOperand(2).getReg();
1068   unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1069   auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
1070 
1071   int Opcode = getV_CMPOpcode(Pred, Size);
1072   if (Opcode == -1)
1073     return false;
1074 
1075   MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst)
1076                            .add(I.getOperand(2))
1077                            .add(I.getOperand(3));
1078   RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(), *TRI.getBoolRC(),
1079                                *MRI);
1080   bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1081   I.eraseFromParent();
1082   return Ret;
1083 }
1084 
1085 bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1086   MachineBasicBlock *BB = I.getParent();
1087   const DebugLoc &DL = I.getDebugLoc();
1088   Register DstReg = I.getOperand(0).getReg();
1089   const unsigned Size = MRI->getType(DstReg).getSizeInBits();
1090   const bool Is64 = Size == 64;
1091 
1092   if (Size != STI.getWavefrontSize())
1093     return false;
1094 
1095   Optional<ValueAndVReg> Arg =
1096       getConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI, true);
1097 
1098   if (Arg.hasValue()) {
1099     const int64_t Value = Arg.getValue().Value;
1100     if (Value == 0) {
1101       unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1102       BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
1103     } else if (Value == -1) { // all ones
1104       Register SrcReg = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
1105       BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
1106     } else
1107       return false;
1108   } else {
1109     Register SrcReg = I.getOperand(2).getReg();
1110     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
1111   }
1112 
1113   I.eraseFromParent();
1114   return true;
1115 }
1116 
1117 bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1118   Register DstReg = I.getOperand(0).getReg();
1119   const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1120   const TargetRegisterClass *DstRC =
1121     TRI.getRegClassForSizeOnBank(32, *DstBank, *MRI);
1122   if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
1123     return false;
1124 
1125   const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1126 
1127   Module *M = MF->getFunction().getParent();
1128   const MDNode *Metadata = I.getOperand(2).getMetadata();
1129   auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
1130   auto RelocSymbol = cast<GlobalVariable>(
1131     M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
1132 
1133   MachineBasicBlock *BB = I.getParent();
1134   BuildMI(*BB, &I, I.getDebugLoc(),
1135           TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
1136     .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);
1137 
1138   I.eraseFromParent();
1139   return true;
1140 }
1141 
1142 bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
1143   Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();
1144 
1145   Register DstReg = I.getOperand(0).getReg();
1146   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1147   unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
1148     AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1149 
1150   MachineBasicBlock *MBB = I.getParent();
1151   const DebugLoc &DL = I.getDebugLoc();
1152 
1153   auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg);
1154 
1155   if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
1156     const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1157     MIB.addImm(MFI->getLDSSize());
1158   } else {
1159     Module *M = MF->getFunction().getParent();
1160     const GlobalValue *GV
1161       = Intrinsic::getDeclaration(M, Intrinsic::amdgcn_groupstaticsize);
1162     MIB.addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
1163   }
1164 
1165   I.eraseFromParent();
1166   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1167 }
1168 
1169 bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
1170   MachineBasicBlock *MBB = I.getParent();
1171   MachineFunction &MF = *MBB->getParent();
1172   const DebugLoc &DL = I.getDebugLoc();
1173 
1174   MachineOperand &Dst = I.getOperand(0);
1175   Register DstReg = Dst.getReg();
1176   unsigned Depth = I.getOperand(2).getImm();
1177 
1178   const TargetRegisterClass *RC
1179     = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
1180   if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
1181       !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
1182     return false;
1183 
1184   // Check for kernel and shader functions
1185   if (Depth != 0 ||
1186       MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1187     BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
1188       .addImm(0);
1189     I.eraseFromParent();
1190     return true;
1191   }
1192 
1193   MachineFrameInfo &MFI = MF.getFrameInfo();
1194   // There is a call to @llvm.returnaddress in this function
1195   MFI.setReturnAddressIsTaken(true);
1196 
1197   // Get the return address reg and mark it as an implicit live-in
1198   Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
1199   Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
1200                                              AMDGPU::SReg_64RegClass);
1201   BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1202     .addReg(LiveIn);
1203   I.eraseFromParent();
1204   return true;
1205 }
1206 
1207 bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1208   // FIXME: Manually selecting to avoid dealiing with the SReg_1 trick
1209   // SelectionDAG uses for wave32 vs wave64.
1210   MachineBasicBlock *BB = MI.getParent();
1211   BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1212       .add(MI.getOperand(1));
1213 
1214   Register Reg = MI.getOperand(1).getReg();
1215   MI.eraseFromParent();
1216 
1217   if (!MRI->getRegClassOrNull(Reg))
1218     MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1219   return true;
1220 }
1221 
1222 bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1223   MachineInstr &MI, Intrinsic::ID IntrID) const {
1224   MachineBasicBlock *MBB = MI.getParent();
1225   MachineFunction *MF = MBB->getParent();
1226   const DebugLoc &DL = MI.getDebugLoc();
1227 
1228   unsigned IndexOperand = MI.getOperand(7).getImm();
1229   bool WaveRelease = MI.getOperand(8).getImm() != 0;
1230   bool WaveDone = MI.getOperand(9).getImm() != 0;
1231 
1232   if (WaveDone && !WaveRelease)
1233     report_fatal_error("ds_ordered_count: wave_done requires wave_release");
1234 
1235   unsigned OrderedCountIndex = IndexOperand & 0x3f;
1236   IndexOperand &= ~0x3f;
1237   unsigned CountDw = 0;
1238 
1239   if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
1240     CountDw = (IndexOperand >> 24) & 0xf;
1241     IndexOperand &= ~(0xf << 24);
1242 
1243     if (CountDw < 1 || CountDw > 4) {
1244       report_fatal_error(
1245         "ds_ordered_count: dword count must be between 1 and 4");
1246     }
1247   }
1248 
1249   if (IndexOperand)
1250     report_fatal_error("ds_ordered_count: bad index operand");
1251 
1252   unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1253   unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);
1254 
1255   unsigned Offset0 = OrderedCountIndex << 2;
1256   unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) |
1257                      (Instruction << 4);
1258 
1259   if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
1260     Offset1 |= (CountDw - 1) << 6;
1261 
1262   unsigned Offset = Offset0 | (Offset1 << 8);
1263 
1264   Register M0Val = MI.getOperand(2).getReg();
1265   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1266     .addReg(M0Val);
1267 
1268   Register DstReg = MI.getOperand(0).getReg();
1269   Register ValReg = MI.getOperand(3).getReg();
1270   MachineInstrBuilder DS =
1271     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
1272       .addReg(ValReg)
1273       .addImm(Offset)
1274       .cloneMemRefs(MI);
1275 
1276   if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
1277     return false;
1278 
1279   bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
1280   MI.eraseFromParent();
1281   return Ret;
1282 }
1283 
1284 static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1285   switch (IntrID) {
1286   case Intrinsic::amdgcn_ds_gws_init:
1287     return AMDGPU::DS_GWS_INIT;
1288   case Intrinsic::amdgcn_ds_gws_barrier:
1289     return AMDGPU::DS_GWS_BARRIER;
1290   case Intrinsic::amdgcn_ds_gws_sema_v:
1291     return AMDGPU::DS_GWS_SEMA_V;
1292   case Intrinsic::amdgcn_ds_gws_sema_br:
1293     return AMDGPU::DS_GWS_SEMA_BR;
1294   case Intrinsic::amdgcn_ds_gws_sema_p:
1295     return AMDGPU::DS_GWS_SEMA_P;
1296   case Intrinsic::amdgcn_ds_gws_sema_release_all:
1297     return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1298   default:
1299     llvm_unreachable("not a gws intrinsic");
1300   }
1301 }
1302 
1303 bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
1304                                                      Intrinsic::ID IID) const {
1305   if (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1306       !STI.hasGWSSemaReleaseAll())
1307     return false;
1308 
1309   // intrinsic ID, vsrc, offset
1310   const bool HasVSrc = MI.getNumOperands() == 3;
1311   assert(HasVSrc || MI.getNumOperands() == 2);
1312 
1313   Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
1314   const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
1315   if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
1316     return false;
1317 
1318   MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1319   assert(OffsetDef);
1320 
1321   unsigned ImmOffset;
1322 
1323   MachineBasicBlock *MBB = MI.getParent();
1324   const DebugLoc &DL = MI.getDebugLoc();
1325 
1326   MachineInstr *Readfirstlane = nullptr;
1327 
1328   // If we legalized the VGPR input, strip out the readfirstlane to analyze the
1329   // incoming offset, in case there's an add of a constant. We'll have to put it
1330   // back later.
1331   if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
1332     Readfirstlane = OffsetDef;
1333     BaseOffset = OffsetDef->getOperand(1).getReg();
1334     OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1335   }
1336 
1337   if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
1338     // If we have a constant offset, try to use the 0 in m0 as the base.
1339     // TODO: Look into changing the default m0 initialization value. If the
1340     // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
1341     // the immediate offset.
1342 
1343     ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
1344     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1345       .addImm(0);
1346   } else {
1347     std::tie(BaseOffset, ImmOffset, OffsetDef)
1348       = AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset);
1349 
1350     if (Readfirstlane) {
1351       // We have the constant offset now, so put the readfirstlane back on the
1352       // variable component.
1353       if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
1354         return false;
1355 
1356       Readfirstlane->getOperand(1).setReg(BaseOffset);
1357       BaseOffset = Readfirstlane->getOperand(0).getReg();
1358     } else {
1359       if (!RBI.constrainGenericRegister(BaseOffset,
1360                                         AMDGPU::SReg_32RegClass, *MRI))
1361         return false;
1362     }
1363 
1364     Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1365     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
1366       .addReg(BaseOffset)
1367       .addImm(16);
1368 
1369     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1370       .addReg(M0Base);
1371   }
1372 
1373   // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
1374   // offset field) % 64. Some versions of the programming guide omit the m0
1375   // part, or claim it's from offset 0.
1376   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));
1377 
1378   if (HasVSrc) {
1379     Register VSrc = MI.getOperand(1).getReg();
1380     MIB.addReg(VSrc);
1381     if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
1382       return false;
1383   }
1384 
1385   MIB.addImm(ImmOffset)
1386      .addImm(-1) // $gds
1387      .cloneMemRefs(MI);
1388 
1389   MI.eraseFromParent();
1390   return true;
1391 }
1392 
1393 bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
1394                                                       bool IsAppend) const {
1395   Register PtrBase = MI.getOperand(2).getReg();
1396   LLT PtrTy = MRI->getType(PtrBase);
1397   bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
1398 
1399   unsigned Offset;
1400   std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
1401 
1402   // TODO: Should this try to look through readfirstlane like GWS?
1403   if (!isDSOffsetLegal(PtrBase, Offset, 16)) {
1404     PtrBase = MI.getOperand(2).getReg();
1405     Offset = 0;
1406   }
1407 
1408   MachineBasicBlock *MBB = MI.getParent();
1409   const DebugLoc &DL = MI.getDebugLoc();
1410   const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
1411 
1412   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1413     .addReg(PtrBase);
1414   if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
1415     return false;
1416 
1417   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
1418     .addImm(Offset)
1419     .addImm(IsGDS ? -1 : 0)
1420     .cloneMemRefs(MI);
1421   MI.eraseFromParent();
1422   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1423 }
1424 
1425 bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
1426   if (TM.getOptLevel() > CodeGenOpt::None) {
1427     unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second;
1428     if (WGSize <= STI.getWavefrontSize()) {
1429       MachineBasicBlock *MBB = MI.getParent();
1430       const DebugLoc &DL = MI.getDebugLoc();
1431       BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::WAVE_BARRIER));
1432       MI.eraseFromParent();
1433       return true;
1434     }
1435   }
1436   return selectImpl(MI, *CoverageInfo);
1437 }
1438 
1439 static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
1440                          bool &IsTexFail) {
1441   if (TexFailCtrl)
1442     IsTexFail = true;
1443 
1444   TFE = (TexFailCtrl & 0x1) ? 1 : 0;
1445   TexFailCtrl &= ~(uint64_t)0x1;
1446   LWE = (TexFailCtrl & 0x2) ? 1 : 0;
1447   TexFailCtrl &= ~(uint64_t)0x2;
1448 
1449   return TexFailCtrl == 0;
1450 }
1451 
1452 static bool parseCachePolicy(uint64_t Value,
1453                              bool *GLC, bool *SLC, bool *DLC) {
1454   if (GLC) {
1455     *GLC = (Value & 0x1) ? 1 : 0;
1456     Value &= ~(uint64_t)0x1;
1457   }
1458   if (SLC) {
1459     *SLC = (Value & 0x2) ? 1 : 0;
1460     Value &= ~(uint64_t)0x2;
1461   }
1462   if (DLC) {
1463     *DLC = (Value & 0x4) ? 1 : 0;
1464     Value &= ~(uint64_t)0x4;
1465   }
1466 
1467   return Value == 0;
1468 }
1469 
1470 bool AMDGPUInstructionSelector::selectImageIntrinsic(
1471   MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
1472   MachineBasicBlock *MBB = MI.getParent();
1473   const DebugLoc &DL = MI.getDebugLoc();
1474 
1475   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
1476     AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1477 
1478   const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
1479   const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
1480       AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
1481   const AMDGPU::MIMGMIPMappingInfo *MIPMappingInfo =
1482       AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode);
1483   unsigned IntrOpcode = Intr->BaseOpcode;
1484   const bool IsGFX10 = STI.getGeneration() >= AMDGPUSubtarget::GFX10;
1485 
1486   const int VAddrIdx = getImageVAddrIdxBegin(BaseOpcode,
1487                                              MI.getNumExplicitDefs());
1488   int NumVAddr, NumGradients;
1489   std::tie(NumVAddr, NumGradients) = getImageNumVAddr(Intr, BaseOpcode);
1490 
1491   Register VDataIn, VDataOut;
1492   LLT VDataTy;
1493   int NumVDataDwords = -1;
1494   bool IsD16 = false;
1495 
1496   // XXX - Can we just get the second to last argument for ctrl?
1497   unsigned CtrlIdx; // Index of texfailctrl argument
1498   bool Unorm;
1499   if (!BaseOpcode->Sampler) {
1500     Unorm = true;
1501     CtrlIdx = VAddrIdx + NumVAddr + 1;
1502   } else {
1503     Unorm = MI.getOperand(VAddrIdx + NumVAddr + 2).getImm() != 0;
1504     CtrlIdx = VAddrIdx + NumVAddr + 3;
1505   }
1506 
1507   bool TFE;
1508   bool LWE;
1509   bool IsTexFail = false;
1510   if (!parseTexFail(MI.getOperand(CtrlIdx).getImm(), TFE, LWE, IsTexFail))
1511     return false;
1512 
1513   const int Flags = MI.getOperand(CtrlIdx + 2).getImm();
1514   const bool IsA16 = (Flags & 1) != 0;
1515   const bool IsG16 = (Flags & 2) != 0;
1516 
1517   // A16 implies 16 bit gradients
1518   if (IsA16 && !IsG16)
1519     return false;
1520 
1521   unsigned DMask = 0;
1522   unsigned DMaskLanes = 0;
1523 
1524   if (BaseOpcode->Atomic) {
1525     VDataOut = MI.getOperand(0).getReg();
1526     VDataIn = MI.getOperand(2).getReg();
1527     LLT Ty = MRI->getType(VDataIn);
1528 
1529     // Be careful to allow atomic swap on 16-bit element vectors.
1530     const bool Is64Bit = BaseOpcode->AtomicX2 ?
1531       Ty.getSizeInBits() == 128 :
1532       Ty.getSizeInBits() == 64;
1533 
1534     if (BaseOpcode->AtomicX2) {
1535       assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
1536 
1537       DMask = Is64Bit ? 0xf : 0x3;
1538       NumVDataDwords = Is64Bit ? 4 : 2;
1539     } else {
1540       DMask = Is64Bit ? 0x3 : 0x1;
1541       NumVDataDwords = Is64Bit ? 2 : 1;
1542     }
1543   } else {
1544     const int DMaskIdx = 2; // Input/output + intrinsic ID.
1545 
1546     DMask = MI.getOperand(DMaskIdx).getImm();
1547     DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);
1548 
1549     if (BaseOpcode->Store) {
1550       VDataIn = MI.getOperand(1).getReg();
1551       VDataTy = MRI->getType(VDataIn);
1552       NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
1553     } else {
1554       VDataOut = MI.getOperand(0).getReg();
1555       VDataTy = MRI->getType(VDataOut);
1556       NumVDataDwords = DMaskLanes;
1557 
1558       // One memoperand is mandatory, except for getresinfo.
1559       // FIXME: Check this in verifier.
1560       if (!MI.memoperands_empty()) {
1561         const MachineMemOperand *MMO = *MI.memoperands_begin();
1562 
1563         // Infer d16 from the memory size, as the register type will be mangled by
1564         // unpacked subtargets, or by TFE.
1565         IsD16 = ((8 * MMO->getSize()) / DMaskLanes) < 32;
1566 
1567         if (IsD16 && !STI.hasUnpackedD16VMem())
1568           NumVDataDwords = (DMaskLanes + 1) / 2;
1569       }
1570     }
1571   }
1572 
1573   // Optimize _L to _LZ when _L is zero
1574   if (LZMappingInfo) {
1575     // The legalizer replaced the register with an immediate 0 if we need to
1576     // change the opcode.
1577     const MachineOperand &Lod = MI.getOperand(VAddrIdx + NumVAddr - 1);
1578     if (Lod.isImm()) {
1579       assert(Lod.getImm() == 0);
1580       IntrOpcode = LZMappingInfo->LZ;  // set new opcode to _lz variant of _l
1581     }
1582   }
1583 
1584   // Optimize _mip away, when 'lod' is zero
1585   if (MIPMappingInfo) {
1586     const MachineOperand &Lod = MI.getOperand(VAddrIdx + NumVAddr - 1);
1587     if (Lod.isImm()) {
1588       assert(Lod.getImm() == 0);
1589       IntrOpcode = MIPMappingInfo->NONMIP;  // set new opcode to variant without _mip
1590     }
1591   }
1592 
1593   // Set G16 opcode
1594   if (IsG16 && !IsA16) {
1595     const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
1596         AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
1597     assert(G16MappingInfo);
1598     IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
1599   }
1600 
1601   // TODO: Check this in verifier.
1602   assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
1603 
1604   bool GLC = false;
1605   bool SLC = false;
1606   bool DLC = false;
1607   if (BaseOpcode->Atomic) {
1608     GLC = true; // TODO no-return optimization
1609     if (!parseCachePolicy(MI.getOperand(CtrlIdx + 1).getImm(), nullptr, &SLC,
1610                           IsGFX10 ? &DLC : nullptr))
1611       return false;
1612   } else {
1613     if (!parseCachePolicy(MI.getOperand(CtrlIdx + 1).getImm(), &GLC, &SLC,
1614                           IsGFX10 ? &DLC : nullptr))
1615       return false;
1616   }
1617 
1618   int NumVAddrRegs = 0;
1619   int NumVAddrDwords = 0;
1620   for (int I = 0; I < NumVAddr; ++I) {
1621     // Skip the $noregs and 0s inserted during legalization.
1622     MachineOperand &AddrOp = MI.getOperand(VAddrIdx + I);
1623     if (!AddrOp.isReg())
1624       continue; // XXX - Break?
1625 
1626     Register Addr = AddrOp.getReg();
1627     if (!Addr)
1628       break;
1629 
1630     ++NumVAddrRegs;
1631     NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
1632   }
1633 
1634   // The legalizer preprocessed the intrinsic arguments. If we aren't using
1635   // NSA, these should have beeen packed into a single value in the first
1636   // address register
1637   const bool UseNSA = NumVAddrRegs != 1 && NumVAddrDwords == NumVAddrRegs;
1638   if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
1639     LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
1640     return false;
1641   }
1642 
1643   if (IsTexFail)
1644     ++NumVDataDwords;
1645 
1646   int Opcode = -1;
1647   if (IsGFX10) {
1648     Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
1649                                    UseNSA ? AMDGPU::MIMGEncGfx10NSA
1650                                           : AMDGPU::MIMGEncGfx10Default,
1651                                    NumVDataDwords, NumVAddrDwords);
1652   } else {
1653     if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
1654       Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
1655                                      NumVDataDwords, NumVAddrDwords);
1656     if (Opcode == -1)
1657       Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
1658                                      NumVDataDwords, NumVAddrDwords);
1659   }
1660   assert(Opcode != -1);
1661 
1662   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
1663     .cloneMemRefs(MI);
1664 
1665   if (VDataOut) {
1666     if (BaseOpcode->AtomicX2) {
1667       const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
1668 
1669       Register TmpReg = MRI->createVirtualRegister(
1670         Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
1671       unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
1672 
1673       MIB.addDef(TmpReg);
1674       BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
1675         .addReg(TmpReg, RegState::Kill, SubReg);
1676 
1677     } else {
1678       MIB.addDef(VDataOut); // vdata output
1679     }
1680   }
1681 
1682   if (VDataIn)
1683     MIB.addReg(VDataIn); // vdata input
1684 
1685   for (int i = 0; i != NumVAddrRegs; ++i) {
1686     MachineOperand &SrcOp = MI.getOperand(VAddrIdx + i);
1687     if (SrcOp.isReg()) {
1688       assert(SrcOp.getReg() != 0);
1689       MIB.addReg(SrcOp.getReg());
1690     }
1691   }
1692 
1693   MIB.addReg(MI.getOperand(VAddrIdx + NumVAddr).getReg()); // rsrc
1694   if (BaseOpcode->Sampler)
1695     MIB.addReg(MI.getOperand(VAddrIdx + NumVAddr + 1).getReg()); // sampler
1696 
1697   MIB.addImm(DMask); // dmask
1698 
1699   if (IsGFX10)
1700     MIB.addImm(DimInfo->Encoding);
1701   MIB.addImm(Unorm);
1702   if (IsGFX10)
1703     MIB.addImm(DLC);
1704 
1705   MIB.addImm(GLC);
1706   MIB.addImm(SLC);
1707   MIB.addImm(IsA16 &&  // a16 or r128
1708              STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
1709   if (IsGFX10)
1710     MIB.addImm(IsA16 ? -1 : 0);
1711 
1712   MIB.addImm(TFE); // tfe
1713   MIB.addImm(LWE); // lwe
1714   if (!IsGFX10)
1715     MIB.addImm(DimInfo->DA ? -1 : 0);
1716   if (BaseOpcode->HasD16)
1717     MIB.addImm(IsD16 ? -1 : 0);
1718 
1719   MI.eraseFromParent();
1720   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1721 }
1722 
1723 bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
1724     MachineInstr &I) const {
1725   unsigned IntrinsicID = I.getIntrinsicID();
1726   switch (IntrinsicID) {
1727   case Intrinsic::amdgcn_end_cf:
1728     return selectEndCfIntrinsic(I);
1729   case Intrinsic::amdgcn_ds_ordered_add:
1730   case Intrinsic::amdgcn_ds_ordered_swap:
1731     return selectDSOrderedIntrinsic(I, IntrinsicID);
1732   case Intrinsic::amdgcn_ds_gws_init:
1733   case Intrinsic::amdgcn_ds_gws_barrier:
1734   case Intrinsic::amdgcn_ds_gws_sema_v:
1735   case Intrinsic::amdgcn_ds_gws_sema_br:
1736   case Intrinsic::amdgcn_ds_gws_sema_p:
1737   case Intrinsic::amdgcn_ds_gws_sema_release_all:
1738     return selectDSGWSIntrinsic(I, IntrinsicID);
1739   case Intrinsic::amdgcn_ds_append:
1740     return selectDSAppendConsume(I, true);
1741   case Intrinsic::amdgcn_ds_consume:
1742     return selectDSAppendConsume(I, false);
1743   case Intrinsic::amdgcn_s_barrier:
1744     return selectSBarrier(I);
1745   default: {
1746     return selectImpl(I, *CoverageInfo);
1747   }
1748   }
1749 }
1750 
1751 bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
1752   if (selectImpl(I, *CoverageInfo))
1753     return true;
1754 
1755   MachineBasicBlock *BB = I.getParent();
1756   const DebugLoc &DL = I.getDebugLoc();
1757 
1758   Register DstReg = I.getOperand(0).getReg();
1759   unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
1760   assert(Size <= 32 || Size == 64);
1761   const MachineOperand &CCOp = I.getOperand(1);
1762   Register CCReg = CCOp.getReg();
1763   if (!isVCC(CCReg, *MRI)) {
1764     unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
1765                                          AMDGPU::S_CSELECT_B32;
1766     MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
1767             .addReg(CCReg);
1768 
1769     // The generic constrainSelectedInstRegOperands doesn't work for the scc register
1770     // bank, because it does not cover the register class that we used to represent
1771     // for it.  So we need to manually set the register class here.
1772     if (!MRI->getRegClassOrNull(CCReg))
1773         MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
1774     MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
1775             .add(I.getOperand(2))
1776             .add(I.getOperand(3));
1777 
1778     bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI) |
1779                constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
1780     I.eraseFromParent();
1781     return Ret;
1782   }
1783 
1784   // Wide VGPR select should have been split in RegBankSelect.
1785   if (Size > 32)
1786     return false;
1787 
1788   MachineInstr *Select =
1789       BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1790               .addImm(0)
1791               .add(I.getOperand(3))
1792               .addImm(0)
1793               .add(I.getOperand(2))
1794               .add(I.getOperand(1));
1795 
1796   bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
1797   I.eraseFromParent();
1798   return Ret;
1799 }
1800 
1801 static int sizeToSubRegIndex(unsigned Size) {
1802   switch (Size) {
1803   case 32:
1804     return AMDGPU::sub0;
1805   case 64:
1806     return AMDGPU::sub0_sub1;
1807   case 96:
1808     return AMDGPU::sub0_sub1_sub2;
1809   case 128:
1810     return AMDGPU::sub0_sub1_sub2_sub3;
1811   case 256:
1812     return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
1813   default:
1814     if (Size < 32)
1815       return AMDGPU::sub0;
1816     if (Size > 256)
1817       return -1;
1818     return sizeToSubRegIndex(PowerOf2Ceil(Size));
1819   }
1820 }
1821 
1822 bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
1823   Register DstReg = I.getOperand(0).getReg();
1824   Register SrcReg = I.getOperand(1).getReg();
1825   const LLT DstTy = MRI->getType(DstReg);
1826   const LLT SrcTy = MRI->getType(SrcReg);
1827   const LLT S1 = LLT::scalar(1);
1828 
1829   const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
1830   const RegisterBank *DstRB;
1831   if (DstTy == S1) {
1832     // This is a special case. We don't treat s1 for legalization artifacts as
1833     // vcc booleans.
1834     DstRB = SrcRB;
1835   } else {
1836     DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1837     if (SrcRB != DstRB)
1838       return false;
1839   }
1840 
1841   const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
1842 
1843   unsigned DstSize = DstTy.getSizeInBits();
1844   unsigned SrcSize = SrcTy.getSizeInBits();
1845 
1846   const TargetRegisterClass *SrcRC
1847     = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, *MRI);
1848   const TargetRegisterClass *DstRC
1849     = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, *MRI);
1850   if (!SrcRC || !DstRC)
1851     return false;
1852 
1853   if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
1854       !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
1855     LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
1856     return false;
1857   }
1858 
1859   if (DstTy == LLT::vector(2, 16) && SrcTy == LLT::vector(2, 32)) {
1860     MachineBasicBlock *MBB = I.getParent();
1861     const DebugLoc &DL = I.getDebugLoc();
1862 
1863     Register LoReg = MRI->createVirtualRegister(DstRC);
1864     Register HiReg = MRI->createVirtualRegister(DstRC);
1865     BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
1866       .addReg(SrcReg, 0, AMDGPU::sub0);
1867     BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
1868       .addReg(SrcReg, 0, AMDGPU::sub1);
1869 
1870     if (IsVALU && STI.hasSDWA()) {
1871       // Write the low 16-bits of the high element into the high 16-bits of the
1872       // low element.
1873       MachineInstr *MovSDWA =
1874         BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
1875         .addImm(0)                             // $src0_modifiers
1876         .addReg(HiReg)                         // $src0
1877         .addImm(0)                             // $clamp
1878         .addImm(AMDGPU::SDWA::WORD_1)          // $dst_sel
1879         .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
1880         .addImm(AMDGPU::SDWA::WORD_0)          // $src0_sel
1881         .addReg(LoReg, RegState::Implicit);
1882       MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
1883     } else {
1884       Register TmpReg0 = MRI->createVirtualRegister(DstRC);
1885       Register TmpReg1 = MRI->createVirtualRegister(DstRC);
1886       Register ImmReg = MRI->createVirtualRegister(DstRC);
1887       if (IsVALU) {
1888         BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
1889           .addImm(16)
1890           .addReg(HiReg);
1891       } else {
1892         BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
1893           .addReg(HiReg)
1894           .addImm(16);
1895       }
1896 
1897       unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
1898       unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
1899       unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
1900 
1901       BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
1902         .addImm(0xffff);
1903       BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
1904         .addReg(LoReg)
1905         .addReg(ImmReg);
1906       BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
1907         .addReg(TmpReg0)
1908         .addReg(TmpReg1);
1909     }
1910 
1911     I.eraseFromParent();
1912     return true;
1913   }
1914 
1915   if (!DstTy.isScalar())
1916     return false;
1917 
1918   if (SrcSize > 32) {
1919     int SubRegIdx = sizeToSubRegIndex(DstSize);
1920     if (SubRegIdx == -1)
1921       return false;
1922 
1923     // Deal with weird cases where the class only partially supports the subreg
1924     // index.
1925     const TargetRegisterClass *SrcWithSubRC
1926       = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
1927     if (!SrcWithSubRC)
1928       return false;
1929 
1930     if (SrcWithSubRC != SrcRC) {
1931       if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
1932         return false;
1933     }
1934 
1935     I.getOperand(1).setSubReg(SubRegIdx);
1936   }
1937 
1938   I.setDesc(TII.get(TargetOpcode::COPY));
1939   return true;
1940 }
1941 
1942 /// \returns true if a bitmask for \p Size bits will be an inline immediate.
1943 static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
1944   Mask = maskTrailingOnes<unsigned>(Size);
1945   int SignedMask = static_cast<int>(Mask);
1946   return SignedMask >= -16 && SignedMask <= 64;
1947 }
1948 
1949 // Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
1950 const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
1951   Register Reg, const MachineRegisterInfo &MRI,
1952   const TargetRegisterInfo &TRI) const {
1953   const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
1954   if (auto *RB = RegClassOrBank.dyn_cast<const RegisterBank *>())
1955     return RB;
1956 
1957   // Ignore the type, since we don't use vcc in artifacts.
1958   if (auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>())
1959     return &RBI.getRegBankFromRegClass(*RC, LLT());
1960   return nullptr;
1961 }
1962 
1963 bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
1964   bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
1965   bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
1966   const DebugLoc &DL = I.getDebugLoc();
1967   MachineBasicBlock &MBB = *I.getParent();
1968   const Register DstReg = I.getOperand(0).getReg();
1969   const Register SrcReg = I.getOperand(1).getReg();
1970 
1971   const LLT DstTy = MRI->getType(DstReg);
1972   const LLT SrcTy = MRI->getType(SrcReg);
1973   const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
1974     I.getOperand(2).getImm() : SrcTy.getSizeInBits();
1975   const unsigned DstSize = DstTy.getSizeInBits();
1976   if (!DstTy.isScalar())
1977     return false;
1978 
1979   // Artifact casts should never use vcc.
1980   const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
1981 
1982   // FIXME: This should probably be illegal and split earlier.
1983   if (I.getOpcode() == AMDGPU::G_ANYEXT) {
1984     if (DstSize <= 32)
1985       return selectCOPY(I);
1986 
1987     const TargetRegisterClass *SrcRC =
1988         TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank, *MRI);
1989     const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1990     const TargetRegisterClass *DstRC =
1991         TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
1992 
1993     Register UndefReg = MRI->createVirtualRegister(SrcRC);
1994     BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
1995     BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
1996       .addReg(SrcReg)
1997       .addImm(AMDGPU::sub0)
1998       .addReg(UndefReg)
1999       .addImm(AMDGPU::sub1);
2000     I.eraseFromParent();
2001 
2002     return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
2003            RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
2004   }
2005 
2006   if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
2007     // 64-bit should have been split up in RegBankSelect
2008 
2009     // Try to use an and with a mask if it will save code size.
2010     unsigned Mask;
2011     if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2012       MachineInstr *ExtI =
2013       BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
2014         .addImm(Mask)
2015         .addReg(SrcReg);
2016       I.eraseFromParent();
2017       return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2018     }
2019 
2020     const unsigned BFE = Signed ? AMDGPU::V_BFE_I32 : AMDGPU::V_BFE_U32;
2021     MachineInstr *ExtI =
2022       BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
2023       .addReg(SrcReg)
2024       .addImm(0) // Offset
2025       .addImm(SrcSize); // Width
2026     I.eraseFromParent();
2027     return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2028   }
2029 
2030   if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
2031     const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
2032       AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
2033     if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
2034       return false;
2035 
2036     if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
2037       const unsigned SextOpc = SrcSize == 8 ?
2038         AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
2039       BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
2040         .addReg(SrcReg);
2041       I.eraseFromParent();
2042       return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2043     }
2044 
2045     const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
2046     const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2047 
2048     // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width.
2049     if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
2050       // We need a 64-bit register source, but the high bits don't matter.
2051       Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
2052       Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2053       unsigned SubReg = InReg ? AMDGPU::sub0 : 0;
2054 
2055       BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2056       BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
2057         .addReg(SrcReg, 0, SubReg)
2058         .addImm(AMDGPU::sub0)
2059         .addReg(UndefReg)
2060         .addImm(AMDGPU::sub1);
2061 
2062       BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
2063         .addReg(ExtReg)
2064         .addImm(SrcSize << 16);
2065 
2066       I.eraseFromParent();
2067       return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
2068     }
2069 
2070     unsigned Mask;
2071     if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2072       BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
2073         .addReg(SrcReg)
2074         .addImm(Mask);
2075     } else {
2076       BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
2077         .addReg(SrcReg)
2078         .addImm(SrcSize << 16);
2079     }
2080 
2081     I.eraseFromParent();
2082     return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2083   }
2084 
2085   return false;
2086 }
2087 
2088 bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
2089   MachineBasicBlock *BB = I.getParent();
2090   MachineOperand &ImmOp = I.getOperand(1);
2091   Register DstReg = I.getOperand(0).getReg();
2092   unsigned Size = MRI->getType(DstReg).getSizeInBits();
2093 
2094   // The AMDGPU backend only supports Imm operands and not CImm or FPImm.
2095   if (ImmOp.isFPImm()) {
2096     const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt();
2097     ImmOp.ChangeToImmediate(Imm.getZExtValue());
2098   } else if (ImmOp.isCImm()) {
2099     ImmOp.ChangeToImmediate(ImmOp.getCImm()->getSExtValue());
2100   } else {
2101     llvm_unreachable("Not supported by g_constants");
2102   }
2103 
2104   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2105   const bool IsSgpr = DstRB->getID() == AMDGPU::SGPRRegBankID;
2106 
2107   unsigned Opcode;
2108   if (DstRB->getID() == AMDGPU::VCCRegBankID) {
2109     Opcode = STI.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
2110   } else {
2111     Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
2112 
2113     // We should never produce s1 values on banks other than VCC. If the user of
2114     // this already constrained the register, we may incorrectly think it's VCC
2115     // if it wasn't originally.
2116     if (Size == 1)
2117       return false;
2118   }
2119 
2120   if (Size != 64) {
2121     I.setDesc(TII.get(Opcode));
2122     I.addImplicitDefUseOperands(*MF);
2123     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2124   }
2125 
2126   const DebugLoc &DL = I.getDebugLoc();
2127 
2128   APInt Imm(Size, I.getOperand(1).getImm());
2129 
2130   MachineInstr *ResInst;
2131   if (IsSgpr && TII.isInlineConstant(Imm)) {
2132     ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
2133       .addImm(I.getOperand(1).getImm());
2134   } else {
2135     const TargetRegisterClass *RC = IsSgpr ?
2136       &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass;
2137     Register LoReg = MRI->createVirtualRegister(RC);
2138     Register HiReg = MRI->createVirtualRegister(RC);
2139 
2140     BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg)
2141       .addImm(Imm.trunc(32).getZExtValue());
2142 
2143     BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg)
2144       .addImm(Imm.ashr(32).getZExtValue());
2145 
2146     ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2147       .addReg(LoReg)
2148       .addImm(AMDGPU::sub0)
2149       .addReg(HiReg)
2150       .addImm(AMDGPU::sub1);
2151   }
2152 
2153   // We can't call constrainSelectedInstRegOperands here, because it doesn't
2154   // work for target independent opcodes
2155   I.eraseFromParent();
2156   const TargetRegisterClass *DstRC =
2157     TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI);
2158   if (!DstRC)
2159     return true;
2160   return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI);
2161 }
2162 
2163 bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2164   // Only manually handle the f64 SGPR case.
2165   //
2166   // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2167   // the bit ops theoretically have a second result due to the implicit def of
2168   // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2169   // that is easy by disabling the check. The result works, but uses a
2170   // nonsensical sreg32orlds_and_sreg_1 regclass.
2171   //
2172   // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to
2173   // the variadic REG_SEQUENCE operands.
2174 
2175   Register Dst = MI.getOperand(0).getReg();
2176   const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2177   if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2178       MRI->getType(Dst) != LLT::scalar(64))
2179     return false;
2180 
2181   Register Src = MI.getOperand(1).getReg();
2182   MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
2183   if (Fabs)
2184     Src = Fabs->getOperand(1).getReg();
2185 
2186   if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2187       !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2188     return false;
2189 
2190   MachineBasicBlock *BB = MI.getParent();
2191   const DebugLoc &DL = MI.getDebugLoc();
2192   Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2193   Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2194   Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2195   Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2196 
2197   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2198     .addReg(Src, 0, AMDGPU::sub0);
2199   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2200     .addReg(Src, 0, AMDGPU::sub1);
2201   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2202     .addImm(0x80000000);
2203 
2204   // Set or toggle sign bit.
2205   unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2206   BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
2207     .addReg(HiReg)
2208     .addReg(ConstReg);
2209   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2210     .addReg(LoReg)
2211     .addImm(AMDGPU::sub0)
2212     .addReg(OpReg)
2213     .addImm(AMDGPU::sub1);
2214   MI.eraseFromParent();
2215   return true;
2216 }
2217 
2218 // FIXME: This is a workaround for the same tablegen problems as G_FNEG
2219 bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2220   Register Dst = MI.getOperand(0).getReg();
2221   const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2222   if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2223       MRI->getType(Dst) != LLT::scalar(64))
2224     return false;
2225 
2226   Register Src = MI.getOperand(1).getReg();
2227   MachineBasicBlock *BB = MI.getParent();
2228   const DebugLoc &DL = MI.getDebugLoc();
2229   Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2230   Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2231   Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2232   Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2233 
2234   if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2235       !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2236     return false;
2237 
2238   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2239     .addReg(Src, 0, AMDGPU::sub0);
2240   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2241     .addReg(Src, 0, AMDGPU::sub1);
2242   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2243     .addImm(0x7fffffff);
2244 
2245   // Clear sign bit.
2246   // TODO: Should this used S_BITSET0_*?
2247   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
2248     .addReg(HiReg)
2249     .addReg(ConstReg);
2250   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2251     .addReg(LoReg)
2252     .addImm(AMDGPU::sub0)
2253     .addReg(OpReg)
2254     .addImm(AMDGPU::sub1);
2255 
2256   MI.eraseFromParent();
2257   return true;
2258 }
2259 
2260 static bool isConstant(const MachineInstr &MI) {
2261   return MI.getOpcode() == TargetOpcode::G_CONSTANT;
2262 }
2263 
2264 void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
2265     const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
2266 
2267   const MachineInstr *PtrMI = MRI.getUniqueVRegDef(Load.getOperand(1).getReg());
2268 
2269   assert(PtrMI);
2270 
2271   if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
2272     return;
2273 
2274   GEPInfo GEPInfo(*PtrMI);
2275 
2276   for (unsigned i = 1; i != 3; ++i) {
2277     const MachineOperand &GEPOp = PtrMI->getOperand(i);
2278     const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
2279     assert(OpDef);
2280     if (i == 2 && isConstant(*OpDef)) {
2281       // TODO: Could handle constant base + variable offset, but a combine
2282       // probably should have commuted it.
2283       assert(GEPInfo.Imm == 0);
2284       GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
2285       continue;
2286     }
2287     const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
2288     if (OpBank->getID() == AMDGPU::SGPRRegBankID)
2289       GEPInfo.SgprParts.push_back(GEPOp.getReg());
2290     else
2291       GEPInfo.VgprParts.push_back(GEPOp.getReg());
2292   }
2293 
2294   AddrInfo.push_back(GEPInfo);
2295   getAddrModeInfo(*PtrMI, MRI, AddrInfo);
2296 }
2297 
2298 bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
2299   if (!MI.hasOneMemOperand())
2300     return false;
2301 
2302   const MachineMemOperand *MMO = *MI.memoperands_begin();
2303   const Value *Ptr = MMO->getValue();
2304 
2305   // UndefValue means this is a load of a kernel input.  These are uniform.
2306   // Sometimes LDS instructions have constant pointers.
2307   // If Ptr is null, then that means this mem operand contains a
2308   // PseudoSourceValue like GOT.
2309   if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
2310       isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
2311     return true;
2312 
2313   if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
2314     return true;
2315 
2316   const Instruction *I = dyn_cast<Instruction>(Ptr);
2317   return I && I->getMetadata("amdgpu.uniform");
2318 }
2319 
2320 bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
2321   for (const GEPInfo &GEPInfo : AddrInfo) {
2322     if (!GEPInfo.VgprParts.empty())
2323       return true;
2324   }
2325   return false;
2326 }
2327 
2328 void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
2329   const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
2330   unsigned AS = PtrTy.getAddressSpace();
2331   if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
2332       STI.ldsRequiresM0Init()) {
2333     MachineBasicBlock *BB = I.getParent();
2334 
2335     // If DS instructions require M0 initializtion, insert it before selecting.
2336     BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2337       .addImm(-1);
2338   }
2339 }
2340 
2341 bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
2342   MachineInstr &I) const {
2343   initM0(I);
2344   return selectImpl(I, *CoverageInfo);
2345 }
2346 
2347 // TODO: No rtn optimization.
2348 bool AMDGPUInstructionSelector::selectG_AMDGPU_ATOMIC_CMPXCHG(
2349   MachineInstr &MI) const {
2350   Register PtrReg = MI.getOperand(1).getReg();
2351   const LLT PtrTy = MRI->getType(PtrReg);
2352   if (PtrTy.getAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
2353       STI.useFlatForGlobal())
2354     return selectImpl(MI, *CoverageInfo);
2355 
2356   Register DstReg = MI.getOperand(0).getReg();
2357   const LLT Ty = MRI->getType(DstReg);
2358   const bool Is64 = Ty.getSizeInBits() == 64;
2359   const unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
2360   Register TmpReg = MRI->createVirtualRegister(
2361     Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
2362 
2363   const DebugLoc &DL = MI.getDebugLoc();
2364   MachineBasicBlock *BB = MI.getParent();
2365 
2366   Register VAddr, RSrcReg, SOffset;
2367   int64_t Offset = 0;
2368 
2369   unsigned Opcode;
2370   if (selectMUBUFOffsetImpl(MI.getOperand(1), RSrcReg, SOffset, Offset)) {
2371     Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN :
2372                              AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN;
2373   } else if (selectMUBUFAddr64Impl(MI.getOperand(1), VAddr,
2374                                    RSrcReg, SOffset, Offset)) {
2375     Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN :
2376                     AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN;
2377   } else
2378     return selectImpl(MI, *CoverageInfo);
2379 
2380   auto MIB = BuildMI(*BB, &MI, DL, TII.get(Opcode), TmpReg)
2381     .addReg(MI.getOperand(2).getReg());
2382 
2383   if (VAddr)
2384     MIB.addReg(VAddr);
2385 
2386   MIB.addReg(RSrcReg);
2387   if (SOffset)
2388     MIB.addReg(SOffset);
2389   else
2390     MIB.addImm(0);
2391 
2392   MIB.addImm(Offset);
2393   MIB.addImm(0); // slc
2394   MIB.cloneMemRefs(MI);
2395 
2396   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), DstReg)
2397     .addReg(TmpReg, RegState::Kill, SubReg);
2398 
2399   MI.eraseFromParent();
2400 
2401   MRI->setRegClass(
2402     DstReg, Is64 ? &AMDGPU::VReg_64RegClass : &AMDGPU::VGPR_32RegClass);
2403   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2404 }
2405 
2406 bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
2407   MachineBasicBlock *BB = I.getParent();
2408   MachineOperand &CondOp = I.getOperand(0);
2409   Register CondReg = CondOp.getReg();
2410   const DebugLoc &DL = I.getDebugLoc();
2411 
2412   unsigned BrOpcode;
2413   Register CondPhysReg;
2414   const TargetRegisterClass *ConstrainRC;
2415 
2416   // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
2417   // whether the branch is uniform when selecting the instruction. In
2418   // GlobalISel, we should push that decision into RegBankSelect. Assume for now
2419   // RegBankSelect knows what it's doing if the branch condition is scc, even
2420   // though it currently does not.
2421   if (!isVCC(CondReg, *MRI)) {
2422     if (MRI->getType(CondReg) != LLT::scalar(32))
2423       return false;
2424 
2425     CondPhysReg = AMDGPU::SCC;
2426     BrOpcode = AMDGPU::S_CBRANCH_SCC1;
2427     ConstrainRC = &AMDGPU::SReg_32RegClass;
2428   } else {
2429     // FIXME: Do we have to insert an and with exec here, like in SelectionDAG?
2430     // We sort of know that a VCC producer based on the register bank, that ands
2431     // inactive lanes with 0. What if there was a logical operation with vcc
2432     // producers in different blocks/with different exec masks?
2433     // FIXME: Should scc->vcc copies and with exec?
2434     CondPhysReg = TRI.getVCC();
2435     BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
2436     ConstrainRC = TRI.getBoolRC();
2437   }
2438 
2439   if (!MRI->getRegClassOrNull(CondReg))
2440     MRI->setRegClass(CondReg, ConstrainRC);
2441 
2442   BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
2443     .addReg(CondReg);
2444   BuildMI(*BB, &I, DL, TII.get(BrOpcode))
2445     .addMBB(I.getOperand(1).getMBB());
2446 
2447   I.eraseFromParent();
2448   return true;
2449 }
2450 
2451 bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
2452   MachineInstr &I) const {
2453   Register DstReg = I.getOperand(0).getReg();
2454   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2455   const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2456   I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
2457   if (IsVGPR)
2458     I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
2459 
2460   return RBI.constrainGenericRegister(
2461     DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
2462 }
2463 
2464 bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
2465   Register DstReg = I.getOperand(0).getReg();
2466   Register SrcReg = I.getOperand(1).getReg();
2467   Register MaskReg = I.getOperand(2).getReg();
2468   LLT Ty = MRI->getType(DstReg);
2469   LLT MaskTy = MRI->getType(MaskReg);
2470 
2471   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2472   const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2473   const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
2474   const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2475   if (DstRB != SrcRB) // Should only happen for hand written MIR.
2476     return false;
2477 
2478   unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2479   const TargetRegisterClass &RegRC
2480     = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
2481 
2482   const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB,
2483                                                                   *MRI);
2484   const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB,
2485                                                                   *MRI);
2486   const TargetRegisterClass *MaskRC =
2487       TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB, *MRI);
2488 
2489   if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
2490       !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2491       !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
2492     return false;
2493 
2494   MachineBasicBlock *BB = I.getParent();
2495   const DebugLoc &DL = I.getDebugLoc();
2496   if (Ty.getSizeInBits() == 32) {
2497     assert(MaskTy.getSizeInBits() == 32 &&
2498            "ptrmask should have been narrowed during legalize");
2499 
2500     BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
2501       .addReg(SrcReg)
2502       .addReg(MaskReg);
2503     I.eraseFromParent();
2504     return true;
2505   }
2506 
2507   Register HiReg = MRI->createVirtualRegister(&RegRC);
2508   Register LoReg = MRI->createVirtualRegister(&RegRC);
2509 
2510   // Extract the subregisters from the source pointer.
2511   BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
2512     .addReg(SrcReg, 0, AMDGPU::sub0);
2513   BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
2514     .addReg(SrcReg, 0, AMDGPU::sub1);
2515 
2516   Register MaskedLo, MaskedHi;
2517 
2518   // Try to avoid emitting a bit operation when we only need to touch half of
2519   // the 64-bit pointer.
2520   APInt MaskOnes = KnownBits->getKnownOnes(MaskReg).zextOrSelf(64);
2521 
2522   const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
2523   const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
2524   if ((MaskOnes & MaskLo32) == MaskLo32) {
2525     // If all the bits in the low half are 1, we only need a copy for it.
2526     MaskedLo = LoReg;
2527   } else {
2528     // Extract the mask subregister and apply the and.
2529     Register MaskLo = MRI->createVirtualRegister(&RegRC);
2530     MaskedLo = MRI->createVirtualRegister(&RegRC);
2531 
2532     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
2533       .addReg(MaskReg, 0, AMDGPU::sub0);
2534     BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
2535       .addReg(LoReg)
2536       .addReg(MaskLo);
2537   }
2538 
2539   if ((MaskOnes & MaskHi32) == MaskHi32) {
2540     // If all the bits in the high half are 1, we only need a copy for it.
2541     MaskedHi = HiReg;
2542   } else {
2543     Register MaskHi = MRI->createVirtualRegister(&RegRC);
2544     MaskedHi = MRI->createVirtualRegister(&RegRC);
2545 
2546     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
2547       .addReg(MaskReg, 0, AMDGPU::sub1);
2548     BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
2549       .addReg(HiReg)
2550       .addReg(MaskHi);
2551   }
2552 
2553   BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2554     .addReg(MaskedLo)
2555     .addImm(AMDGPU::sub0)
2556     .addReg(MaskedHi)
2557     .addImm(AMDGPU::sub1);
2558   I.eraseFromParent();
2559   return true;
2560 }
2561 
2562 /// Return the register to use for the index value, and the subregister to use
2563 /// for the indirectly accessed register.
2564 static std::pair<Register, unsigned>
2565 computeIndirectRegIndex(MachineRegisterInfo &MRI,
2566                         const SIRegisterInfo &TRI,
2567                         const TargetRegisterClass *SuperRC,
2568                         Register IdxReg,
2569                         unsigned EltSize) {
2570   Register IdxBaseReg;
2571   int Offset;
2572   MachineInstr *Unused;
2573 
2574   std::tie(IdxBaseReg, Offset, Unused)
2575     = AMDGPU::getBaseWithConstantOffset(MRI, IdxReg);
2576   if (IdxBaseReg == AMDGPU::NoRegister) {
2577     // This will happen if the index is a known constant. This should ordinarily
2578     // be legalized out, but handle it as a register just in case.
2579     assert(Offset == 0);
2580     IdxBaseReg = IdxReg;
2581   }
2582 
2583   ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
2584 
2585   // Skip out of bounds offsets, or else we would end up using an undefined
2586   // register.
2587   if (static_cast<unsigned>(Offset) >= SubRegs.size())
2588     return std::make_pair(IdxReg, SubRegs[0]);
2589   return std::make_pair(IdxBaseReg, SubRegs[Offset]);
2590 }
2591 
2592 bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
2593   MachineInstr &MI) const {
2594   Register DstReg = MI.getOperand(0).getReg();
2595   Register SrcReg = MI.getOperand(1).getReg();
2596   Register IdxReg = MI.getOperand(2).getReg();
2597 
2598   LLT DstTy = MRI->getType(DstReg);
2599   LLT SrcTy = MRI->getType(SrcReg);
2600 
2601   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2602   const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2603   const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
2604 
2605   // The index must be scalar. If it wasn't RegBankSelect should have moved this
2606   // into a waterfall loop.
2607   if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
2608     return false;
2609 
2610   const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB,
2611                                                                   *MRI);
2612   const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(DstTy, *DstRB,
2613                                                                   *MRI);
2614   if (!SrcRC || !DstRC)
2615     return false;
2616   if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2617       !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
2618       !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
2619     return false;
2620 
2621   MachineBasicBlock *BB = MI.getParent();
2622   const DebugLoc &DL = MI.getDebugLoc();
2623   const bool Is64 = DstTy.getSizeInBits() == 64;
2624 
2625   unsigned SubReg;
2626   std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, SrcRC, IdxReg,
2627                                                      DstTy.getSizeInBits() / 8);
2628 
2629   if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
2630     if (DstTy.getSizeInBits() != 32 && !Is64)
2631       return false;
2632 
2633     BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2634       .addReg(IdxReg);
2635 
2636     unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
2637     BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
2638       .addReg(SrcReg, 0, SubReg)
2639       .addReg(SrcReg, RegState::Implicit);
2640     MI.eraseFromParent();
2641     return true;
2642   }
2643 
2644   if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
2645     return false;
2646 
2647   if (!STI.useVGPRIndexMode()) {
2648     BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2649       .addReg(IdxReg);
2650     BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
2651       .addReg(SrcReg, 0, SubReg)
2652       .addReg(SrcReg, RegState::Implicit);
2653     MI.eraseFromParent();
2654     return true;
2655   }
2656 
2657   BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_ON))
2658     .addReg(IdxReg)
2659     .addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE);
2660   BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), DstReg)
2661     .addReg(SrcReg, 0, SubReg)
2662     .addReg(SrcReg, RegState::Implicit)
2663     .addReg(AMDGPU::M0, RegState::Implicit);
2664   BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_OFF));
2665 
2666   MI.eraseFromParent();
2667   return true;
2668 }
2669 
2670 // TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
2671 bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
2672   MachineInstr &MI) const {
2673   Register DstReg = MI.getOperand(0).getReg();
2674   Register VecReg = MI.getOperand(1).getReg();
2675   Register ValReg = MI.getOperand(2).getReg();
2676   Register IdxReg = MI.getOperand(3).getReg();
2677 
2678   LLT VecTy = MRI->getType(DstReg);
2679   LLT ValTy = MRI->getType(ValReg);
2680   unsigned VecSize = VecTy.getSizeInBits();
2681   unsigned ValSize = ValTy.getSizeInBits();
2682 
2683   const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
2684   const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
2685   const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
2686 
2687   assert(VecTy.getElementType() == ValTy);
2688 
2689   // The index must be scalar. If it wasn't RegBankSelect should have moved this
2690   // into a waterfall loop.
2691   if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
2692     return false;
2693 
2694   const TargetRegisterClass *VecRC = TRI.getRegClassForTypeOnBank(VecTy, *VecRB,
2695                                                                   *MRI);
2696   const TargetRegisterClass *ValRC = TRI.getRegClassForTypeOnBank(ValTy, *ValRB,
2697                                                                   *MRI);
2698 
2699   if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
2700       !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
2701       !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
2702       !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
2703     return false;
2704 
2705   if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
2706     return false;
2707 
2708   unsigned SubReg;
2709   std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg,
2710                                                      ValSize / 8);
2711 
2712   const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
2713                          STI.useVGPRIndexMode();
2714 
2715   MachineBasicBlock *BB = MI.getParent();
2716   const DebugLoc &DL = MI.getDebugLoc();
2717 
2718   if (IndexMode) {
2719     BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_ON))
2720       .addReg(IdxReg)
2721       .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE);
2722   } else {
2723     BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2724       .addReg(IdxReg);
2725   }
2726 
2727   const MCInstrDesc &RegWriteOp
2728     = TII.getIndirectRegWritePseudo(VecSize, ValSize,
2729                                     VecRB->getID() == AMDGPU::SGPRRegBankID);
2730   BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
2731     .addReg(VecReg)
2732     .addReg(ValReg)
2733     .addImm(SubReg);
2734 
2735   if (IndexMode)
2736     BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_OFF));
2737 
2738   MI.eraseFromParent();
2739   return true;
2740 }
2741 
2742 static bool isZeroOrUndef(int X) {
2743   return X == 0 || X == -1;
2744 }
2745 
2746 static bool isOneOrUndef(int X) {
2747   return X == 1 || X == -1;
2748 }
2749 
2750 static bool isZeroOrOneOrUndef(int X) {
2751   return X == 0 || X == 1 || X == -1;
2752 }
2753 
2754 // Normalize a VOP3P shuffle mask to refer to the low/high half of a single
2755 // 32-bit register.
2756 static Register normalizeVOP3PMask(int NewMask[2], Register Src0, Register Src1,
2757                                    ArrayRef<int> Mask) {
2758   NewMask[0] = Mask[0];
2759   NewMask[1] = Mask[1];
2760   if (isZeroOrOneOrUndef(Mask[0]) && isZeroOrOneOrUndef(Mask[1]))
2761     return Src0;
2762 
2763   assert(NewMask[0] == 2 || NewMask[0] == 3 || NewMask[0] == -1);
2764   assert(NewMask[1] == 2 || NewMask[1] == 3 || NewMask[1] == -1);
2765 
2766   // Shift the mask inputs to be 0/1;
2767   NewMask[0] = NewMask[0] == -1 ? -1 : NewMask[0] - 2;
2768   NewMask[1] = NewMask[1] == -1 ? -1 : NewMask[1] - 2;
2769   return Src1;
2770 }
2771 
2772 // This is only legal with VOP3P instructions as an aid to op_sel matching.
2773 bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR(
2774   MachineInstr &MI) const {
2775   Register DstReg = MI.getOperand(0).getReg();
2776   Register Src0Reg = MI.getOperand(1).getReg();
2777   Register Src1Reg = MI.getOperand(2).getReg();
2778   ArrayRef<int> ShufMask = MI.getOperand(3).getShuffleMask();
2779 
2780   const LLT V2S16 = LLT::vector(2, 16);
2781   if (MRI->getType(DstReg) != V2S16 || MRI->getType(Src0Reg) != V2S16)
2782     return false;
2783 
2784   if (!AMDGPU::isLegalVOP3PShuffleMask(ShufMask))
2785     return false;
2786 
2787   assert(ShufMask.size() == 2);
2788   assert(STI.hasSDWA() && "no target has VOP3P but not SDWA");
2789 
2790   MachineBasicBlock *MBB = MI.getParent();
2791   const DebugLoc &DL = MI.getDebugLoc();
2792 
2793   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2794   const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2795   const TargetRegisterClass &RC = IsVALU ?
2796     AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
2797 
2798   // Handle the degenerate case which should have folded out.
2799   if (ShufMask[0] == -1 && ShufMask[1] == -1) {
2800     BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), DstReg);
2801 
2802     MI.eraseFromParent();
2803     return RBI.constrainGenericRegister(DstReg, RC, *MRI);
2804   }
2805 
2806   // A legal VOP3P mask only reads one of the sources.
2807   int Mask[2];
2808   Register SrcVec = normalizeVOP3PMask(Mask, Src0Reg, Src1Reg, ShufMask);
2809 
2810   if (!RBI.constrainGenericRegister(DstReg, RC, *MRI) ||
2811       !RBI.constrainGenericRegister(SrcVec, RC, *MRI))
2812     return false;
2813 
2814   // TODO: This also should have been folded out
2815   if (isZeroOrUndef(Mask[0]) && isOneOrUndef(Mask[1])) {
2816     BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), DstReg)
2817       .addReg(SrcVec);
2818 
2819     MI.eraseFromParent();
2820     return true;
2821   }
2822 
2823   if (Mask[0] == 1 && Mask[1] == -1) {
2824     if (IsVALU) {
2825       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
2826         .addImm(16)
2827         .addReg(SrcVec);
2828     } else {
2829       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
2830         .addReg(SrcVec)
2831         .addImm(16);
2832     }
2833   } else if (Mask[0] == -1 && Mask[1] == 0) {
2834     if (IsVALU) {
2835       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), DstReg)
2836         .addImm(16)
2837         .addReg(SrcVec);
2838     } else {
2839       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHL_B32), DstReg)
2840         .addReg(SrcVec)
2841         .addImm(16);
2842     }
2843   } else if (Mask[0] == 0 && Mask[1] == 0) {
2844     if (IsVALU) {
2845       // Write low half of the register into the high half.
2846       MachineInstr *MovSDWA =
2847         BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2848         .addImm(0)                             // $src0_modifiers
2849         .addReg(SrcVec)                        // $src0
2850         .addImm(0)                             // $clamp
2851         .addImm(AMDGPU::SDWA::WORD_1)          // $dst_sel
2852         .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2853         .addImm(AMDGPU::SDWA::WORD_0)          // $src0_sel
2854         .addReg(SrcVec, RegState::Implicit);
2855       MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2856     } else {
2857       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg)
2858         .addReg(SrcVec)
2859         .addReg(SrcVec);
2860     }
2861   } else if (Mask[0] == 1 && Mask[1] == 1) {
2862     if (IsVALU) {
2863       // Write high half of the register into the low half.
2864       MachineInstr *MovSDWA =
2865         BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2866         .addImm(0)                             // $src0_modifiers
2867         .addReg(SrcVec)                        // $src0
2868         .addImm(0)                             // $clamp
2869         .addImm(AMDGPU::SDWA::WORD_0)          // $dst_sel
2870         .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2871         .addImm(AMDGPU::SDWA::WORD_1)          // $src0_sel
2872         .addReg(SrcVec, RegState::Implicit);
2873       MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2874     } else {
2875       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_HH_B32_B16), DstReg)
2876         .addReg(SrcVec)
2877         .addReg(SrcVec);
2878     }
2879   } else if (Mask[0] == 1 && Mask[1] == 0) {
2880     if (IsVALU) {
2881       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_ALIGNBIT_B32), DstReg)
2882         .addReg(SrcVec)
2883         .addReg(SrcVec)
2884         .addImm(16);
2885     } else {
2886       Register TmpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2887       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg)
2888         .addReg(SrcVec)
2889         .addImm(16);
2890       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg)
2891         .addReg(TmpReg)
2892         .addReg(SrcVec);
2893     }
2894   } else
2895     llvm_unreachable("all shuffle masks should be handled");
2896 
2897   MI.eraseFromParent();
2898   return true;
2899 }
2900 
2901 bool AMDGPUInstructionSelector::select(MachineInstr &I) {
2902   if (I.isPHI())
2903     return selectPHI(I);
2904 
2905   if (!I.isPreISelOpcode()) {
2906     if (I.isCopy())
2907       return selectCOPY(I);
2908     return true;
2909   }
2910 
2911   switch (I.getOpcode()) {
2912   case TargetOpcode::G_AND:
2913   case TargetOpcode::G_OR:
2914   case TargetOpcode::G_XOR:
2915     if (selectImpl(I, *CoverageInfo))
2916       return true;
2917     return selectG_AND_OR_XOR(I);
2918   case TargetOpcode::G_ADD:
2919   case TargetOpcode::G_SUB:
2920     if (selectImpl(I, *CoverageInfo))
2921       return true;
2922     return selectG_ADD_SUB(I);
2923   case TargetOpcode::G_UADDO:
2924   case TargetOpcode::G_USUBO:
2925   case TargetOpcode::G_UADDE:
2926   case TargetOpcode::G_USUBE:
2927     return selectG_UADDO_USUBO_UADDE_USUBE(I);
2928   case TargetOpcode::G_INTTOPTR:
2929   case TargetOpcode::G_BITCAST:
2930   case TargetOpcode::G_PTRTOINT:
2931     return selectCOPY(I);
2932   case TargetOpcode::G_CONSTANT:
2933   case TargetOpcode::G_FCONSTANT:
2934     return selectG_CONSTANT(I);
2935   case TargetOpcode::G_FNEG:
2936     if (selectImpl(I, *CoverageInfo))
2937       return true;
2938     return selectG_FNEG(I);
2939   case TargetOpcode::G_FABS:
2940     if (selectImpl(I, *CoverageInfo))
2941       return true;
2942     return selectG_FABS(I);
2943   case TargetOpcode::G_EXTRACT:
2944     return selectG_EXTRACT(I);
2945   case TargetOpcode::G_MERGE_VALUES:
2946   case TargetOpcode::G_BUILD_VECTOR:
2947   case TargetOpcode::G_CONCAT_VECTORS:
2948     return selectG_MERGE_VALUES(I);
2949   case TargetOpcode::G_UNMERGE_VALUES:
2950     return selectG_UNMERGE_VALUES(I);
2951   case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2952     return selectG_BUILD_VECTOR_TRUNC(I);
2953   case TargetOpcode::G_PTR_ADD:
2954     return selectG_PTR_ADD(I);
2955   case TargetOpcode::G_IMPLICIT_DEF:
2956     return selectG_IMPLICIT_DEF(I);
2957   case TargetOpcode::G_FREEZE:
2958     return selectCOPY(I);
2959   case TargetOpcode::G_INSERT:
2960     return selectG_INSERT(I);
2961   case TargetOpcode::G_INTRINSIC:
2962     return selectG_INTRINSIC(I);
2963   case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
2964     return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
2965   case TargetOpcode::G_ICMP:
2966     if (selectG_ICMP(I))
2967       return true;
2968     return selectImpl(I, *CoverageInfo);
2969   case TargetOpcode::G_LOAD:
2970   case TargetOpcode::G_STORE:
2971   case TargetOpcode::G_ATOMIC_CMPXCHG:
2972   case TargetOpcode::G_ATOMICRMW_XCHG:
2973   case TargetOpcode::G_ATOMICRMW_ADD:
2974   case TargetOpcode::G_ATOMICRMW_SUB:
2975   case TargetOpcode::G_ATOMICRMW_AND:
2976   case TargetOpcode::G_ATOMICRMW_OR:
2977   case TargetOpcode::G_ATOMICRMW_XOR:
2978   case TargetOpcode::G_ATOMICRMW_MIN:
2979   case TargetOpcode::G_ATOMICRMW_MAX:
2980   case TargetOpcode::G_ATOMICRMW_UMIN:
2981   case TargetOpcode::G_ATOMICRMW_UMAX:
2982   case TargetOpcode::G_ATOMICRMW_FADD:
2983   case AMDGPU::G_AMDGPU_ATOMIC_INC:
2984   case AMDGPU::G_AMDGPU_ATOMIC_DEC:
2985   case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
2986   case AMDGPU::G_AMDGPU_ATOMIC_FMAX:
2987     return selectG_LOAD_STORE_ATOMICRMW(I);
2988   case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
2989     return selectG_AMDGPU_ATOMIC_CMPXCHG(I);
2990   case TargetOpcode::G_SELECT:
2991     return selectG_SELECT(I);
2992   case TargetOpcode::G_TRUNC:
2993     return selectG_TRUNC(I);
2994   case TargetOpcode::G_SEXT:
2995   case TargetOpcode::G_ZEXT:
2996   case TargetOpcode::G_ANYEXT:
2997   case TargetOpcode::G_SEXT_INREG:
2998     if (selectImpl(I, *CoverageInfo))
2999       return true;
3000     return selectG_SZA_EXT(I);
3001   case TargetOpcode::G_BRCOND:
3002     return selectG_BRCOND(I);
3003   case TargetOpcode::G_GLOBAL_VALUE:
3004     return selectG_GLOBAL_VALUE(I);
3005   case TargetOpcode::G_PTRMASK:
3006     return selectG_PTRMASK(I);
3007   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3008     return selectG_EXTRACT_VECTOR_ELT(I);
3009   case TargetOpcode::G_INSERT_VECTOR_ELT:
3010     return selectG_INSERT_VECTOR_ELT(I);
3011   case TargetOpcode::G_SHUFFLE_VECTOR:
3012     return selectG_SHUFFLE_VECTOR(I);
3013   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3014   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
3015     const AMDGPU::ImageDimIntrinsicInfo *Intr
3016       = AMDGPU::getImageDimIntrinsicInfo(I.getIntrinsicID());
3017     assert(Intr && "not an image intrinsic with image pseudo");
3018     return selectImageIntrinsic(I, Intr);
3019   }
3020   default:
3021     return selectImpl(I, *CoverageInfo);
3022   }
3023   return false;
3024 }
3025 
3026 InstructionSelector::ComplexRendererFns
3027 AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
3028   return {{
3029       [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
3030   }};
3031 
3032 }
3033 
3034 std::pair<Register, unsigned>
3035 AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root) const {
3036   Register Src = Root.getReg();
3037   Register OrigSrc = Src;
3038   unsigned Mods = 0;
3039   MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
3040 
3041   if (MI && MI->getOpcode() == AMDGPU::G_FNEG) {
3042     Src = MI->getOperand(1).getReg();
3043     Mods |= SISrcMods::NEG;
3044     MI = getDefIgnoringCopies(Src, *MRI);
3045   }
3046 
3047   if (MI && MI->getOpcode() == AMDGPU::G_FABS) {
3048     Src = MI->getOperand(1).getReg();
3049     Mods |= SISrcMods::ABS;
3050   }
3051 
3052   if (Mods != 0 &&
3053       RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
3054     MachineInstr *UseMI = Root.getParent();
3055 
3056     // If we looked through copies to find source modifiers on an SGPR operand,
3057     // we now have an SGPR register source. To avoid potentially violating the
3058     // constant bus restriction, we need to insert a copy to a VGPR.
3059     Register VGPRSrc = MRI->cloneVirtualRegister(OrigSrc);
3060     BuildMI(*UseMI->getParent(), UseMI, UseMI->getDebugLoc(),
3061             TII.get(AMDGPU::COPY), VGPRSrc)
3062       .addReg(Src);
3063     Src = VGPRSrc;
3064   }
3065 
3066   return std::make_pair(Src, Mods);
3067 }
3068 
3069 ///
3070 /// This will select either an SGPR or VGPR operand and will save us from
3071 /// having to write an extra tablegen pattern.
3072 InstructionSelector::ComplexRendererFns
3073 AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
3074   return {{
3075       [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
3076   }};
3077 }
3078 
3079 InstructionSelector::ComplexRendererFns
3080 AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
3081   Register Src;
3082   unsigned Mods;
3083   std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3084 
3085   return {{
3086       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3087       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
3088       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },    // clamp
3089       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }     // omod
3090   }};
3091 }
3092 
3093 InstructionSelector::ComplexRendererFns
3094 AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
3095   return {{
3096       [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
3097       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3098       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // omod
3099   }};
3100 }
3101 
3102 InstructionSelector::ComplexRendererFns
3103 AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
3104   Register Src;
3105   unsigned Mods;
3106   std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3107 
3108   return {{
3109       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3110       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }  // src_mods
3111   }};
3112 }
3113 
3114 InstructionSelector::ComplexRendererFns
3115 AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
3116   Register Reg = Root.getReg();
3117   const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3118   if (Def && (Def->getOpcode() == AMDGPU::G_FNEG ||
3119               Def->getOpcode() == AMDGPU::G_FABS))
3120     return {};
3121   return {{
3122       [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3123   }};
3124 }
3125 
3126 std::pair<Register, unsigned>
3127 AMDGPUInstructionSelector::selectVOP3PModsImpl(
3128   Register Src, const MachineRegisterInfo &MRI) const {
3129   unsigned Mods = 0;
3130   MachineInstr *MI = MRI.getVRegDef(Src);
3131 
3132   if (MI && MI->getOpcode() == AMDGPU::G_FNEG &&
3133       // It's possible to see an f32 fneg here, but unlikely.
3134       // TODO: Treat f32 fneg as only high bit.
3135       MRI.getType(Src) == LLT::vector(2, 16)) {
3136     Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
3137     Src = MI->getOperand(1).getReg();
3138     MI = MRI.getVRegDef(Src);
3139   }
3140 
3141   // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
3142 
3143   // Packed instructions do not have abs modifiers.
3144   Mods |= SISrcMods::OP_SEL_1;
3145 
3146   return std::make_pair(Src, Mods);
3147 }
3148 
3149 InstructionSelector::ComplexRendererFns
3150 AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
3151   MachineRegisterInfo &MRI
3152     = Root.getParent()->getParent()->getParent()->getRegInfo();
3153 
3154   Register Src;
3155   unsigned Mods;
3156   std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI);
3157 
3158   return {{
3159       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3160       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }  // src_mods
3161   }};
3162 }
3163 
3164 InstructionSelector::ComplexRendererFns
3165 AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const {
3166   Register Src;
3167   unsigned Mods;
3168   std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3169   if (!TM.Options.NoNaNsFPMath && !isKnownNeverNaN(Src, *MRI))
3170     return None;
3171 
3172   return {{
3173       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3174       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }  // src_mods
3175   }};
3176 }
3177 
3178 InstructionSelector::ComplexRendererFns
3179 AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
3180   // FIXME: Handle op_sel
3181   return {{
3182       [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
3183       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // src_mods
3184   }};
3185 }
3186 
3187 InstructionSelector::ComplexRendererFns
3188 AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
3189   SmallVector<GEPInfo, 4> AddrInfo;
3190   getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
3191 
3192   if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3193     return None;
3194 
3195   const GEPInfo &GEPInfo = AddrInfo[0];
3196   Optional<int64_t> EncodedImm =
3197       AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm, false);
3198   if (!EncodedImm)
3199     return None;
3200 
3201   unsigned PtrReg = GEPInfo.SgprParts[0];
3202   return {{
3203     [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3204     [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
3205   }};
3206 }
3207 
3208 InstructionSelector::ComplexRendererFns
3209 AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
3210   SmallVector<GEPInfo, 4> AddrInfo;
3211   getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
3212 
3213   if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3214     return None;
3215 
3216   const GEPInfo &GEPInfo = AddrInfo[0];
3217   Register PtrReg = GEPInfo.SgprParts[0];
3218   Optional<int64_t> EncodedImm =
3219       AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
3220   if (!EncodedImm)
3221     return None;
3222 
3223   return {{
3224     [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3225     [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
3226   }};
3227 }
3228 
3229 InstructionSelector::ComplexRendererFns
3230 AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
3231   MachineInstr *MI = Root.getParent();
3232   MachineBasicBlock *MBB = MI->getParent();
3233 
3234   SmallVector<GEPInfo, 4> AddrInfo;
3235   getAddrModeInfo(*MI, *MRI, AddrInfo);
3236 
3237   // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
3238   // then we can select all ptr + 32-bit offsets not just immediate offsets.
3239   if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3240     return None;
3241 
3242   const GEPInfo &GEPInfo = AddrInfo[0];
3243   // SGPR offset is unsigned.
3244   if (!GEPInfo.Imm || GEPInfo.Imm < 0 || !isUInt<32>(GEPInfo.Imm))
3245     return None;
3246 
3247   // If we make it this far we have a load with an 32-bit immediate offset.
3248   // It is OK to select this using a sgpr offset, because we have already
3249   // failed trying to select this load into one of the _IMM variants since
3250   // the _IMM Patterns are considered before the _SGPR patterns.
3251   Register PtrReg = GEPInfo.SgprParts[0];
3252   Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3253   BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg)
3254           .addImm(GEPInfo.Imm);
3255   return {{
3256     [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3257     [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); }
3258   }};
3259 }
3260 
3261 template <bool Signed>
3262 InstructionSelector::ComplexRendererFns
3263 AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const {
3264   MachineInstr *MI = Root.getParent();
3265 
3266   InstructionSelector::ComplexRendererFns Default = {{
3267       [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
3268       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },  // offset
3269     }};
3270 
3271   if (!STI.hasFlatInstOffsets())
3272     return Default;
3273 
3274   const MachineInstr *OpDef = MRI->getVRegDef(Root.getReg());
3275   if (!OpDef || OpDef->getOpcode() != AMDGPU::G_PTR_ADD)
3276     return Default;
3277 
3278   Optional<int64_t> Offset =
3279     getConstantVRegVal(OpDef->getOperand(2).getReg(), *MRI);
3280   if (!Offset.hasValue())
3281     return Default;
3282 
3283   unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
3284   if (!TII.isLegalFLATOffset(Offset.getValue(), AddrSpace, Signed))
3285     return Default;
3286 
3287   Register BasePtr = OpDef->getOperand(1).getReg();
3288 
3289   return {{
3290       [=](MachineInstrBuilder &MIB) { MIB.addReg(BasePtr); },
3291       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset.getValue()); },
3292     }};
3293 }
3294 
3295 InstructionSelector::ComplexRendererFns
3296 AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
3297   return selectFlatOffsetImpl<false>(Root);
3298 }
3299 
3300 InstructionSelector::ComplexRendererFns
3301 AMDGPUInstructionSelector::selectFlatOffsetSigned(MachineOperand &Root) const {
3302   return selectFlatOffsetImpl<true>(Root);
3303 }
3304 
3305 /// Match a zero extend from a 32-bit value to 64-bits.
3306 static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
3307   Register ZExtSrc;
3308   if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc))))
3309     return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
3310 
3311   // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
3312   const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
3313   if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3314     return false;
3315 
3316   int64_t MergeRHS;
3317   if (mi_match(Def->getOperand(2).getReg(), MRI, m_ICst(MergeRHS)) &&
3318       MergeRHS == 0) {
3319     return Def->getOperand(1).getReg();
3320   }
3321 
3322   return Register();
3323 }
3324 
3325 // Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
3326 InstructionSelector::ComplexRendererFns
3327 AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
3328   Register PtrBase;
3329   int64_t ImmOffset;
3330 
3331   // Match the immediate offset first, which canonically is moved as low as
3332   // possible.
3333   std::tie(PtrBase, ImmOffset) = getPtrBaseWithConstantOffset(Root.getReg(),
3334                                                               *MRI);
3335 
3336   // TODO: Could split larger constant into VGPR offset.
3337   if (ImmOffset != 0 &&
3338       !TII.isLegalFLATOffset(ImmOffset, AMDGPUAS::GLOBAL_ADDRESS, true)) {
3339     PtrBase = Root.getReg();
3340     ImmOffset = 0;
3341   }
3342 
3343   // Match the variable offset.
3344   const MachineInstr *PtrBaseDef = getDefIgnoringCopies(PtrBase, *MRI);
3345   if (PtrBaseDef->getOpcode() != AMDGPU::G_PTR_ADD)
3346     return None;
3347 
3348   // Look through the SGPR->VGPR copy.
3349   Register PtrBaseSrc =
3350     getSrcRegIgnoringCopies(PtrBaseDef->getOperand(1).getReg(), *MRI);
3351   if (!PtrBaseSrc)
3352     return None;
3353 
3354   const RegisterBank *BaseRB = RBI.getRegBank(PtrBaseSrc, *MRI, TRI);
3355   if (BaseRB->getID() != AMDGPU::SGPRRegBankID)
3356     return None;
3357 
3358   Register SAddr = PtrBaseSrc;
3359   Register PtrBaseOffset = PtrBaseDef->getOperand(2).getReg();
3360 
3361   // It's possible voffset is an SGPR here, but the copy to VGPR will be
3362   // inserted later.
3363   Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset);
3364   if (!VOffset)
3365     return None;
3366 
3367   return {{[=](MachineInstrBuilder &MIB) { // saddr
3368              MIB.addReg(SAddr);
3369            },
3370            [=](MachineInstrBuilder &MIB) { // voffset
3371              MIB.addReg(VOffset);
3372            },
3373            [=](MachineInstrBuilder &MIB) { // offset
3374              MIB.addImm(ImmOffset);
3375            }}};
3376 }
3377 
3378 static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) {
3379   auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>();
3380   return PSV && PSV->isStack();
3381 }
3382 
3383 InstructionSelector::ComplexRendererFns
3384 AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
3385   MachineInstr *MI = Root.getParent();
3386   MachineBasicBlock *MBB = MI->getParent();
3387   MachineFunction *MF = MBB->getParent();
3388   const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3389 
3390   int64_t Offset = 0;
3391   if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
3392       Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) {
3393     Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3394 
3395     // TODO: Should this be inside the render function? The iterator seems to
3396     // move.
3397     BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
3398             HighBits)
3399       .addImm(Offset & ~4095);
3400 
3401     return {{[=](MachineInstrBuilder &MIB) { // rsrc
3402                MIB.addReg(Info->getScratchRSrcReg());
3403              },
3404              [=](MachineInstrBuilder &MIB) { // vaddr
3405                MIB.addReg(HighBits);
3406              },
3407              [=](MachineInstrBuilder &MIB) { // soffset
3408                const MachineMemOperand *MMO = *MI->memoperands_begin();
3409                const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();
3410 
3411                if (isStackPtrRelative(PtrInfo))
3412                  MIB.addReg(Info->getStackPtrOffsetReg());
3413                else
3414                  MIB.addImm(0);
3415              },
3416              [=](MachineInstrBuilder &MIB) { // offset
3417                MIB.addImm(Offset & 4095);
3418              }}};
3419   }
3420 
3421   assert(Offset == 0 || Offset == -1);
3422 
3423   // Try to fold a frame index directly into the MUBUF vaddr field, and any
3424   // offsets.
3425   Optional<int> FI;
3426   Register VAddr = Root.getReg();
3427   if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) {
3428     if (isBaseWithConstantOffset(Root, *MRI)) {
3429       const MachineOperand &LHS = RootDef->getOperand(1);
3430       const MachineOperand &RHS = RootDef->getOperand(2);
3431       const MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg());
3432       const MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg());
3433       if (LHSDef && RHSDef) {
3434         int64_t PossibleOffset =
3435             RHSDef->getOperand(1).getCImm()->getSExtValue();
3436         if (SIInstrInfo::isLegalMUBUFImmOffset(PossibleOffset) &&
3437             (!STI.privateMemoryResourceIsRangeChecked() ||
3438              KnownBits->signBitIsZero(LHS.getReg()))) {
3439           if (LHSDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
3440             FI = LHSDef->getOperand(1).getIndex();
3441           else
3442             VAddr = LHS.getReg();
3443           Offset = PossibleOffset;
3444         }
3445       }
3446     } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
3447       FI = RootDef->getOperand(1).getIndex();
3448     }
3449   }
3450 
3451   return {{[=](MachineInstrBuilder &MIB) { // rsrc
3452              MIB.addReg(Info->getScratchRSrcReg());
3453            },
3454            [=](MachineInstrBuilder &MIB) { // vaddr
3455              if (FI.hasValue())
3456                MIB.addFrameIndex(FI.getValue());
3457              else
3458                MIB.addReg(VAddr);
3459            },
3460            [=](MachineInstrBuilder &MIB) { // soffset
3461              // If we don't know this private access is a local stack object, it
3462              // needs to be relative to the entry point's scratch wave offset.
3463              // TODO: Should split large offsets that don't fit like above.
3464              // TODO: Don't use scratch wave offset just because the offset
3465              // didn't fit.
3466              if (!Info->isEntryFunction() && FI.hasValue())
3467                MIB.addReg(Info->getStackPtrOffsetReg());
3468              else
3469                MIB.addImm(0);
3470            },
3471            [=](MachineInstrBuilder &MIB) { // offset
3472              MIB.addImm(Offset);
3473            }}};
3474 }
3475 
3476 bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
3477                                                 int64_t Offset,
3478                                                 unsigned OffsetBits) const {
3479   if ((OffsetBits == 16 && !isUInt<16>(Offset)) ||
3480       (OffsetBits == 8 && !isUInt<8>(Offset)))
3481     return false;
3482 
3483   if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
3484     return true;
3485 
3486   // On Southern Islands instruction with a negative base value and an offset
3487   // don't seem to work.
3488   return KnownBits->signBitIsZero(Base);
3489 }
3490 
3491 InstructionSelector::ComplexRendererFns
3492 AMDGPUInstructionSelector::selectMUBUFScratchOffset(
3493     MachineOperand &Root) const {
3494   MachineInstr *MI = Root.getParent();
3495   MachineBasicBlock *MBB = MI->getParent();
3496 
3497   int64_t Offset = 0;
3498   if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
3499       !SIInstrInfo::isLegalMUBUFImmOffset(Offset))
3500     return {};
3501 
3502   const MachineFunction *MF = MBB->getParent();
3503   const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3504   const MachineMemOperand *MMO = *MI->memoperands_begin();
3505   const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();
3506 
3507   return {{
3508       [=](MachineInstrBuilder &MIB) { // rsrc
3509         MIB.addReg(Info->getScratchRSrcReg());
3510       },
3511       [=](MachineInstrBuilder &MIB) { // soffset
3512         if (isStackPtrRelative(PtrInfo))
3513           MIB.addReg(Info->getStackPtrOffsetReg());
3514         else
3515           MIB.addImm(0);
3516       },
3517       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
3518   }};
3519 }
3520 
3521 std::pair<Register, unsigned>
3522 AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
3523   const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
3524   if (!RootDef)
3525     return std::make_pair(Root.getReg(), 0);
3526 
3527   int64_t ConstAddr = 0;
3528 
3529   Register PtrBase;
3530   int64_t Offset;
3531   std::tie(PtrBase, Offset) =
3532     getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
3533 
3534   if (Offset) {
3535     if (isDSOffsetLegal(PtrBase, Offset, 16)) {
3536       // (add n0, c0)
3537       return std::make_pair(PtrBase, Offset);
3538     }
3539   } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
3540     // TODO
3541 
3542 
3543   } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
3544     // TODO
3545 
3546   }
3547 
3548   return std::make_pair(Root.getReg(), 0);
3549 }
3550 
3551 InstructionSelector::ComplexRendererFns
3552 AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
3553   Register Reg;
3554   unsigned Offset;
3555   std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
3556   return {{
3557       [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3558       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
3559     }};
3560 }
3561 
3562 InstructionSelector::ComplexRendererFns
3563 AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
3564   return selectDSReadWrite2(Root, false);
3565 }
3566 
3567 InstructionSelector::ComplexRendererFns
3568 AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
3569   return selectDSReadWrite2(Root, true);
3570 }
3571 
3572 InstructionSelector::ComplexRendererFns
3573 AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
3574                                               bool IsDS128) const {
3575   Register Reg;
3576   unsigned Offset;
3577   std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, IsDS128);
3578   return {{
3579       [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3580       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
3581       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
3582     }};
3583 }
3584 
3585 std::pair<Register, unsigned>
3586 AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
3587                                                   bool IsDS128) const {
3588   const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
3589   if (!RootDef)
3590     return std::make_pair(Root.getReg(), 0);
3591 
3592   int64_t ConstAddr = 0;
3593 
3594   Register PtrBase;
3595   int64_t Offset;
3596   std::tie(PtrBase, Offset) =
3597     getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
3598 
3599   if (Offset) {
3600     int64_t OffsetValue0 = Offset / (IsDS128 ? 8 : 4);
3601     int64_t OffsetValue1 = OffsetValue0 + 1;
3602     if (isDSOffsetLegal(PtrBase, OffsetValue1, (IsDS128 ? 16 : 8))) {
3603       // (add n0, c0)
3604       return std::make_pair(PtrBase, OffsetValue0);
3605     }
3606   } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
3607     // TODO
3608 
3609   } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
3610     // TODO
3611 
3612   }
3613 
3614   return std::make_pair(Root.getReg(), 0);
3615 }
3616 
3617 /// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
3618 /// the base value with the constant offset. There may be intervening copies
3619 /// between \p Root and the identified constant. Returns \p Root, 0 if this does
3620 /// not match the pattern.
3621 std::pair<Register, int64_t>
3622 AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
3623   Register Root, const MachineRegisterInfo &MRI) const {
3624   MachineInstr *RootI = getDefIgnoringCopies(Root, MRI);
3625   if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
3626     return {Root, 0};
3627 
3628   MachineOperand &RHS = RootI->getOperand(2);
3629   Optional<ValueAndVReg> MaybeOffset
3630     = getConstantVRegValWithLookThrough(RHS.getReg(), MRI, true);
3631   if (!MaybeOffset)
3632     return {Root, 0};
3633   return {RootI->getOperand(1).getReg(), MaybeOffset->Value};
3634 }
3635 
3636 static void addZeroImm(MachineInstrBuilder &MIB) {
3637   MIB.addImm(0);
3638 }
3639 
3640 /// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
3641 /// BasePtr is not valid, a null base pointer will be used.
3642 static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
3643                           uint32_t FormatLo, uint32_t FormatHi,
3644                           Register BasePtr) {
3645   Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
3646   Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
3647   Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3648   Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
3649 
3650   B.buildInstr(AMDGPU::S_MOV_B32)
3651     .addDef(RSrc2)
3652     .addImm(FormatLo);
3653   B.buildInstr(AMDGPU::S_MOV_B32)
3654     .addDef(RSrc3)
3655     .addImm(FormatHi);
3656 
3657   // Build the half of the subregister with the constants before building the
3658   // full 128-bit register. If we are building multiple resource descriptors,
3659   // this will allow CSEing of the 2-component register.
3660   B.buildInstr(AMDGPU::REG_SEQUENCE)
3661     .addDef(RSrcHi)
3662     .addReg(RSrc2)
3663     .addImm(AMDGPU::sub0)
3664     .addReg(RSrc3)
3665     .addImm(AMDGPU::sub1);
3666 
3667   Register RSrcLo = BasePtr;
3668   if (!BasePtr) {
3669     RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3670     B.buildInstr(AMDGPU::S_MOV_B64)
3671       .addDef(RSrcLo)
3672       .addImm(0);
3673   }
3674 
3675   B.buildInstr(AMDGPU::REG_SEQUENCE)
3676     .addDef(RSrc)
3677     .addReg(RSrcLo)
3678     .addImm(AMDGPU::sub0_sub1)
3679     .addReg(RSrcHi)
3680     .addImm(AMDGPU::sub2_sub3);
3681 
3682   return RSrc;
3683 }
3684 
3685 static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
3686                                 const SIInstrInfo &TII, Register BasePtr) {
3687   uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
3688 
3689   // FIXME: Why are half the "default" bits ignored based on the addressing
3690   // mode?
3691   return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
3692 }
3693 
3694 static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
3695                                const SIInstrInfo &TII, Register BasePtr) {
3696   uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
3697 
3698   // FIXME: Why are half the "default" bits ignored based on the addressing
3699   // mode?
3700   return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
3701 }
3702 
3703 AMDGPUInstructionSelector::MUBUFAddressData
3704 AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
3705   MUBUFAddressData Data;
3706   Data.N0 = Src;
3707 
3708   Register PtrBase;
3709   int64_t Offset;
3710 
3711   std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI);
3712   if (isUInt<32>(Offset)) {
3713     Data.N0 = PtrBase;
3714     Data.Offset = Offset;
3715   }
3716 
3717   if (MachineInstr *InputAdd
3718       = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
3719     Data.N2 = InputAdd->getOperand(1).getReg();
3720     Data.N3 = InputAdd->getOperand(2).getReg();
3721 
3722     // FIXME: Need to fix extra SGPR->VGPRcopies inserted
3723     // FIXME: Don't know this was defined by operand 0
3724     //
3725     // TODO: Remove this when we have copy folding optimizations after
3726     // RegBankSelect.
3727     Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
3728     Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
3729   }
3730 
3731   return Data;
3732 }
3733 
3734 /// Return if the addr64 mubuf mode should be used for the given address.
3735 bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
3736   // (ptr_add N2, N3) -> addr64, or
3737   // (ptr_add (ptr_add N2, N3), C1) -> addr64
3738   if (Addr.N2)
3739     return true;
3740 
3741   const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
3742   return N0Bank->getID() == AMDGPU::VGPRRegBankID;
3743 }
3744 
3745 /// Split an immediate offset \p ImmOffset depending on whether it fits in the
3746 /// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
3747 /// component.
3748 void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
3749   MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
3750   if (SIInstrInfo::isLegalMUBUFImmOffset(ImmOffset))
3751     return;
3752 
3753   // Illegal offset, store it in soffset.
3754   SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3755   B.buildInstr(AMDGPU::S_MOV_B32)
3756     .addDef(SOffset)
3757     .addImm(ImmOffset);
3758   ImmOffset = 0;
3759 }
3760 
3761 bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
3762   MachineOperand &Root, Register &VAddr, Register &RSrcReg,
3763   Register &SOffset, int64_t &Offset) const {
3764   // FIXME: Predicates should stop this from reaching here.
3765   // addr64 bit was removed for volcanic islands.
3766   if (!STI.hasAddr64() || STI.useFlatForGlobal())
3767     return false;
3768 
3769   MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
3770   if (!shouldUseAddr64(AddrData))
3771     return false;
3772 
3773   Register N0 = AddrData.N0;
3774   Register N2 = AddrData.N2;
3775   Register N3 = AddrData.N3;
3776   Offset = AddrData.Offset;
3777 
3778   // Base pointer for the SRD.
3779   Register SRDPtr;
3780 
3781   if (N2) {
3782     if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
3783       assert(N3);
3784       if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
3785         // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
3786         // addr64, and construct the default resource from a 0 address.
3787         VAddr = N0;
3788       } else {
3789         SRDPtr = N3;
3790         VAddr = N2;
3791       }
3792     } else {
3793       // N2 is not divergent.
3794       SRDPtr = N2;
3795       VAddr = N3;
3796     }
3797   } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
3798     // Use the default null pointer in the resource
3799     VAddr = N0;
3800   } else {
3801     // N0 -> offset, or
3802     // (N0 + C1) -> offset
3803     SRDPtr = N0;
3804   }
3805 
3806   MachineIRBuilder B(*Root.getParent());
3807   RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
3808   splitIllegalMUBUFOffset(B, SOffset, Offset);
3809   return true;
3810 }
3811 
3812 bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
3813   MachineOperand &Root, Register &RSrcReg, Register &SOffset,
3814   int64_t &Offset) const {
3815 
3816   // FIXME: Pattern should not reach here.
3817   if (STI.useFlatForGlobal())
3818     return false;
3819 
3820   MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
3821   if (shouldUseAddr64(AddrData))
3822     return false;
3823 
3824   // N0 -> offset, or
3825   // (N0 + C1) -> offset
3826   Register SRDPtr = AddrData.N0;
3827   Offset = AddrData.Offset;
3828 
3829   // TODO: Look through extensions for 32-bit soffset.
3830   MachineIRBuilder B(*Root.getParent());
3831 
3832   RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
3833   splitIllegalMUBUFOffset(B, SOffset, Offset);
3834   return true;
3835 }
3836 
3837 InstructionSelector::ComplexRendererFns
3838 AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
3839   Register VAddr;
3840   Register RSrcReg;
3841   Register SOffset;
3842   int64_t Offset = 0;
3843 
3844   if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
3845     return {};
3846 
3847   // FIXME: Use defaulted operands for trailing 0s and remove from the complex
3848   // pattern.
3849   return {{
3850       [=](MachineInstrBuilder &MIB) {  // rsrc
3851         MIB.addReg(RSrcReg);
3852       },
3853       [=](MachineInstrBuilder &MIB) { // vaddr
3854         MIB.addReg(VAddr);
3855       },
3856       [=](MachineInstrBuilder &MIB) { // soffset
3857         if (SOffset)
3858           MIB.addReg(SOffset);
3859         else
3860           MIB.addImm(0);
3861       },
3862       [=](MachineInstrBuilder &MIB) { // offset
3863         MIB.addImm(Offset);
3864       },
3865       addZeroImm, //  glc
3866       addZeroImm, //  slc
3867       addZeroImm, //  tfe
3868       addZeroImm, //  dlc
3869       addZeroImm  //  swz
3870     }};
3871 }
3872 
3873 InstructionSelector::ComplexRendererFns
3874 AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
3875   Register RSrcReg;
3876   Register SOffset;
3877   int64_t Offset = 0;
3878 
3879   if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
3880     return {};
3881 
3882   return {{
3883       [=](MachineInstrBuilder &MIB) {  // rsrc
3884         MIB.addReg(RSrcReg);
3885       },
3886       [=](MachineInstrBuilder &MIB) { // soffset
3887         if (SOffset)
3888           MIB.addReg(SOffset);
3889         else
3890           MIB.addImm(0);
3891       },
3892       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
3893       addZeroImm, //  glc
3894       addZeroImm, //  slc
3895       addZeroImm, //  tfe
3896       addZeroImm, //  dlc
3897       addZeroImm  //  swz
3898     }};
3899 }
3900 
3901 InstructionSelector::ComplexRendererFns
3902 AMDGPUInstructionSelector::selectMUBUFAddr64Atomic(MachineOperand &Root) const {
3903   Register VAddr;
3904   Register RSrcReg;
3905   Register SOffset;
3906   int64_t Offset = 0;
3907 
3908   if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
3909     return {};
3910 
3911   // FIXME: Use defaulted operands for trailing 0s and remove from the complex
3912   // pattern.
3913   return {{
3914       [=](MachineInstrBuilder &MIB) {  // rsrc
3915         MIB.addReg(RSrcReg);
3916       },
3917       [=](MachineInstrBuilder &MIB) { // vaddr
3918         MIB.addReg(VAddr);
3919       },
3920       [=](MachineInstrBuilder &MIB) { // soffset
3921         if (SOffset)
3922           MIB.addReg(SOffset);
3923         else
3924           MIB.addImm(0);
3925       },
3926       [=](MachineInstrBuilder &MIB) { // offset
3927         MIB.addImm(Offset);
3928       },
3929       addZeroImm //  slc
3930     }};
3931 }
3932 
3933 InstructionSelector::ComplexRendererFns
3934 AMDGPUInstructionSelector::selectMUBUFOffsetAtomic(MachineOperand &Root) const {
3935   Register RSrcReg;
3936   Register SOffset;
3937   int64_t Offset = 0;
3938 
3939   if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
3940     return {};
3941 
3942   return {{
3943       [=](MachineInstrBuilder &MIB) {  // rsrc
3944         MIB.addReg(RSrcReg);
3945       },
3946       [=](MachineInstrBuilder &MIB) { // soffset
3947         if (SOffset)
3948           MIB.addReg(SOffset);
3949         else
3950           MIB.addImm(0);
3951       },
3952       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
3953       addZeroImm //  slc
3954     }};
3955 }
3956 
3957 /// Get an immediate that must be 32-bits, and treated as zero extended.
3958 static Optional<uint64_t> getConstantZext32Val(Register Reg,
3959                                                const MachineRegisterInfo &MRI) {
3960   // getConstantVRegVal sexts any values, so see if that matters.
3961   Optional<int64_t> OffsetVal = getConstantVRegVal(Reg, MRI);
3962   if (!OffsetVal || !isInt<32>(*OffsetVal))
3963     return None;
3964   return Lo_32(*OffsetVal);
3965 }
3966 
3967 InstructionSelector::ComplexRendererFns
3968 AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
3969   Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
3970   if (!OffsetVal)
3971     return {};
3972 
3973   Optional<int64_t> EncodedImm =
3974       AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
3975   if (!EncodedImm)
3976     return {};
3977 
3978   return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }  }};
3979 }
3980 
3981 InstructionSelector::ComplexRendererFns
3982 AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
3983   assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
3984 
3985   Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
3986   if (!OffsetVal)
3987     return {};
3988 
3989   Optional<int64_t> EncodedImm
3990     = AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
3991   if (!EncodedImm)
3992     return {};
3993 
3994   return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }  }};
3995 }
3996 
3997 void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
3998                                                  const MachineInstr &MI,
3999                                                  int OpIdx) const {
4000   assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
4001          "Expected G_CONSTANT");
4002   MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
4003 }
4004 
4005 void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
4006                                                 const MachineInstr &MI,
4007                                                 int OpIdx) const {
4008   assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
4009          "Expected G_CONSTANT");
4010   MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
4011 }
4012 
4013 void AMDGPUInstructionSelector::renderBitcastImm(MachineInstrBuilder &MIB,
4014                                                  const MachineInstr &MI,
4015                                                  int OpIdx) const {
4016   assert(OpIdx == -1);
4017 
4018   const MachineOperand &Op = MI.getOperand(1);
4019   if (MI.getOpcode() == TargetOpcode::G_FCONSTANT)
4020     MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
4021   else {
4022     assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
4023     MIB.addImm(Op.getCImm()->getSExtValue());
4024   }
4025 }
4026 
4027 void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
4028                                                 const MachineInstr &MI,
4029                                                 int OpIdx) const {
4030   assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
4031          "Expected G_CONSTANT");
4032   MIB.addImm(MI.getOperand(1).getCImm()->getValue().countPopulation());
4033 }
4034 
4035 /// This only really exists to satisfy DAG type checking machinery, so is a
4036 /// no-op here.
4037 void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
4038                                                 const MachineInstr &MI,
4039                                                 int OpIdx) const {
4040   MIB.addImm(MI.getOperand(OpIdx).getImm());
4041 }
4042 
4043 void AMDGPUInstructionSelector::renderExtractGLC(MachineInstrBuilder &MIB,
4044                                                  const MachineInstr &MI,
4045                                                  int OpIdx) const {
4046   assert(OpIdx >= 0 && "expected to match an immediate operand");
4047   MIB.addImm(MI.getOperand(OpIdx).getImm() & 1);
4048 }
4049 
4050 void AMDGPUInstructionSelector::renderExtractSLC(MachineInstrBuilder &MIB,
4051                                                  const MachineInstr &MI,
4052                                                  int OpIdx) const {
4053   assert(OpIdx >= 0 && "expected to match an immediate operand");
4054   MIB.addImm((MI.getOperand(OpIdx).getImm() >> 1) & 1);
4055 }
4056 
4057 void AMDGPUInstructionSelector::renderExtractDLC(MachineInstrBuilder &MIB,
4058                                                  const MachineInstr &MI,
4059                                                  int OpIdx) const {
4060   assert(OpIdx >= 0 && "expected to match an immediate operand");
4061   MIB.addImm((MI.getOperand(OpIdx).getImm() >> 2) & 1);
4062 }
4063 
4064 void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
4065                                                  const MachineInstr &MI,
4066                                                  int OpIdx) const {
4067   assert(OpIdx >= 0 && "expected to match an immediate operand");
4068   MIB.addImm((MI.getOperand(OpIdx).getImm() >> 3) & 1);
4069 }
4070 
4071 void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
4072                                                  const MachineInstr &MI,
4073                                                  int OpIdx) const {
4074   MIB.addFrameIndex((MI.getOperand(1).getIndex()));
4075 }
4076 
4077 bool AMDGPUInstructionSelector::isInlineImmediate16(int64_t Imm) const {
4078   return AMDGPU::isInlinableLiteral16(Imm, STI.hasInv2PiInlineImm());
4079 }
4080 
4081 bool AMDGPUInstructionSelector::isInlineImmediate32(int64_t Imm) const {
4082   return AMDGPU::isInlinableLiteral32(Imm, STI.hasInv2PiInlineImm());
4083 }
4084 
4085 bool AMDGPUInstructionSelector::isInlineImmediate64(int64_t Imm) const {
4086   return AMDGPU::isInlinableLiteral64(Imm, STI.hasInv2PiInlineImm());
4087 }
4088 
4089 bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
4090   return TII.isInlineConstant(Imm);
4091 }
4092