1 //===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the InstructionSelector class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPUInstructionSelector.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
17 #include "AMDGPURegisterBankInfo.h"
18 #include "AMDGPUSubtarget.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
21 #include "SIMachineFunctionInfo.h"
22 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
23 #include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
24 #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
25 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
26 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
27 #include "llvm/CodeGen/GlobalISel/Utils.h"
28 #include "llvm/CodeGen/MachineBasicBlock.h"
29 #include "llvm/CodeGen/MachineFunction.h"
30 #include "llvm/CodeGen/MachineInstr.h"
31 #include "llvm/CodeGen/MachineInstrBuilder.h"
32 #include "llvm/CodeGen/MachineRegisterInfo.h"
33 #include "llvm/IR/Type.h"
34 #include "llvm/Support/Debug.h"
35 #include "llvm/Support/raw_ostream.h"
36 
37 #define DEBUG_TYPE "amdgpu-isel"
38 
39 using namespace llvm;
40 using namespace MIPatternMatch;
41 
42 static cl::opt<bool> AllowRiskySelect(
43   "amdgpu-global-isel-risky-select",
44   cl::desc("Allow GlobalISel to select cases that are likely to not work yet"),
45   cl::init(false),
46   cl::ReallyHidden);
47 
48 #define GET_GLOBALISEL_IMPL
49 #define AMDGPUSubtarget GCNSubtarget
50 #include "AMDGPUGenGlobalISel.inc"
51 #undef GET_GLOBALISEL_IMPL
52 #undef AMDGPUSubtarget
53 
54 AMDGPUInstructionSelector::AMDGPUInstructionSelector(
55     const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
56     const AMDGPUTargetMachine &TM)
57     : InstructionSelector(), TII(*STI.getInstrInfo()),
58       TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
59       STI(STI),
60       EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG),
61 #define GET_GLOBALISEL_PREDICATES_INIT
62 #include "AMDGPUGenGlobalISel.inc"
63 #undef GET_GLOBALISEL_PREDICATES_INIT
64 #define GET_GLOBALISEL_TEMPORARIES_INIT
65 #include "AMDGPUGenGlobalISel.inc"
66 #undef GET_GLOBALISEL_TEMPORARIES_INIT
67 {
68 }
69 
70 const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
71 
72 void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits &KB,
73                                         CodeGenCoverage &CoverageInfo) {
74   MRI = &MF.getRegInfo();
75   InstructionSelector::setupMF(MF, KB, CoverageInfo);
76 }
77 
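// Return whether Reg holds a condition value that belongs in the VCC bank:
// the physical VCC register itself, a virtual register constrained to the
// bool register class (or a subclass of it) with an s1 type, or a virtual
// register assigned to the VCC register bank.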
78 bool AMDGPUInstructionSelector::isVCC(Register Reg,
79                                       const MachineRegisterInfo &MRI) const {
80   if (Register::isPhysicalRegister(Reg))
81     return Reg == TRI.getVCC();
82 
83   auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
84   const TargetRegisterClass *RC =
85       RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
86   if (RC) {
87     const LLT Ty = MRI.getType(Reg);
88     return RC->hasSuperClassEq(TRI.getBoolRC()) &&
89            Ty.isValid() && Ty.getSizeInBits() == 1;
90   }
91 
92   const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
93   return RB->getID() == AMDGPU::VCCRegBankID;
94 }
95 
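// Rewrite a copy-like intrinsic (wqm, softwqm, wwm) into NewOpc: drop the
// intrinsic ID operand, add an implicit use of exec, and constrain the source
// and destination to a common register class.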
96 bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
97                                                         unsigned NewOpc) const {
98   MI.setDesc(TII.get(NewOpc));
99   MI.RemoveOperand(1); // Remove intrinsic ID.
100   MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
101 
102   MachineOperand &Dst = MI.getOperand(0);
103   MachineOperand &Src = MI.getOperand(1);
104 
105   // TODO: This should be legalized to s32 if needed
106   if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
107     return false;
108 
109   const TargetRegisterClass *DstRC
110     = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
111   const TargetRegisterClass *SrcRC
112     = TRI.getConstrainedRegClassForOperand(Src, *MRI);
113   if (!DstRC || DstRC != SrcRC)
114     return false;
115 
116   return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
117          RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
118 }
119 
120 bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
121   const DebugLoc &DL = I.getDebugLoc();
122   MachineBasicBlock *BB = I.getParent();
123   I.setDesc(TII.get(TargetOpcode::COPY));
124 
125   const MachineOperand &Src = I.getOperand(1);
126   MachineOperand &Dst = I.getOperand(0);
127   Register DstReg = Dst.getReg();
128   Register SrcReg = Src.getReg();
129 
130   if (isVCC(DstReg, *MRI)) {
131     if (SrcReg == AMDGPU::SCC) {
132       const TargetRegisterClass *RC
133         = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
134       if (!RC)
135         return true;
136       return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
137     }
138 
139     if (!isVCC(SrcReg, *MRI)) {
140       // TODO: Should probably leave the copy and let copyPhysReg expand it.
141       if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
142         return false;
143 
144       const TargetRegisterClass *SrcRC
145         = TRI.getConstrainedRegClassForOperand(Src, *MRI);
146 
147       Register MaskedReg = MRI->createVirtualRegister(SrcRC);
148 
149       // We can't trust the high bits at this point, so clear them.
150 
151       // TODO: Skip masking high bits if def is known boolean.
152 
153       unsigned AndOpc = TRI.isSGPRClass(SrcRC) ?
154         AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
155       BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
156         .addImm(1)
157         .addReg(SrcReg);
158       BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
159         .addImm(0)
160         .addReg(MaskedReg);
161 
162       if (!MRI->getRegClassOrNull(SrcReg))
163         MRI->setRegClass(SrcReg, SrcRC);
164       I.eraseFromParent();
165       return true;
166     }
167 
168     const TargetRegisterClass *RC =
169       TRI.getConstrainedRegClassForOperand(Dst, *MRI);
170     if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
171       return false;
172 
173     return true;
174   }
175 
176   for (const MachineOperand &MO : I.operands()) {
177     if (Register::isPhysicalRegister(MO.getReg()))
178       continue;
179 
180     const TargetRegisterClass *RC =
181             TRI.getConstrainedRegClassForOperand(MO, *MRI);
182     if (!RC)
183       continue;
184     RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
185   }
186   return true;
187 }
188 
189 bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
190   const Register DefReg = I.getOperand(0).getReg();
191   const LLT DefTy = MRI->getType(DefReg);
192   if (DefTy == LLT::scalar(1)) {
193     if (!AllowRiskySelect) {
194       LLVM_DEBUG(dbgs() << "Skipping risky boolean phi\n");
195       return false;
196     }
197 
198     LLVM_DEBUG(dbgs() << "Selecting risky boolean phi\n");
199   }
200 
  // TODO: Verify this doesn't have insane operands (e.g. VGPR to SGPR copy)
202 
203   const RegClassOrRegBank &RegClassOrBank =
204     MRI->getRegClassOrRegBank(DefReg);
205 
206   const TargetRegisterClass *DefRC
207     = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
208   if (!DefRC) {
209     if (!DefTy.isValid()) {
210       LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
211       return false;
212     }
213 
214     const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
215     DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, *MRI);
216     if (!DefRC) {
217       LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
218       return false;
219     }
220   }
221 
222   // TODO: Verify that all registers have the same bank
223   I.setDesc(TII.get(TargetOpcode::PHI));
224   return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
225 }
226 
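// Return a 32-bit operand for the half of the 64-bit operand MO selected by
// SubIdx. A register operand is split with a subregister COPY into a new
// register of class SubRC; an immediate returns the corresponding 32-bit half,
// e.g. sub0 of the immediate 0x300000001 is 1 and sub1 is 3.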
227 MachineOperand
228 AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
229                                            const TargetRegisterClass &SubRC,
230                                            unsigned SubIdx) const {
232   MachineInstr *MI = MO.getParent();
233   MachineBasicBlock *BB = MO.getParent()->getParent();
234   Register DstReg = MRI->createVirtualRegister(&SubRC);
235 
236   if (MO.isReg()) {
237     unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
238     Register Reg = MO.getReg();
239     BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
240             .addReg(Reg, 0, ComposedSubIdx);
241 
242     return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
243                                      MO.isKill(), MO.isDead(), MO.isUndef(),
244                                      MO.isEarlyClobber(), 0, MO.isDebug(),
245                                      MO.isInternalRead());
246   }
247 
248   assert(MO.isImm());
249 
250   APInt Imm(64, MO.getImm());
251 
252   switch (SubIdx) {
253   default:
    llvm_unreachable("do not know how to split immediate with this sub index.");
255   case AMDGPU::sub0:
256     return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
257   case AMDGPU::sub1:
258     return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
259   }
260 }
261 
262 static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
263   switch (Opc) {
264   case AMDGPU::G_AND:
265     return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
266   case AMDGPU::G_OR:
267     return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
268   case AMDGPU::G_XOR:
269     return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
270   default:
271     llvm_unreachable("not a bit op");
272   }
273 }
274 
275 bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
276   Register DstReg = I.getOperand(0).getReg();
277   unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
278 
279   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
280   if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
281       DstRB->getID() != AMDGPU::VCCRegBankID)
282     return false;
283 
284   bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
285                             STI.isWave64());
286   I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));
287 
288   // Dead implicit-def of scc
289   I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
290                                          true, // isImp
291                                          false, // isKill
292                                          true)); // isDead
293   return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
294 }
295 
296 bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
297   MachineBasicBlock *BB = I.getParent();
298   MachineFunction *MF = BB->getParent();
299   Register DstReg = I.getOperand(0).getReg();
300   const DebugLoc &DL = I.getDebugLoc();
301   LLT Ty = MRI->getType(DstReg);
302   if (Ty.isVector())
303     return false;
304 
305   unsigned Size = Ty.getSizeInBits();
306   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
307   const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
308   const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;
309 
310   if (Size == 32) {
311     if (IsSALU) {
312       const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
313       MachineInstr *Add =
314         BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
315         .add(I.getOperand(1))
316         .add(I.getOperand(2));
317       I.eraseFromParent();
318       return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
319     }
320 
321     if (STI.hasAddNoCarry()) {
322       const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
323       I.setDesc(TII.get(Opc));
324       I.addOperand(*MF, MachineOperand::CreateImm(0));
325       I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
326       return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
327     }
328 
329     const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;
330 
331     Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
332     MachineInstr *Add
333       = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
334       .addDef(UnusedCarry, RegState::Dead)
335       .add(I.getOperand(1))
336       .add(I.getOperand(2))
337       .addImm(0);
338     I.eraseFromParent();
339     return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
340   }
341 
342   assert(!Sub && "illegal sub should not reach here");
343 
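  // Expand the 64-bit add into a lo/hi pair of 32-bit adds chained through the
  // carry: implicitly through SCC on the SALU path, through an explicit
  // wave-mask carry register on the VALU path. The halves are recombined with
  // a REG_SEQUENCE.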
344   const TargetRegisterClass &RC
345     = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
346   const TargetRegisterClass &HalfRC
347     = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;
348 
349   MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
350   MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
351   MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
352   MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));
353 
354   Register DstLo = MRI->createVirtualRegister(&HalfRC);
355   Register DstHi = MRI->createVirtualRegister(&HalfRC);
356 
357   if (IsSALU) {
358     BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
359       .add(Lo1)
360       .add(Lo2);
361     BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
362       .add(Hi1)
363       .add(Hi2);
364   } else {
365     const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
366     Register CarryReg = MRI->createVirtualRegister(CarryRC);
367     BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
368       .addDef(CarryReg)
369       .add(Lo1)
370       .add(Lo2)
371       .addImm(0);
372     MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
373       .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
374       .add(Hi1)
375       .add(Hi2)
376       .addReg(CarryReg, RegState::Kill)
377       .addImm(0);
378 
379     if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
380       return false;
381   }
382 
383   BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
384     .addReg(DstLo)
385     .addImm(AMDGPU::sub0)
386     .addReg(DstHi)
387     .addImm(AMDGPU::sub1);
388 
  if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
391     return false;
392 
393   I.eraseFromParent();
394   return true;
395 }
396 
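// Select 32-bit add/sub with carry-out (G_UADDO/G_USUBO) and with carry-in
// (G_UADDE/G_USUBE). A carry register on the VCC bank selects the VALU
// V_ADD_CO/V_SUB_CO/V_ADDC/V_SUBB forms; otherwise the SALU forms are used,
// with the carry copied to and from SCC explicitly.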
397 bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
398   MachineInstr &I) const {
399   MachineBasicBlock *BB = I.getParent();
400   MachineFunction *MF = BB->getParent();
401   const DebugLoc &DL = I.getDebugLoc();
402   Register Dst0Reg = I.getOperand(0).getReg();
403   Register Dst1Reg = I.getOperand(1).getReg();
404   const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
405                      I.getOpcode() == AMDGPU::G_UADDE;
406   const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
407                           I.getOpcode() == AMDGPU::G_USUBE;
408 
409   if (isVCC(Dst1Reg, *MRI)) {
410     unsigned NoCarryOpc =
411         IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
412     unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
413     I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
414     I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
415     I.addOperand(*MF, MachineOperand::CreateImm(0));
416     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
417   }
418 
419   Register Src0Reg = I.getOperand(2).getReg();
420   Register Src1Reg = I.getOperand(3).getReg();
421 
422   if (HasCarryIn) {
423     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
424       .addReg(I.getOperand(4).getReg());
425   }
426 
427   unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
428   unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
429 
430   BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
431     .add(I.getOperand(2))
432     .add(I.getOperand(3));
433   BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
434     .addReg(AMDGPU::SCC);
435 
436   if (!MRI->getRegClassOrNull(Dst1Reg))
437     MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
438 
439   if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
440       !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
441       !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
442     return false;
443 
444   if (HasCarryIn &&
445       !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
446                                     AMDGPU::SReg_32RegClass, *MRI))
447     return false;
448 
449   I.eraseFromParent();
450   return true;
451 }
452 
453 // TODO: We should probably legalize these to only using 32-bit results.
454 bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
455   MachineBasicBlock *BB = I.getParent();
456   Register DstReg = I.getOperand(0).getReg();
457   Register SrcReg = I.getOperand(1).getReg();
458   LLT DstTy = MRI->getType(DstReg);
459   LLT SrcTy = MRI->getType(SrcReg);
460   const unsigned SrcSize = SrcTy.getSizeInBits();
461   unsigned DstSize = DstTy.getSizeInBits();
462 
463   // TODO: Should handle any multiple of 32 offset.
464   unsigned Offset = I.getOperand(2).getImm();
465   if (Offset % 32 != 0 || DstSize > 128)
466     return false;
467 
468   // 16-bit operations really use 32-bit registers.
469   // FIXME: Probably should not allow 16-bit G_EXTRACT results.
470   if (DstSize == 16)
471     DstSize = 32;
472 
473   const TargetRegisterClass *DstRC =
474     TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
475   if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
476     return false;
477 
478   const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
479   const TargetRegisterClass *SrcRC =
480     TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
481   if (!SrcRC)
482     return false;
483   unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
484                                                          DstSize / 32);
485   SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
486   if (!SrcRC)
487     return false;
488 
489   SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
490                                     *SrcRC, I.getOperand(1));
491   const DebugLoc &DL = I.getDebugLoc();
492   BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
493     .addReg(SrcReg, 0, SubReg);
494 
495   I.eraseFromParent();
496   return true;
497 }
498 
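// Select G_MERGE_VALUES of 32-bit or wider pieces as a single REG_SEQUENCE;
// narrower sources are left to the imported TableGen patterns.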
499 bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
500   MachineBasicBlock *BB = MI.getParent();
501   Register DstReg = MI.getOperand(0).getReg();
502   LLT DstTy = MRI->getType(DstReg);
503   LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());
504 
505   const unsigned SrcSize = SrcTy.getSizeInBits();
506   if (SrcSize < 32)
507     return selectImpl(MI, *CoverageInfo);
508 
509   const DebugLoc &DL = MI.getDebugLoc();
510   const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
511   const unsigned DstSize = DstTy.getSizeInBits();
512   const TargetRegisterClass *DstRC =
513     TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
514   if (!DstRC)
515     return false;
516 
517   ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
518   MachineInstrBuilder MIB =
519     BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
520   for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
521     MachineOperand &Src = MI.getOperand(I + 1);
522     MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
523     MIB.addImm(SubRegs[I]);
524 
525     const TargetRegisterClass *SrcRC
526       = TRI.getConstrainedRegClassForOperand(Src, *MRI);
527     if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
528       return false;
529   }
530 
531   if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
532     return false;
533 
534   MI.eraseFromParent();
535   return true;
536 }
537 
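// Select G_UNMERGE_VALUES by copying each result out of the corresponding
// subregister of the source register.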
538 bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
539   MachineBasicBlock *BB = MI.getParent();
540   const int NumDst = MI.getNumOperands() - 1;
541 
542   MachineOperand &Src = MI.getOperand(NumDst);
543 
544   Register SrcReg = Src.getReg();
545   Register DstReg0 = MI.getOperand(0).getReg();
546   LLT DstTy = MRI->getType(DstReg0);
547   LLT SrcTy = MRI->getType(SrcReg);
548 
549   const unsigned DstSize = DstTy.getSizeInBits();
550   const unsigned SrcSize = SrcTy.getSizeInBits();
551   const DebugLoc &DL = MI.getDebugLoc();
552   const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
553 
554   const TargetRegisterClass *SrcRC =
555     TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
556   if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
557     return false;
558 
559   const unsigned SrcFlags = getUndefRegState(Src.isUndef());
560 
561   // Note we could have mixed SGPR and VGPR destination banks for an SGPR
562   // source, and this relies on the fact that the same subregister indices are
563   // used for both.
564   ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
565   for (int I = 0, E = NumDst; I != E; ++I) {
566     MachineOperand &Dst = MI.getOperand(I);
567     BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
568       .addReg(SrcReg, SrcFlags, SubRegs[I]);
569 
570     // Make sure the subregister index is valid for the source register.
571     SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
572     if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
573       return false;
574 
575     const TargetRegisterClass *DstRC =
576       TRI.getConstrainedRegClassForOperand(Dst, *MRI);
577     if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
578       return false;
579   }
580 
581   MI.eraseFromParent();
582   return true;
583 }
584 
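// Select an SGPR v2s16 build_vector_trunc of two s32 sources. A pair of
// constants folds into a single S_MOV_B32, an undef high half becomes a plain
// COPY, and 16-bit right shifts of the inputs select the S_PACK_LH/S_PACK_HH
// forms (or S_LSHR_B32 when the high half is a constant zero); everything else
// uses S_PACK_LL_B32_B16.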
585 bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC(
586   MachineInstr &MI) const {
587   if (selectImpl(MI, *CoverageInfo))
588     return true;
589 
590   const LLT S32 = LLT::scalar(32);
591   const LLT V2S16 = LLT::vector(2, 16);
592 
593   Register Dst = MI.getOperand(0).getReg();
594   if (MRI->getType(Dst) != V2S16)
595     return false;
596 
597   const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
598   if (DstBank->getID() != AMDGPU::SGPRRegBankID)
599     return false;
600 
601   Register Src0 = MI.getOperand(1).getReg();
602   Register Src1 = MI.getOperand(2).getReg();
603   if (MRI->getType(Src0) != S32)
604     return false;
605 
606   const DebugLoc &DL = MI.getDebugLoc();
607   MachineBasicBlock *BB = MI.getParent();
608 
609   auto ConstSrc1 = getConstantVRegValWithLookThrough(Src1, *MRI, true, true);
610   if (ConstSrc1) {
611     auto ConstSrc0 = getConstantVRegValWithLookThrough(Src0, *MRI, true, true);
612     if (ConstSrc0) {
613       uint32_t Lo16 = static_cast<uint32_t>(ConstSrc0->Value) & 0xffff;
614       uint32_t Hi16 = static_cast<uint32_t>(ConstSrc1->Value) & 0xffff;
615 
616       BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst)
617         .addImm(Lo16 | (Hi16 << 16));
618       MI.eraseFromParent();
619       return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
620     }
621   }
622 
623   // TODO: This should probably be a combine somewhere
  // (build_vector_trunc $src0, undef) -> (copy $src0)
625   MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
626   if (Src1Def && Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
627     MI.setDesc(TII.get(AMDGPU::COPY));
628     MI.RemoveOperand(2);
629     return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI) &&
630            RBI.constrainGenericRegister(Src0, AMDGPU::SReg_32RegClass, *MRI);
631   }
632 
633   Register ShiftSrc0;
634   Register ShiftSrc1;
635   int64_t ShiftAmt;
636 
637   // With multiple uses of the shift, this will duplicate the shift and
638   // increase register pressure.
639   //
  // (build_vector_trunc (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16))
641   //  => (S_PACK_HH_B32_B16 $src0, $src1)
642   // (build_vector_trunc $src0, (lshr_oneuse SReg_32:$src1, 16))
643   //  => (S_PACK_LH_B32_B16 $src0, $src1)
644   // (build_vector_trunc $src0, $src1)
645   //  => (S_PACK_LL_B32_B16 $src0, $src1)
646 
647   // FIXME: This is an inconvenient way to check a specific value
648   bool Shift0 = mi_match(
649     Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_ICst(ShiftAmt)))) &&
650     ShiftAmt == 16;
651 
652   bool Shift1 = mi_match(
653     Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_ICst(ShiftAmt)))) &&
654     ShiftAmt == 16;
655 
656   unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
657   if (Shift0 && Shift1) {
658     Opc = AMDGPU::S_PACK_HH_B32_B16;
659     MI.getOperand(1).setReg(ShiftSrc0);
660     MI.getOperand(2).setReg(ShiftSrc1);
661   } else if (Shift1) {
662     Opc = AMDGPU::S_PACK_LH_B32_B16;
663     MI.getOperand(2).setReg(ShiftSrc1);
664   } else if (Shift0 && ConstSrc1 && ConstSrc1->Value == 0) {
665     // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
666     auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
667       .addReg(ShiftSrc0)
668       .addImm(16);
669 
670     MI.eraseFromParent();
671     return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
672   }
673 
674   MI.setDesc(TII.get(Opc));
675   return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
676 }
677 
678 bool AMDGPUInstructionSelector::selectG_PTR_ADD(MachineInstr &I) const {
679   return selectG_ADD_SUB(I);
680 }
681 
682 bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
683   const MachineOperand &MO = I.getOperand(0);
684 
685   // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
686   // regbank check here is to know why getConstrainedRegClassForOperand failed.
687   const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
688   if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
689       (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
690     I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
691     return true;
692   }
693 
694   return false;
695 }
696 
697 bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
698   MachineBasicBlock *BB = I.getParent();
699 
700   Register DstReg = I.getOperand(0).getReg();
701   Register Src0Reg = I.getOperand(1).getReg();
702   Register Src1Reg = I.getOperand(2).getReg();
703   LLT Src1Ty = MRI->getType(Src1Reg);
704 
705   unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
706   unsigned InsSize = Src1Ty.getSizeInBits();
707 
708   int64_t Offset = I.getOperand(3).getImm();
709 
710   // FIXME: These cases should have been illegal and unnecessary to check here.
711   if (Offset % 32 != 0 || InsSize % 32 != 0)
712     return false;
713 
714   // Currently not handled by getSubRegFromChannel.
715   if (InsSize > 128)
716     return false;
717 
718   unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
719   if (SubReg == AMDGPU::NoSubRegister)
720     return false;
721 
722   const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
723   const TargetRegisterClass *DstRC =
724     TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
725   if (!DstRC)
726     return false;
727 
728   const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
729   const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
730   const TargetRegisterClass *Src0RC =
731     TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank, *MRI);
732   const TargetRegisterClass *Src1RC =
733     TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank, *MRI);
734 
735   // Deal with weird cases where the class only partially supports the subreg
736   // index.
737   Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
738   if (!Src0RC || !Src1RC)
739     return false;
740 
741   if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
742       !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
743       !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
744     return false;
745 
746   const DebugLoc &DL = I.getDebugLoc();
747   BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
748     .addReg(Src0Reg)
749     .addReg(Src1Reg)
750     .addImm(SubReg);
751 
752   I.eraseFromParent();
753   return true;
754 }
755 
756 bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
757   if (STI.getLDSBankCount() != 16)
758     return selectImpl(MI, *CoverageInfo);
759 
760   Register Dst = MI.getOperand(0).getReg();
761   Register Src0 = MI.getOperand(2).getReg();
762   Register M0Val = MI.getOperand(6).getReg();
763   if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
764       !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
765       !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
766     return false;
767 
768   // This requires 2 instructions. It is possible to write a pattern to support
769   // this, but the generated isel emitter doesn't correctly deal with multiple
770   // output instructions using the same physical register input. The copy to m0
771   // is incorrectly placed before the second instruction.
772   //
773   // TODO: Match source modifiers.
774 
775   Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
776   const DebugLoc &DL = MI.getDebugLoc();
777   MachineBasicBlock *MBB = MI.getParent();
778 
779   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
780     .addReg(M0Val);
781   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
782     .addImm(2)
783     .addImm(MI.getOperand(4).getImm())  // $attr
784     .addImm(MI.getOperand(3).getImm()); // $attrchan
785 
786   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
787     .addImm(0)                          // $src0_modifiers
788     .addReg(Src0)                       // $src0
789     .addImm(MI.getOperand(4).getImm())  // $attr
790     .addImm(MI.getOperand(3).getImm())  // $attrchan
791     .addImm(0)                          // $src2_modifiers
792     .addReg(InterpMov)                  // $src2 - 2 f16 values selected by high
793     .addImm(MI.getOperand(5).getImm())  // $high
794     .addImm(0)                          // $clamp
795     .addImm(0);                         // $omod
796 
797   MI.eraseFromParent();
798   return true;
799 }
800 
801 // We need to handle this here because tablegen doesn't support matching
802 // instructions with multiple outputs.
803 bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
804   Register Dst0 = MI.getOperand(0).getReg();
805   Register Dst1 = MI.getOperand(1).getReg();
806 
807   LLT Ty = MRI->getType(Dst0);
808   unsigned Opc;
809   if (Ty == LLT::scalar(32))
810     Opc = AMDGPU::V_DIV_SCALE_F32;
811   else if (Ty == LLT::scalar(64))
812     Opc = AMDGPU::V_DIV_SCALE_F64;
813   else
814     return false;
815 
816   const DebugLoc &DL = MI.getDebugLoc();
817   MachineBasicBlock *MBB = MI.getParent();
818 
819   Register Numer = MI.getOperand(3).getReg();
820   Register Denom = MI.getOperand(4).getReg();
821   unsigned ChooseDenom = MI.getOperand(5).getImm();
822 
823   Register Src0 = ChooseDenom != 0 ? Numer : Denom;
824 
825   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
826     .addDef(Dst1)
827     .addUse(Src0)
828     .addUse(Denom)
829     .addUse(Numer);
830 
831   MI.eraseFromParent();
832   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
833 }
834 
835 bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
836   unsigned IntrinsicID = I.getIntrinsicID();
837   switch (IntrinsicID) {
838   case Intrinsic::amdgcn_if_break: {
839     MachineBasicBlock *BB = I.getParent();
840 
    // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
842     // SelectionDAG uses for wave32 vs wave64.
843     BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
844       .add(I.getOperand(0))
845       .add(I.getOperand(2))
846       .add(I.getOperand(3));
847 
848     Register DstReg = I.getOperand(0).getReg();
849     Register Src0Reg = I.getOperand(2).getReg();
850     Register Src1Reg = I.getOperand(3).getReg();
851 
852     I.eraseFromParent();
853 
854     for (Register Reg : { DstReg, Src0Reg, Src1Reg })
855       MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
856 
857     return true;
858   }
859   case Intrinsic::amdgcn_interp_p1_f16:
860     return selectInterpP1F16(I);
861   case Intrinsic::amdgcn_wqm:
862     return constrainCopyLikeIntrin(I, AMDGPU::WQM);
863   case Intrinsic::amdgcn_softwqm:
864     return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
865   case Intrinsic::amdgcn_wwm:
866     return constrainCopyLikeIntrin(I, AMDGPU::WWM);
867   case Intrinsic::amdgcn_div_scale:
868     return selectDivScale(I);
869   case Intrinsic::amdgcn_icmp:
870     return selectIntrinsicIcmp(I);
871   case Intrinsic::amdgcn_ballot:
872     return selectBallot(I);
873   case Intrinsic::amdgcn_reloc_constant:
874     return selectRelocConstant(I);
875   case Intrinsic::returnaddress:
876     return selectReturnAddress(I);
877   default:
878     return selectImpl(I, *CoverageInfo);
879   }
880 }
881 
882 static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size) {
883   if (Size != 32 && Size != 64)
884     return -1;
885   switch (P) {
886   default:
887     llvm_unreachable("Unknown condition code!");
888   case CmpInst::ICMP_NE:
889     return Size == 32 ? AMDGPU::V_CMP_NE_U32_e64 : AMDGPU::V_CMP_NE_U64_e64;
890   case CmpInst::ICMP_EQ:
891     return Size == 32 ? AMDGPU::V_CMP_EQ_U32_e64 : AMDGPU::V_CMP_EQ_U64_e64;
892   case CmpInst::ICMP_SGT:
893     return Size == 32 ? AMDGPU::V_CMP_GT_I32_e64 : AMDGPU::V_CMP_GT_I64_e64;
894   case CmpInst::ICMP_SGE:
895     return Size == 32 ? AMDGPU::V_CMP_GE_I32_e64 : AMDGPU::V_CMP_GE_I64_e64;
896   case CmpInst::ICMP_SLT:
897     return Size == 32 ? AMDGPU::V_CMP_LT_I32_e64 : AMDGPU::V_CMP_LT_I64_e64;
898   case CmpInst::ICMP_SLE:
899     return Size == 32 ? AMDGPU::V_CMP_LE_I32_e64 : AMDGPU::V_CMP_LE_I64_e64;
900   case CmpInst::ICMP_UGT:
901     return Size == 32 ? AMDGPU::V_CMP_GT_U32_e64 : AMDGPU::V_CMP_GT_U64_e64;
902   case CmpInst::ICMP_UGE:
903     return Size == 32 ? AMDGPU::V_CMP_GE_U32_e64 : AMDGPU::V_CMP_GE_U64_e64;
904   case CmpInst::ICMP_ULT:
905     return Size == 32 ? AMDGPU::V_CMP_LT_U32_e64 : AMDGPU::V_CMP_LT_U64_e64;
906   case CmpInst::ICMP_ULE:
907     return Size == 32 ? AMDGPU::V_CMP_LE_U32_e64 : AMDGPU::V_CMP_LE_U64_e64;
908   }
909 }
910 
911 int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
912                                               unsigned Size) const {
913   if (Size == 64) {
914     if (!STI.hasScalarCompareEq64())
915       return -1;
916 
917     switch (P) {
918     case CmpInst::ICMP_NE:
919       return AMDGPU::S_CMP_LG_U64;
920     case CmpInst::ICMP_EQ:
921       return AMDGPU::S_CMP_EQ_U64;
922     default:
923       return -1;
924     }
925   }
926 
927   if (Size != 32)
928     return -1;
929 
930   switch (P) {
931   case CmpInst::ICMP_NE:
932     return AMDGPU::S_CMP_LG_U32;
933   case CmpInst::ICMP_EQ:
934     return AMDGPU::S_CMP_EQ_U32;
935   case CmpInst::ICMP_SGT:
936     return AMDGPU::S_CMP_GT_I32;
937   case CmpInst::ICMP_SGE:
938     return AMDGPU::S_CMP_GE_I32;
939   case CmpInst::ICMP_SLT:
940     return AMDGPU::S_CMP_LT_I32;
941   case CmpInst::ICMP_SLE:
942     return AMDGPU::S_CMP_LE_I32;
943   case CmpInst::ICMP_UGT:
944     return AMDGPU::S_CMP_GT_U32;
945   case CmpInst::ICMP_UGE:
946     return AMDGPU::S_CMP_GE_U32;
947   case CmpInst::ICMP_ULT:
948     return AMDGPU::S_CMP_LT_U32;
949   case CmpInst::ICMP_ULE:
950     return AMDGPU::S_CMP_LE_U32;
951   default:
952     llvm_unreachable("Unknown condition code!");
953   }
954 }
955 
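// Select G_ICMP. A result outside the VCC bank uses the scalar S_CMP forms and
// copies the result out of SCC; a VCC-bank result selects the corresponding
// VALU V_CMP form directly.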
956 bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const {
957   MachineBasicBlock *BB = I.getParent();
958   const DebugLoc &DL = I.getDebugLoc();
959 
960   Register SrcReg = I.getOperand(2).getReg();
961   unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
962 
963   auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
964 
965   Register CCReg = I.getOperand(0).getReg();
966   if (!isVCC(CCReg, *MRI)) {
967     int Opcode = getS_CMPOpcode(Pred, Size);
968     if (Opcode == -1)
969       return false;
970     MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
971             .add(I.getOperand(2))
972             .add(I.getOperand(3));
973     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
974       .addReg(AMDGPU::SCC);
975     bool Ret =
976         constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
977         RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
978     I.eraseFromParent();
979     return Ret;
980   }
981 
982   int Opcode = getV_CMPOpcode(Pred, Size);
983   if (Opcode == -1)
984     return false;
985 
986   MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode),
987             I.getOperand(0).getReg())
988             .add(I.getOperand(2))
989             .add(I.getOperand(3));
990   RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
991                                *TRI.getBoolRC(), *MRI);
992   bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
993   I.eraseFromParent();
994   return Ret;
995 }
996 
997 bool AMDGPUInstructionSelector::selectIntrinsicIcmp(MachineInstr &I) const {
998   Register Dst = I.getOperand(0).getReg();
999   if (isVCC(Dst, *MRI))
1000     return false;
1001 
1002   if (MRI->getType(Dst).getSizeInBits() != STI.getWavefrontSize())
1003     return false;
1004 
1005   MachineBasicBlock *BB = I.getParent();
1006   const DebugLoc &DL = I.getDebugLoc();
1007   Register SrcReg = I.getOperand(2).getReg();
1008   unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1009   auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
1010 
1011   int Opcode = getV_CMPOpcode(Pred, Size);
1012   if (Opcode == -1)
1013     return false;
1014 
1015   MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst)
1016                            .add(I.getOperand(2))
1017                            .add(I.getOperand(3));
1018   RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(), *TRI.getBoolRC(),
1019                                *MRI);
1020   bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1021   I.eraseFromParent();
1022   return Ret;
1023 }
1024 
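// Select llvm.amdgcn.ballot. The result must be wavefront-sized: a constant
// false argument becomes a zero move, a constant true (-1) argument becomes a
// copy of exec, other constants are rejected, and a non-constant argument is
// copied through directly.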
1025 bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1026   MachineBasicBlock *BB = I.getParent();
1027   const DebugLoc &DL = I.getDebugLoc();
1028   Register DstReg = I.getOperand(0).getReg();
1029   const unsigned Size = MRI->getType(DstReg).getSizeInBits();
1030   const bool Is64 = Size == 64;
1031 
1032   if (Size != STI.getWavefrontSize())
1033     return false;
1034 
1035   Optional<ValueAndVReg> Arg =
1036       getConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI, true);
1037 
1038   if (Arg.hasValue()) {
1039     const int64_t Value = Arg.getValue().Value;
1040     if (Value == 0) {
1041       unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1042       BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
1043     } else if (Value == -1) { // all ones
1044       Register SrcReg = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
1045       BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
1046     } else
1047       return false;
1048   } else {
1049     Register SrcReg = I.getOperand(2).getReg();
1050     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
1051   }
1052 
1053   I.eraseFromParent();
1054   return true;
1055 }
1056 
1057 bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1058   Register DstReg = I.getOperand(0).getReg();
1059   const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1060   const TargetRegisterClass *DstRC =
1061     TRI.getRegClassForSizeOnBank(32, *DstBank, *MRI);
1062   if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
1063     return false;
1064 
1065   const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1066 
1067   Module *M = MF->getFunction().getParent();
1068   const MDNode *Metadata = I.getOperand(2).getMetadata();
1069   auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
1070   auto RelocSymbol = cast<GlobalVariable>(
1071     M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
1072 
1073   MachineBasicBlock *BB = I.getParent();
1074   BuildMI(*BB, &I, I.getDebugLoc(),
1075           TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
1076     .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);
1077 
1078   I.eraseFromParent();
1079   return true;
1080 }
1081 
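// Select llvm.returnaddress. Entry functions and nonzero depths fold to a
// constant zero; otherwise the return address register is marked live-in and
// copied to the result.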
1082 bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
1083   MachineBasicBlock *MBB = I.getParent();
1084   MachineFunction &MF = *MBB->getParent();
1085   const DebugLoc &DL = I.getDebugLoc();
1086 
1087   MachineOperand &Dst = I.getOperand(0);
1088   Register DstReg = Dst.getReg();
1089   unsigned Depth = I.getOperand(2).getImm();
1090 
1091   const TargetRegisterClass *RC
1092     = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
1093   if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
1094       !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
1095     return false;
1096 
1097   // Check for kernel and shader functions
1098   if (Depth != 0 ||
1099       MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1100     BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
1101       .addImm(0);
1102     I.eraseFromParent();
1103     return true;
1104   }
1105 
1106   MachineFrameInfo &MFI = MF.getFrameInfo();
1107   // There is a call to @llvm.returnaddress in this function
1108   MFI.setReturnAddressIsTaken(true);
1109 
1110   // Get the return address reg and mark it as an implicit live-in
1111   Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
1112   Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
1113                                              AMDGPU::SReg_64RegClass);
1114   BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1115     .addReg(LiveIn);
1116   I.eraseFromParent();
1117   return true;
1118 }
1119 
1120 bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
  // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1122   // SelectionDAG uses for wave32 vs wave64.
1123   MachineBasicBlock *BB = MI.getParent();
1124   BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1125       .add(MI.getOperand(1));
1126 
1127   Register Reg = MI.getOperand(1).getReg();
1128   MI.eraseFromParent();
1129 
1130   if (!MRI->getRegClassOrNull(Reg))
1131     MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1132   return true;
1133 }
1134 
1135 static unsigned getDSShaderTypeValue(const MachineFunction &MF) {
1136   switch (MF.getFunction().getCallingConv()) {
1137   case CallingConv::AMDGPU_PS:
1138     return 1;
1139   case CallingConv::AMDGPU_VS:
1140     return 2;
1141   case CallingConv::AMDGPU_GS:
1142     return 3;
1143   case CallingConv::AMDGPU_HS:
1144   case CallingConv::AMDGPU_LS:
1145   case CallingConv::AMDGPU_ES:
1146     report_fatal_error("ds_ordered_count unsupported for this calling conv");
1147   case CallingConv::AMDGPU_CS:
1148   case CallingConv::AMDGPU_KERNEL:
1149   case CallingConv::C:
1150   case CallingConv::Fast:
1151   default:
1152     // Assume other calling conventions are various compute callable functions
1153     return 0;
1154   }
1155 }
1156 
1157 bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1158   MachineInstr &MI, Intrinsic::ID IntrID) const {
1159   MachineBasicBlock *MBB = MI.getParent();
1160   MachineFunction *MF = MBB->getParent();
1161   const DebugLoc &DL = MI.getDebugLoc();
1162 
1163   unsigned IndexOperand = MI.getOperand(7).getImm();
1164   bool WaveRelease = MI.getOperand(8).getImm() != 0;
1165   bool WaveDone = MI.getOperand(9).getImm() != 0;
1166 
1167   if (WaveDone && !WaveRelease)
1168     report_fatal_error("ds_ordered_count: wave_done requires wave_release");
1169 
1170   unsigned OrderedCountIndex = IndexOperand & 0x3f;
1171   IndexOperand &= ~0x3f;
1172   unsigned CountDw = 0;
1173 
1174   if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
1175     CountDw = (IndexOperand >> 24) & 0xf;
1176     IndexOperand &= ~(0xf << 24);
1177 
1178     if (CountDw < 1 || CountDw > 4) {
1179       report_fatal_error(
1180         "ds_ordered_count: dword count must be between 1 and 4");
1181     }
1182   }
1183 
1184   if (IndexOperand)
1185     report_fatal_error("ds_ordered_count: bad index operand");
1186 
1187   unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1188   unsigned ShaderType = getDSShaderTypeValue(*MF);
1189 
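  // Pack the DS_ORDERED_COUNT offset field as built below:
  //   offset0[7:2] = ordered-count index scaled to a byte offset
  //   offset1[0]   = wave_release, offset1[1] = wave_done
  //   offset1[3:2] = shader type, offset1[4] = instruction (0=add, 1=swap)
  //   offset1[7:6] = dword count - 1 (GFX10 and later)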
1190   unsigned Offset0 = OrderedCountIndex << 2;
1191   unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) |
1192                      (Instruction << 4);
1193 
1194   if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
1195     Offset1 |= (CountDw - 1) << 6;
1196 
1197   unsigned Offset = Offset0 | (Offset1 << 8);
1198 
1199   Register M0Val = MI.getOperand(2).getReg();
1200   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1201     .addReg(M0Val);
1202 
1203   Register DstReg = MI.getOperand(0).getReg();
1204   Register ValReg = MI.getOperand(3).getReg();
1205   MachineInstrBuilder DS =
1206     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
1207       .addReg(ValReg)
1208       .addImm(Offset)
1209       .cloneMemRefs(MI);
1210 
1211   if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
1212     return false;
1213 
1214   bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
1215   MI.eraseFromParent();
1216   return Ret;
1217 }
1218 
1219 static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1220   switch (IntrID) {
1221   case Intrinsic::amdgcn_ds_gws_init:
1222     return AMDGPU::DS_GWS_INIT;
1223   case Intrinsic::amdgcn_ds_gws_barrier:
1224     return AMDGPU::DS_GWS_BARRIER;
1225   case Intrinsic::amdgcn_ds_gws_sema_v:
1226     return AMDGPU::DS_GWS_SEMA_V;
1227   case Intrinsic::amdgcn_ds_gws_sema_br:
1228     return AMDGPU::DS_GWS_SEMA_BR;
1229   case Intrinsic::amdgcn_ds_gws_sema_p:
1230     return AMDGPU::DS_GWS_SEMA_P;
1231   case Intrinsic::amdgcn_ds_gws_sema_release_all:
1232     return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1233   default:
1234     llvm_unreachable("not a gws intrinsic");
1235   }
1236 }
1237 
1238 bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
1239                                                      Intrinsic::ID IID) const {
1240   if (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1241       !STI.hasGWSSemaReleaseAll())
1242     return false;
1243 
1244   // intrinsic ID, vsrc, offset
1245   const bool HasVSrc = MI.getNumOperands() == 3;
1246   assert(HasVSrc || MI.getNumOperands() == 2);
1247 
1248   Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
1249   const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
1250   if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
1251     return false;
1252 
1253   MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1254   assert(OffsetDef);
1255 
1256   unsigned ImmOffset;
1257 
1258   MachineBasicBlock *MBB = MI.getParent();
1259   const DebugLoc &DL = MI.getDebugLoc();
1260 
1261   MachineInstr *Readfirstlane = nullptr;
1262 
1263   // If we legalized the VGPR input, strip out the readfirstlane to analyze the
1264   // incoming offset, in case there's an add of a constant. We'll have to put it
1265   // back later.
1266   if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
1267     Readfirstlane = OffsetDef;
1268     BaseOffset = OffsetDef->getOperand(1).getReg();
1269     OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1270   }
1271 
1272   if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
1273     // If we have a constant offset, try to use the 0 in m0 as the base.
1274     // TODO: Look into changing the default m0 initialization value. If the
    // default -1 only sets the low 16-bits, we could leave it as-is and add 1 to
1276     // the immediate offset.
1277 
1278     ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
1279     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1280       .addImm(0);
1281   } else {
1282     std::tie(BaseOffset, ImmOffset, OffsetDef)
1283       = AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset);
1284 
1285     if (Readfirstlane) {
1286       // We have the constant offset now, so put the readfirstlane back on the
1287       // variable component.
1288       if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
1289         return false;
1290 
1291       Readfirstlane->getOperand(1).setReg(BaseOffset);
1292       BaseOffset = Readfirstlane->getOperand(0).getReg();
1293     } else {
1294       if (!RBI.constrainGenericRegister(BaseOffset,
1295                                         AMDGPU::SReg_32RegClass, *MRI))
1296         return false;
1297     }
1298 
1299     Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1300     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
1301       .addReg(BaseOffset)
1302       .addImm(16);
1303 
1304     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1305       .addReg(M0Base);
1306   }
1307 
1308   // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
1309   // offset field) % 64. Some versions of the programming guide omit the m0
1310   // part, or claim it's from offset 0.
1311   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));
1312 
1313   if (HasVSrc) {
1314     Register VSrc = MI.getOperand(1).getReg();
1315     MIB.addReg(VSrc);
1316     if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
1317       return false;
1318   }
1319 
1320   MIB.addImm(ImmOffset)
1321      .addImm(-1) // $gds
1322      .cloneMemRefs(MI);
1323 
1324   MI.eraseFromParent();
1325   return true;
1326 }
1327 
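// Select ds_append/ds_consume. The pointer base is copied into m0, and a
// 16-bit immediate offset is folded when it is legal for DS; a pointer in the
// region address space selects the gds form of the instruction.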
1328 bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
1329                                                       bool IsAppend) const {
1330   Register PtrBase = MI.getOperand(2).getReg();
1331   LLT PtrTy = MRI->getType(PtrBase);
1332   bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
1333 
1334   unsigned Offset;
1335   std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
1336 
1337   // TODO: Should this try to look through readfirstlane like GWS?
1338   if (!isDSOffsetLegal(PtrBase, Offset, 16)) {
1339     PtrBase = MI.getOperand(2).getReg();
1340     Offset = 0;
1341   }
1342 
1343   MachineBasicBlock *MBB = MI.getParent();
1344   const DebugLoc &DL = MI.getDebugLoc();
1345   const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
1346 
1347   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1348     .addReg(PtrBase);
1349   if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
1350     return false;
1351 
1352   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
1353     .addImm(Offset)
1354     .addImm(IsGDS ? -1 : 0)
1355     .cloneMemRefs(MI);
1356   MI.eraseFromParent();
1357   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1358 }
1359 
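// Decode the texfailctrl immediate into its TFE and LWE bits, noting in
// IsTexFail whether any texfail handling was requested at all. Returns false
// if unknown bits remain set.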
1360 static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
1361                          bool &IsTexFail) {
1362   if (TexFailCtrl)
1363     IsTexFail = true;
1364 
1365   TFE = (TexFailCtrl & 0x1) ? 1 : 0;
1366   TexFailCtrl &= ~(uint64_t)0x1;
1367   LWE = (TexFailCtrl & 0x2) ? 1 : 0;
1368   TexFailCtrl &= ~(uint64_t)0x2;
1369 
1370   return TexFailCtrl == 0;
1371 }
1372 
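// Decode the cachepolicy immediate into the requested GLC, SLC and DLC bits.
// Returns false if unhandled bits remain set.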
1373 static bool parseCachePolicy(uint64_t Value,
1374                              bool *GLC, bool *SLC, bool *DLC) {
1375   if (GLC) {
1376     *GLC = (Value & 0x1) ? 1 : 0;
1377     Value &= ~(uint64_t)0x1;
1378   }
1379   if (SLC) {
1380     *SLC = (Value & 0x2) ? 1 : 0;
1381     Value &= ~(uint64_t)0x2;
1382   }
1383   if (DLC) {
1384     *DLC = (Value & 0x4) ? 1 : 0;
1385     Value &= ~(uint64_t)0x4;
1386   }
1387 
1388   return Value == 0;
1389 }
1390 
1391 bool AMDGPUInstructionSelector::selectImageIntrinsic(
1392   MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
1393   MachineBasicBlock *MBB = MI.getParent();
1394   const DebugLoc &DL = MI.getDebugLoc();
1395 
1396   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
1397     AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1398 
1399   const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
1400   const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
1401       AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
1402   const AMDGPU::MIMGMIPMappingInfo *MIPMappingInfo =
1403       AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode);
1404   unsigned IntrOpcode = Intr->BaseOpcode;
1405   const bool IsGFX10 = STI.getGeneration() >= AMDGPUSubtarget::GFX10;
1406 
1407   const int VAddrIdx = getImageVAddrIdxBegin(BaseOpcode,
1408                                              MI.getNumExplicitDefs());
1409   int NumVAddr, NumGradients;
1410   std::tie(NumVAddr, NumGradients) = getImageNumVAddr(Intr, BaseOpcode);
1411 
1412   Register VDataIn, VDataOut;
1413   LLT VDataTy;
1414   int NumVDataDwords = -1;
1415   bool IsD16 = false;
1416 
1417   // XXX - Can we just get the second to last argument for ctrl?
1418   unsigned CtrlIdx; // Index of texfailctrl argument
1419   bool Unorm;
1420   if (!BaseOpcode->Sampler) {
1421     Unorm = true;
1422     CtrlIdx = VAddrIdx + NumVAddr + 1;
1423   } else {
1424     Unorm = MI.getOperand(VAddrIdx + NumVAddr + 2).getImm() != 0;
1425     CtrlIdx = VAddrIdx + NumVAddr + 3;
1426   }
1427 
1428   bool TFE;
1429   bool LWE;
1430   bool IsTexFail = false;
1431   if (!parseTexFail(MI.getOperand(CtrlIdx).getImm(), TFE, LWE, IsTexFail))
1432     return false;
1433 
1434   const int Flags = MI.getOperand(CtrlIdx + 2).getImm();
1435   const bool IsA16 = (Flags & 1) != 0;
1436   const bool IsG16 = (Flags & 2) != 0;
1437 
1438   // A16 implies 16 bit gradients
1439   if (IsA16 && !IsG16)
1440     return false;
1441 
1442   unsigned DMask = 0;
1443   unsigned DMaskLanes = 0;
1444 
1445   if (BaseOpcode->Atomic) {
1446     VDataOut = MI.getOperand(0).getReg();
1447     VDataIn = MI.getOperand(2).getReg();
1448     LLT Ty = MRI->getType(VDataIn);
1449 
1450     // Be careful to allow atomic swap on 16-bit element vectors.
1451     const bool Is64Bit = BaseOpcode->AtomicX2 ?
1452       Ty.getSizeInBits() == 128 :
1453       Ty.getSizeInBits() == 64;
1454 
1455     if (BaseOpcode->AtomicX2) {
1456       assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
1457 
1458       DMask = Is64Bit ? 0xf : 0x3;
1459       NumVDataDwords = Is64Bit ? 4 : 2;
1460     } else {
1461       DMask = Is64Bit ? 0x3 : 0x1;
1462       NumVDataDwords = Is64Bit ? 2 : 1;
1463     }
1464   } else {
1465     const int DMaskIdx = 2; // Input/output + intrinsic ID.
1466 
1467     DMask = MI.getOperand(DMaskIdx).getImm();
1468     DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);
1469 
1470     if (BaseOpcode->Store) {
1471       VDataIn = MI.getOperand(1).getReg();
1472       VDataTy = MRI->getType(VDataIn);
1473       NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
1474     } else {
1475       VDataOut = MI.getOperand(0).getReg();
1476       VDataTy = MRI->getType(VDataOut);
1477       NumVDataDwords = DMaskLanes;
1478 
1479       // One memoperand is mandatory, except for getresinfo.
1480       // FIXME: Check this in verifier.
1481       if (!MI.memoperands_empty()) {
1482         const MachineMemOperand *MMO = *MI.memoperands_begin();
1483 
        // Infer d16 from the memory size, as the register type will be
        // mangled by unpacked subtargets, or by TFE.
1486         IsD16 = ((8 * MMO->getSize()) / DMaskLanes) < 32;
1487 
1488         if (IsD16 && !STI.hasUnpackedD16VMem())
1489           NumVDataDwords = (DMaskLanes + 1) / 2;
1490       }
1491     }
1492   }
1493 
1494   // Optimize _L to _LZ when _L is zero
1495   if (LZMappingInfo) {
1496     // The legalizer replaced the register with an immediate 0 if we need to
1497     // change the opcode.
1498     const MachineOperand &Lod = MI.getOperand(VAddrIdx + NumVAddr - 1);
1499     if (Lod.isImm()) {
1500       assert(Lod.getImm() == 0);
1501       IntrOpcode = LZMappingInfo->LZ;  // set new opcode to _lz variant of _l
1502     }
1503   }
1504 
1505   // Optimize _mip away, when 'lod' is zero
1506   if (MIPMappingInfo) {
1507     const MachineOperand &Lod = MI.getOperand(VAddrIdx + NumVAddr - 1);
1508     if (Lod.isImm()) {
1509       assert(Lod.getImm() == 0);
1510       IntrOpcode = MIPMappingInfo->NONMIP;  // set new opcode to variant without _mip
1511     }
1512   }
1513 
1514   // Set G16 opcode
1515   if (IsG16 && !IsA16) {
1516     const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
1517         AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
1518     assert(G16MappingInfo);
1519     IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
1520   }
1521 
1522   // TODO: Check this in verifier.
1523   assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
1524 
1525   bool GLC = false;
1526   bool SLC = false;
1527   bool DLC = false;
1528   if (BaseOpcode->Atomic) {
1529     GLC = true; // TODO no-return optimization
1530     if (!parseCachePolicy(MI.getOperand(CtrlIdx + 1).getImm(), nullptr, &SLC,
1531                           IsGFX10 ? &DLC : nullptr))
1532       return false;
1533   } else {
1534     if (!parseCachePolicy(MI.getOperand(CtrlIdx + 1).getImm(), &GLC, &SLC,
1535                           IsGFX10 ? &DLC : nullptr))
1536       return false;
1537   }
1538 
1539   int NumVAddrRegs = 0;
1540   int NumVAddrDwords = 0;
1541   for (int I = 0; I < NumVAddr; ++I) {
1542     // Skip the $noregs and 0s inserted during legalization.
1543     MachineOperand &AddrOp = MI.getOperand(VAddrIdx + I);
1544     if (!AddrOp.isReg())
1545       continue; // XXX - Break?
1546 
1547     Register Addr = AddrOp.getReg();
1548     if (!Addr)
1549       break;
1550 
1551     ++NumVAddrRegs;
1552     NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
1553   }
1554 
  // The legalizer preprocessed the intrinsic arguments. If we aren't using
  // NSA, these should have been packed into a single value in the first
  // address register.
1558   const bool UseNSA = NumVAddrRegs != 1 && NumVAddrDwords == NumVAddrRegs;
1559   if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
1560     LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
1561     return false;
1562   }
1563 
1564   if (IsTexFail)
1565     ++NumVDataDwords;
1566 
1567   int Opcode = -1;
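  // Pick the concrete MIMG opcode for the target encoding (GFX10 NSA or
  // default, GFX8, or GFX6) based on the data and address dword counts.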
1568   if (IsGFX10) {
1569     Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
1570                                    UseNSA ? AMDGPU::MIMGEncGfx10NSA
1571                                           : AMDGPU::MIMGEncGfx10Default,
1572                                    NumVDataDwords, NumVAddrDwords);
1573   } else {
1574     if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
1575       Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
1576                                      NumVDataDwords, NumVAddrDwords);
1577     if (Opcode == -1)
1578       Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
1579                                      NumVDataDwords, NumVAddrDwords);
1580   }
1581   assert(Opcode != -1);
1582 
1583   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
1584     .cloneMemRefs(MI);
1585 
1586   if (VDataOut) {
1587     if (BaseOpcode->AtomicX2) {
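      // The X2 (cmpswap) atomic defines a register twice as wide as the
      // result; the original value comes back in the low half and is copied
      // out below through a subregister copy.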
1588       const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
1589 
1590       Register TmpReg = MRI->createVirtualRegister(
1591         Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
1592       unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
1593 
1594       MIB.addDef(TmpReg);
1595       BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
1596         .addReg(TmpReg, RegState::Kill, SubReg);
1597 
1598     } else {
1599       MIB.addDef(VDataOut); // vdata output
1600     }
1601   }
1602 
1603   if (VDataIn)
1604     MIB.addReg(VDataIn); // vdata input
1605 
1606   for (int i = 0; i != NumVAddrRegs; ++i) {
1607     MachineOperand &SrcOp = MI.getOperand(VAddrIdx + i);
1608     if (SrcOp.isReg()) {
1609       assert(SrcOp.getReg() != 0);
1610       MIB.addReg(SrcOp.getReg());
1611     }
1612   }
1613 
1614   MIB.addReg(MI.getOperand(VAddrIdx + NumVAddr).getReg()); // rsrc
1615   if (BaseOpcode->Sampler)
1616     MIB.addReg(MI.getOperand(VAddrIdx + NumVAddr + 1).getReg()); // sampler
1617 
1618   MIB.addImm(DMask); // dmask
1619 
1620   if (IsGFX10)
1621     MIB.addImm(DimInfo->Encoding);
1622   MIB.addImm(Unorm);
1623   if (IsGFX10)
1624     MIB.addImm(DLC);
1625 
1626   MIB.addImm(GLC);
1627   MIB.addImm(SLC);
  MIB.addImm(IsA16 && // a16 or r128
1629              STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
1630   if (IsGFX10)
1631     MIB.addImm(IsA16 ? -1 : 0);
1632 
1633   MIB.addImm(TFE); // tfe
1634   MIB.addImm(LWE); // lwe
1635   if (!IsGFX10)
1636     MIB.addImm(DimInfo->DA ? -1 : 0);
1637   if (BaseOpcode->HasD16)
1638     MIB.addImm(IsD16 ? -1 : 0);
1639 
1640   MI.eraseFromParent();
1641   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1642 }
1643 
1644 bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
1645     MachineInstr &I) const {
1646   unsigned IntrinsicID = I.getIntrinsicID();
1647   switch (IntrinsicID) {
1648   case Intrinsic::amdgcn_end_cf:
1649     return selectEndCfIntrinsic(I);
1650   case Intrinsic::amdgcn_ds_ordered_add:
1651   case Intrinsic::amdgcn_ds_ordered_swap:
1652     return selectDSOrderedIntrinsic(I, IntrinsicID);
1653   case Intrinsic::amdgcn_ds_gws_init:
1654   case Intrinsic::amdgcn_ds_gws_barrier:
1655   case Intrinsic::amdgcn_ds_gws_sema_v:
1656   case Intrinsic::amdgcn_ds_gws_sema_br:
1657   case Intrinsic::amdgcn_ds_gws_sema_p:
1658   case Intrinsic::amdgcn_ds_gws_sema_release_all:
1659     return selectDSGWSIntrinsic(I, IntrinsicID);
1660   case Intrinsic::amdgcn_ds_append:
1661     return selectDSAppendConsume(I, true);
1662   case Intrinsic::amdgcn_ds_consume:
1663     return selectDSAppendConsume(I, false);
1664   default: {
1665     return selectImpl(I, *CoverageInfo);
1666   }
1667   }
1668 }
1669 
1670 bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
1671   if (selectImpl(I, *CoverageInfo))
1672     return true;
1673 
1674   MachineBasicBlock *BB = I.getParent();
1675   const DebugLoc &DL = I.getDebugLoc();
1676 
1677   Register DstReg = I.getOperand(0).getReg();
1678   unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
1679   assert(Size <= 32 || Size == 64);
1680   const MachineOperand &CCOp = I.getOperand(1);
1681   Register CCReg = CCOp.getReg();
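  // A scalar (non-VCC) condition is selected through SCC with S_CSELECT; a
  // VCC condition falls through to V_CNDMASK_B32 below.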
1682   if (!isVCC(CCReg, *MRI)) {
1683     unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
1684                                          AMDGPU::S_CSELECT_B32;
    MachineInstr *CopySCC =
        BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
            .addReg(CCReg);

    // The generic constrainSelectedInstRegOperands doesn't work for the scc
    // register bank, because it does not cover the register class we use to
    // represent it. Manually set the register class here instead.
    if (!MRI->getRegClassOrNull(CCReg))
      MRI->setRegClass(CCReg,
                       TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
1693     MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
1694             .add(I.getOperand(2))
1695             .add(I.getOperand(3));
1696 
1697     bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI) |
1698                constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
1699     I.eraseFromParent();
1700     return Ret;
1701   }
1702 
1703   // Wide VGPR select should have been split in RegBankSelect.
1704   if (Size > 32)
1705     return false;
1706 
1707   MachineInstr *Select =
1708       BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1709               .addImm(0)
1710               .add(I.getOperand(3))
1711               .addImm(0)
1712               .add(I.getOperand(2))
1713               .add(I.getOperand(1));
1714 
1715   bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
1716   I.eraseFromParent();
1717   return Ret;
1718 }
1719 
1720 bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const {
1721   initM0(I);
1722   return selectImpl(I, *CoverageInfo);
1723 }
1724 
1725 static int sizeToSubRegIndex(unsigned Size) {
1726   switch (Size) {
1727   case 32:
1728     return AMDGPU::sub0;
1729   case 64:
1730     return AMDGPU::sub0_sub1;
1731   case 96:
1732     return AMDGPU::sub0_sub1_sub2;
1733   case 128:
1734     return AMDGPU::sub0_sub1_sub2_sub3;
1735   case 256:
1736     return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
1737   default:
1738     if (Size < 32)
1739       return AMDGPU::sub0;
1740     if (Size > 256)
1741       return -1;
1742     return sizeToSubRegIndex(PowerOf2Ceil(Size));
1743   }
1744 }
1745 
1746 bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
1747   Register DstReg = I.getOperand(0).getReg();
1748   Register SrcReg = I.getOperand(1).getReg();
1749   const LLT DstTy = MRI->getType(DstReg);
1750   const LLT SrcTy = MRI->getType(SrcReg);
1751   const LLT S1 = LLT::scalar(1);
1752 
1753   const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
1754   const RegisterBank *DstRB;
1755   if (DstTy == S1) {
1756     // This is a special case. We don't treat s1 for legalization artifacts as
1757     // vcc booleans.
1758     DstRB = SrcRB;
1759   } else {
1760     DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1761     if (SrcRB != DstRB)
1762       return false;
1763   }
1764 
1765   const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
1766 
1767   unsigned DstSize = DstTy.getSizeInBits();
1768   unsigned SrcSize = SrcTy.getSizeInBits();
1769 
1770   const TargetRegisterClass *SrcRC
1771     = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, *MRI);
1772   const TargetRegisterClass *DstRC
1773     = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, *MRI);
1774   if (!SrcRC || !DstRC)
1775     return false;
1776 
1777   if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
1778       !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
1779     LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
1780     return false;
1781   }
1782 
1783   if (DstTy == LLT::vector(2, 16) && SrcTy == LLT::vector(2, 32)) {
1784     MachineBasicBlock *MBB = I.getParent();
1785     const DebugLoc &DL = I.getDebugLoc();
1786 
1787     Register LoReg = MRI->createVirtualRegister(DstRC);
1788     Register HiReg = MRI->createVirtualRegister(DstRC);
1789     BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
1790       .addReg(SrcReg, 0, AMDGPU::sub0);
1791     BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
1792       .addReg(SrcReg, 0, AMDGPU::sub1);
1793 
1794     if (IsVALU && STI.hasSDWA()) {
      // Write the low 16 bits of the high element into the high 16 bits of
      // the low element.
1797       MachineInstr *MovSDWA =
1798         BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
1799         .addImm(0)                             // $src0_modifiers
1800         .addReg(HiReg)                         // $src0
1801         .addImm(0)                             // $clamp
1802         .addImm(AMDGPU::SDWA::WORD_1)          // $dst_sel
1803         .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
1804         .addImm(AMDGPU::SDWA::WORD_0)          // $src0_sel
1805         .addReg(LoReg, RegState::Implicit);
1806       MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
1807     } else {
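      // No SDWA available: shift the high half into place, mask the low 16
      // bits, and OR the two halves together.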
1808       Register TmpReg0 = MRI->createVirtualRegister(DstRC);
1809       Register TmpReg1 = MRI->createVirtualRegister(DstRC);
1810       Register ImmReg = MRI->createVirtualRegister(DstRC);
1811       if (IsVALU) {
1812         BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
1813           .addImm(16)
1814           .addReg(HiReg);
1815       } else {
1816         BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
1817           .addReg(HiReg)
1818           .addImm(16);
1819       }
1820 
1821       unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
1822       unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
1823       unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
1824 
1825       BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
1826         .addImm(0xffff);
1827       BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
1828         .addReg(LoReg)
1829         .addReg(ImmReg);
1830       BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
1831         .addReg(TmpReg0)
1832         .addReg(TmpReg1);
1833     }
1834 
1835     I.eraseFromParent();
1836     return true;
1837   }
1838 
1839   if (!DstTy.isScalar())
1840     return false;
1841 
1842   if (SrcSize > 32) {
1843     int SubRegIdx = sizeToSubRegIndex(DstSize);
1844     if (SubRegIdx == -1)
1845       return false;
1846 
1847     // Deal with weird cases where the class only partially supports the subreg
1848     // index.
1849     const TargetRegisterClass *SrcWithSubRC
1850       = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
1851     if (!SrcWithSubRC)
1852       return false;
1853 
1854     if (SrcWithSubRC != SrcRC) {
1855       if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
1856         return false;
1857     }
1858 
1859     I.getOperand(1).setSubReg(SubRegIdx);
1860   }
1861 
1862   I.setDesc(TII.get(TargetOpcode::COPY));
1863   return true;
1864 }
1865 
1866 /// \returns true if a bitmask for \p Size bits will be an inline immediate.
1867 static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
1868   Mask = maskTrailingOnes<unsigned>(Size);
1869   int SignedMask = static_cast<int>(Mask);
1870   return SignedMask >= -16 && SignedMask <= 64;
1871 }
1872 
1873 // Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
1874 const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
1875   Register Reg, const MachineRegisterInfo &MRI,
1876   const TargetRegisterInfo &TRI) const {
1877   const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
1878   if (auto *RB = RegClassOrBank.dyn_cast<const RegisterBank *>())
1879     return RB;
1880 
1881   // Ignore the type, since we don't use vcc in artifacts.
1882   if (auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>())
1883     return &RBI.getRegBankFromRegClass(*RC, LLT());
1884   return nullptr;
1885 }
1886 
1887 bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
1888   bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
1889   bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
1890   const DebugLoc &DL = I.getDebugLoc();
1891   MachineBasicBlock &MBB = *I.getParent();
1892   const Register DstReg = I.getOperand(0).getReg();
1893   const Register SrcReg = I.getOperand(1).getReg();
1894 
1895   const LLT DstTy = MRI->getType(DstReg);
1896   const LLT SrcTy = MRI->getType(SrcReg);
1897   const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
1898     I.getOperand(2).getImm() : SrcTy.getSizeInBits();
1899   const unsigned DstSize = DstTy.getSizeInBits();
1900   if (!DstTy.isScalar())
1901     return false;
1902 
1903   // Artifact casts should never use vcc.
1904   const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
1905 
1906   // FIXME: This should probably be illegal and split earlier.
1907   if (I.getOpcode() == AMDGPU::G_ANYEXT) {
1908     if (DstSize <= 32)
1909       return selectCOPY(I);
1910 
1911     const TargetRegisterClass *SrcRC =
1912         TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank, *MRI);
1913     const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1914     const TargetRegisterClass *DstRC =
1915         TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
1916 
1917     Register UndefReg = MRI->createVirtualRegister(SrcRC);
1918     BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
1919     BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
1920       .addReg(SrcReg)
1921       .addImm(AMDGPU::sub0)
1922       .addReg(UndefReg)
1923       .addImm(AMDGPU::sub1);
1924     I.eraseFromParent();
1925 
1926     return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
1927            RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
1928   }
1929 
1930   if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
1931     // 64-bit should have been split up in RegBankSelect
1932 
1933     // Try to use an and with a mask if it will save code size.
1934     unsigned Mask;
1935     if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
1936       MachineInstr *ExtI =
1937       BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
1938         .addImm(Mask)
1939         .addReg(SrcReg);
1940       I.eraseFromParent();
1941       return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
1942     }
1943 
1944     const unsigned BFE = Signed ? AMDGPU::V_BFE_I32 : AMDGPU::V_BFE_U32;
1945     MachineInstr *ExtI =
1946       BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
1947       .addReg(SrcReg)
1948       .addImm(0) // Offset
1949       .addImm(SrcSize); // Width
1950     I.eraseFromParent();
1951     return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
1952   }
1953 
1954   if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
1955     const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
1956       AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
1957     if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
1958       return false;
1959 
1960     if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
1961       const unsigned SextOpc = SrcSize == 8 ?
1962         AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
1963       BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
1964         .addReg(SrcReg);
1965       I.eraseFromParent();
1966       return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
1967     }
1968 
1969     const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
1970     const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
1971 
    // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16] = width.
1973     if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
1974       // We need a 64-bit register source, but the high bits don't matter.
1975       Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
1976       Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1977       unsigned SubReg = InReg ? AMDGPU::sub0 : 0;
1978 
1979       BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
1980       BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
1981         .addReg(SrcReg, 0, SubReg)
1982         .addImm(AMDGPU::sub0)
1983         .addReg(UndefReg)
1984         .addImm(AMDGPU::sub1);
1985 
1986       BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
1987         .addReg(ExtReg)
1988         .addImm(SrcSize << 16);
1989 
1990       I.eraseFromParent();
1991       return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
1992     }
1993 
1994     unsigned Mask;
1995     if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
1996       BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
1997         .addReg(SrcReg)
1998         .addImm(Mask);
1999     } else {
2000       BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
2001         .addReg(SrcReg)
2002         .addImm(SrcSize << 16);
2003     }
2004 
2005     I.eraseFromParent();
2006     return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2007   }
2008 
2009   return false;
2010 }
2011 
2012 bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
2013   MachineBasicBlock *BB = I.getParent();
2014   MachineOperand &ImmOp = I.getOperand(1);
2015 
2016   // The AMDGPU backend only supports Imm operands and not CImm or FPImm.
2017   if (ImmOp.isFPImm()) {
2018     const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt();
2019     ImmOp.ChangeToImmediate(Imm.getZExtValue());
2020   } else if (ImmOp.isCImm()) {
2021     ImmOp.ChangeToImmediate(ImmOp.getCImm()->getSExtValue());
2022   }
2023 
2024   Register DstReg = I.getOperand(0).getReg();
2025   unsigned Size;
2026   bool IsSgpr;
2027   const RegisterBank *RB = MRI->getRegBankOrNull(I.getOperand(0).getReg());
2028   if (RB) {
2029     IsSgpr = RB->getID() == AMDGPU::SGPRRegBankID;
2030     Size = MRI->getType(DstReg).getSizeInBits();
2031   } else {
2032     const TargetRegisterClass *RC = TRI.getRegClassForReg(*MRI, DstReg);
2033     IsSgpr = TRI.isSGPRClass(RC);
2034     Size = TRI.getRegSizeInBits(*RC);
2035   }
2036 
2037   if (Size != 32 && Size != 64)
2038     return false;
2039 
2040   unsigned Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
2041   if (Size == 32) {
2042     I.setDesc(TII.get(Opcode));
2043     I.addImplicitDefUseOperands(*MF);
2044     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2045   }
2046 
2047   const DebugLoc &DL = I.getDebugLoc();
2048 
2049   APInt Imm(Size, I.getOperand(1).getImm());
2050 
2051   MachineInstr *ResInst;
2052   if (IsSgpr && TII.isInlineConstant(Imm)) {
2053     ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
2054       .addImm(I.getOperand(1).getImm());
2055   } else {
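    // Materialize the 64-bit constant as two 32-bit halves and recombine
    // them with a REG_SEQUENCE.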
2056     const TargetRegisterClass *RC = IsSgpr ?
2057       &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass;
2058     Register LoReg = MRI->createVirtualRegister(RC);
2059     Register HiReg = MRI->createVirtualRegister(RC);
2060 
2061     BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg)
2062       .addImm(Imm.trunc(32).getZExtValue());
2063 
2064     BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg)
2065       .addImm(Imm.ashr(32).getZExtValue());
2066 
2067     ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2068       .addReg(LoReg)
2069       .addImm(AMDGPU::sub0)
2070       .addReg(HiReg)
2071       .addImm(AMDGPU::sub1);
2072   }
2073 
2074   // We can't call constrainSelectedInstRegOperands here, because it doesn't
  // work for target-independent opcodes.
2076   I.eraseFromParent();
2077   const TargetRegisterClass *DstRC =
2078     TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI);
2079   if (!DstRC)
2080     return true;
2081   return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI);
2082 }
2083 
2084 bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2085   // Only manually handle the f64 SGPR case.
2086   //
2087   // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2088   // the bit ops theoretically have a second result due to the implicit def of
2089   // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2090   // that is easy by disabling the check. The result works, but uses a
2091   // nonsensical sreg32orlds_and_sreg_1 regclass.
2092   //
  // The DAG emitter is more problematic, and incorrectly adds both results
  // of the S_XOR_B32 to the variadic REG_SEQUENCE operands.
2095 
2096   Register Dst = MI.getOperand(0).getReg();
2097   const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2098   if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2099       MRI->getType(Dst) != LLT::scalar(64))
2100     return false;
2101 
2102   Register Src = MI.getOperand(1).getReg();
2103   MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
2104   if (Fabs)
2105     Src = Fabs->getOperand(1).getReg();
2106 
2107   if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2108       !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2109     return false;
2110 
2111   MachineBasicBlock *BB = MI.getParent();
2112   const DebugLoc &DL = MI.getDebugLoc();
2113   Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2114   Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2115   Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2116   Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2117 
2118   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2119     .addReg(Src, 0, AMDGPU::sub0);
2120   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2121     .addReg(Src, 0, AMDGPU::sub1);
2122   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2123     .addImm(0x80000000);
2124 
2125   // Set or toggle sign bit.
2126   unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2127   BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
2128     .addReg(HiReg)
2129     .addReg(ConstReg);
2130   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2131     .addReg(LoReg)
2132     .addImm(AMDGPU::sub0)
2133     .addReg(OpReg)
2134     .addImm(AMDGPU::sub1);
2135   MI.eraseFromParent();
2136   return true;
2137 }
2138 
2139 // FIXME: This is a workaround for the same tablegen problems as G_FNEG
2140 bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2141   Register Dst = MI.getOperand(0).getReg();
2142   const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2143   if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2144       MRI->getType(Dst) != LLT::scalar(64))
2145     return false;
2146 
2147   Register Src = MI.getOperand(1).getReg();
2148   MachineBasicBlock *BB = MI.getParent();
2149   const DebugLoc &DL = MI.getDebugLoc();
2150   Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2151   Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2152   Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2153   Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2154 
2155   if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2156       !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2157     return false;
2158 
2159   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2160     .addReg(Src, 0, AMDGPU::sub0);
2161   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2162     .addReg(Src, 0, AMDGPU::sub1);
2163   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2164     .addImm(0x7fffffff);
2165 
2166   // Clear sign bit.
  // TODO: Should this use S_BITSET0_*?
2168   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
2169     .addReg(HiReg)
2170     .addReg(ConstReg);
2171   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2172     .addReg(LoReg)
2173     .addImm(AMDGPU::sub0)
2174     .addReg(OpReg)
2175     .addImm(AMDGPU::sub1);
2176 
2177   MI.eraseFromParent();
2178   return true;
2179 }
2180 
2181 static bool isConstant(const MachineInstr &MI) {
2182   return MI.getOpcode() == TargetOpcode::G_CONSTANT;
2183 }
2184 
2185 void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
2186     const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
2187 
2188   const MachineInstr *PtrMI = MRI.getUniqueVRegDef(Load.getOperand(1).getReg());
2189 
2190   assert(PtrMI);
2191 
2192   if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
2193     return;
2194 
2195   GEPInfo GEPInfo(*PtrMI);
2196 
2197   for (unsigned i = 1; i != 3; ++i) {
2198     const MachineOperand &GEPOp = PtrMI->getOperand(i);
2199     const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
2200     assert(OpDef);
2201     if (i == 2 && isConstant(*OpDef)) {
2202       // TODO: Could handle constant base + variable offset, but a combine
2203       // probably should have commuted it.
2204       assert(GEPInfo.Imm == 0);
2205       GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
2206       continue;
2207     }
2208     const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
2209     if (OpBank->getID() == AMDGPU::SGPRRegBankID)
2210       GEPInfo.SgprParts.push_back(GEPOp.getReg());
2211     else
2212       GEPInfo.VgprParts.push_back(GEPOp.getReg());
2213   }
2214 
2215   AddrInfo.push_back(GEPInfo);
2216   getAddrModeInfo(*PtrMI, MRI, AddrInfo);
2217 }
2218 
2219 bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
2220   if (!MI.hasOneMemOperand())
2221     return false;
2222 
2223   const MachineMemOperand *MMO = *MI.memoperands_begin();
2224   const Value *Ptr = MMO->getValue();
2225 
2226   // UndefValue means this is a load of a kernel input.  These are uniform.
2227   // Sometimes LDS instructions have constant pointers.
2228   // If Ptr is null, then that means this mem operand contains a
2229   // PseudoSourceValue like GOT.
2230   if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
2231       isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
2232     return true;
2233 
2234   if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
2235     return true;
2236 
2237   const Instruction *I = dyn_cast<Instruction>(Ptr);
2238   return I && I->getMetadata("amdgpu.uniform");
2239 }
2240 
2241 bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
2242   for (const GEPInfo &GEPInfo : AddrInfo) {
2243     if (!GEPInfo.VgprParts.empty())
2244       return true;
2245   }
2246   return false;
2247 }
2248 
2249 void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
2250   MachineBasicBlock *BB = I.getParent();
2251 
2252   const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
2253   unsigned AS = PtrTy.getAddressSpace();
2254   if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
2255       STI.ldsRequiresM0Init()) {
    // If DS instructions require M0 initialization, insert it before selecting.
2257     BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2258       .addImm(-1);
2259   }
2260 }
2261 
2262 bool AMDGPUInstructionSelector::selectG_LOAD_ATOMICRMW(MachineInstr &I) const {
2263   initM0(I);
2264   return selectImpl(I, *CoverageInfo);
2265 }
2266 
2267 // TODO: No rtn optimization.
2268 bool AMDGPUInstructionSelector::selectG_AMDGPU_ATOMIC_CMPXCHG(
2269   MachineInstr &MI) const {
2270   Register PtrReg = MI.getOperand(1).getReg();
2271   const LLT PtrTy = MRI->getType(PtrReg);
2272   if (PtrTy.getAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
2273       STI.useFlatForGlobal())
2274     return selectImpl(MI, *CoverageInfo);
2275 
2276   Register DstReg = MI.getOperand(0).getReg();
2277   const LLT Ty = MRI->getType(DstReg);
2278   const bool Is64 = Ty.getSizeInBits() == 64;
2279   const unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
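  // The returned original value lands in the low half of the wide temporary
  // register and is extracted below with a subregister copy.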
2280   Register TmpReg = MRI->createVirtualRegister(
2281     Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
2282 
2283   const DebugLoc &DL = MI.getDebugLoc();
2284   MachineBasicBlock *BB = MI.getParent();
2285 
2286   Register VAddr, RSrcReg, SOffset;
2287   int64_t Offset = 0;
2288 
2289   unsigned Opcode;
2290   if (selectMUBUFOffsetImpl(MI.getOperand(1), RSrcReg, SOffset, Offset)) {
2291     Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN :
2292                              AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN;
2293   } else if (selectMUBUFAddr64Impl(MI.getOperand(1), VAddr,
2294                                    RSrcReg, SOffset, Offset)) {
2295     Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN :
2296                     AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN;
2297   } else
2298     return selectImpl(MI, *CoverageInfo);
2299 
2300   auto MIB = BuildMI(*BB, &MI, DL, TII.get(Opcode), TmpReg)
2301     .addReg(MI.getOperand(2).getReg());
2302 
2303   if (VAddr)
2304     MIB.addReg(VAddr);
2305 
2306   MIB.addReg(RSrcReg);
2307   if (SOffset)
2308     MIB.addReg(SOffset);
2309   else
2310     MIB.addImm(0);
2311 
2312   MIB.addImm(Offset);
2313   MIB.addImm(0); // slc
2314   MIB.cloneMemRefs(MI);
2315 
2316   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), DstReg)
2317     .addReg(TmpReg, RegState::Kill, SubReg);
2318 
2319   MI.eraseFromParent();
2320 
2321   MRI->setRegClass(
2322     DstReg, Is64 ? &AMDGPU::VReg_64RegClass : &AMDGPU::VGPR_32RegClass);
2323   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2324 }
2325 
2326 bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
2327   MachineBasicBlock *BB = I.getParent();
2328   MachineOperand &CondOp = I.getOperand(0);
2329   Register CondReg = CondOp.getReg();
2330   const DebugLoc &DL = I.getDebugLoc();
2331 
2332   unsigned BrOpcode;
2333   Register CondPhysReg;
2334   const TargetRegisterClass *ConstrainRC;
2335 
2336   // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
2337   // whether the branch is uniform when selecting the instruction. In
2338   // GlobalISel, we should push that decision into RegBankSelect. Assume for now
2339   // RegBankSelect knows what it's doing if the branch condition is scc, even
2340   // though it currently does not.
2341   if (!isVCC(CondReg, *MRI)) {
2342     if (MRI->getType(CondReg) != LLT::scalar(32))
2343       return false;
2344 
2345     CondPhysReg = AMDGPU::SCC;
2346     BrOpcode = AMDGPU::S_CBRANCH_SCC1;
2347     ConstrainRC = &AMDGPU::SReg_32RegClass;
2348   } else {
2349     // FIXME: Do we have to insert an and with exec here, like in SelectionDAG?
    // Based on the register bank, we sort of know that a VCC producer ands
    // inactive lanes with 0. What if there was a logical operation with vcc
2352     // producers in different blocks/with different exec masks?
2353     // FIXME: Should scc->vcc copies and with exec?
2354     CondPhysReg = TRI.getVCC();
2355     BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
2356     ConstrainRC = TRI.getBoolRC();
2357   }
2358 
2359   if (!MRI->getRegClassOrNull(CondReg))
2360     MRI->setRegClass(CondReg, ConstrainRC);
2361 
2362   BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
2363     .addReg(CondReg);
2364   BuildMI(*BB, &I, DL, TII.get(BrOpcode))
2365     .addMBB(I.getOperand(1).getMBB());
2366 
2367   I.eraseFromParent();
2368   return true;
2369 }
2370 
2371 bool AMDGPUInstructionSelector::selectG_FRAME_INDEX_GLOBAL_VALUE(
2372   MachineInstr &I) const {
2373   Register DstReg = I.getOperand(0).getReg();
2374   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2375   const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2376   I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
2377   if (IsVGPR)
2378     I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
2379 
2380   return RBI.constrainGenericRegister(
2381     DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
2382 }
2383 
2384 bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
2385   Register DstReg = I.getOperand(0).getReg();
2386   Register SrcReg = I.getOperand(1).getReg();
2387   Register MaskReg = I.getOperand(2).getReg();
2388   LLT Ty = MRI->getType(DstReg);
2389   LLT MaskTy = MRI->getType(MaskReg);
2390 
2391   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2392   const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2393   const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
2394   const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
  if (DstRB != SrcRB) // Should only happen for hand-written MIR.
2396     return false;
2397 
2398   unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2399   const TargetRegisterClass &RegRC
2400     = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
2401 
2402   const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB,
2403                                                                   *MRI);
2404   const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB,
2405                                                                   *MRI);
2406   const TargetRegisterClass *MaskRC =
2407       TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB, *MRI);
2408 
2409   if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
2410       !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2411       !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
2412     return false;
2413 
2414   MachineBasicBlock *BB = I.getParent();
2415   const DebugLoc &DL = I.getDebugLoc();
2416   if (Ty.getSizeInBits() == 32) {
2417     assert(MaskTy.getSizeInBits() == 32 &&
2418            "ptrmask should have been narrowed during legalize");
2419 
2420     BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
2421       .addReg(SrcReg)
2422       .addReg(MaskReg);
2423     I.eraseFromParent();
2424     return true;
2425   }
2426 
2427   Register HiReg = MRI->createVirtualRegister(&RegRC);
2428   Register LoReg = MRI->createVirtualRegister(&RegRC);
2429 
2430   // Extract the subregisters from the source pointer.
2431   BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
2432     .addReg(SrcReg, 0, AMDGPU::sub0);
2433   BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
2434     .addReg(SrcReg, 0, AMDGPU::sub1);
2435 
2436   Register MaskedLo, MaskedHi;
2437 
2438   // Try to avoid emitting a bit operation when we only need to touch half of
2439   // the 64-bit pointer.
2440   APInt MaskOnes = KnownBits->getKnownOnes(MaskReg).zextOrSelf(64);
2441 
2442   const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
2443   const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
2444   if ((MaskOnes & MaskLo32) == MaskLo32) {
2445     // If all the bits in the low half are 1, we only need a copy for it.
2446     MaskedLo = LoReg;
2447   } else {
2448     // Extract the mask subregister and apply the and.
2449     Register MaskLo = MRI->createVirtualRegister(&RegRC);
2450     MaskedLo = MRI->createVirtualRegister(&RegRC);
2451 
2452     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
2453       .addReg(MaskReg, 0, AMDGPU::sub0);
2454     BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
2455       .addReg(LoReg)
2456       .addReg(MaskLo);
2457   }
2458 
2459   if ((MaskOnes & MaskHi32) == MaskHi32) {
2460     // If all the bits in the high half are 1, we only need a copy for it.
2461     MaskedHi = HiReg;
2462   } else {
2463     Register MaskHi = MRI->createVirtualRegister(&RegRC);
2464     MaskedHi = MRI->createVirtualRegister(&RegRC);
2465 
2466     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
2467       .addReg(MaskReg, 0, AMDGPU::sub1);
2468     BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
2469       .addReg(HiReg)
2470       .addReg(MaskHi);
2471   }
2472 
2473   BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2474     .addReg(MaskedLo)
2475     .addImm(AMDGPU::sub0)
2476     .addReg(MaskedHi)
2477     .addImm(AMDGPU::sub1);
2478   I.eraseFromParent();
2479   return true;
2480 }
2481 
2482 /// Return the register to use for the index value, and the subregister to use
2483 /// for the indirectly accessed register.
2484 static std::pair<Register, unsigned>
2485 computeIndirectRegIndex(MachineRegisterInfo &MRI,
2486                         const SIRegisterInfo &TRI,
2487                         const TargetRegisterClass *SuperRC,
2488                         Register IdxReg,
2489                         unsigned EltSize) {
2490   Register IdxBaseReg;
2491   int Offset;
2492   MachineInstr *Unused;
2493 
2494   std::tie(IdxBaseReg, Offset, Unused)
2495     = AMDGPU::getBaseWithConstantOffset(MRI, IdxReg);
2496   if (IdxBaseReg == AMDGPU::NoRegister) {
2497     // This will happen if the index is a known constant. This should ordinarily
2498     // be legalized out, but handle it as a register just in case.
2499     assert(Offset == 0);
2500     IdxBaseReg = IdxReg;
2501   }
2502 
2503   ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
2504 
2505   // Skip out of bounds offsets, or else we would end up using an undefined
2506   // register.
2507   if (static_cast<unsigned>(Offset) >= SubRegs.size())
2508     return std::make_pair(IdxReg, SubRegs[0]);
2509   return std::make_pair(IdxBaseReg, SubRegs[Offset]);
2510 }
2511 
2512 bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
2513   MachineInstr &MI) const {
2514   Register DstReg = MI.getOperand(0).getReg();
2515   Register SrcReg = MI.getOperand(1).getReg();
2516   Register IdxReg = MI.getOperand(2).getReg();
2517 
2518   LLT DstTy = MRI->getType(DstReg);
2519   LLT SrcTy = MRI->getType(SrcReg);
2520 
2521   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2522   const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2523   const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
2524 
  // The index must be scalar. If it wasn't, RegBankSelect should have moved
  // this into a waterfall loop.
2527   if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
2528     return false;
2529 
2530   const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB,
2531                                                                   *MRI);
2532   const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(DstTy, *DstRB,
2533                                                                   *MRI);
2534   if (!SrcRC || !DstRC)
2535     return false;
2536   if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2537       !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
2538       !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
2539     return false;
2540 
2541   MachineBasicBlock *BB = MI.getParent();
2542   const DebugLoc &DL = MI.getDebugLoc();
2543   const bool Is64 = DstTy.getSizeInBits() == 64;
2544 
2545   unsigned SubReg;
2546   std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, SrcRC, IdxReg,
2547                                                      DstTy.getSizeInBits() / 8);
2548 
2549   if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
2550     if (DstTy.getSizeInBits() != 32 && !Is64)
2551       return false;
2552 
2553     BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2554       .addReg(IdxReg);
2555 
2556     unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
2557     BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
2558       .addReg(SrcReg, 0, SubReg)
2559       .addReg(SrcReg, RegState::Implicit);
2560     MI.eraseFromParent();
2561     return true;
2562   }
2563 
2564   if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
2565     return false;
2566 
2567   if (!STI.useVGPRIndexMode()) {
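    // Without VGPR indexing mode, the index is written to M0 and read
    // implicitly by V_MOVRELS.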
2568     BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2569       .addReg(IdxReg);
2570     BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
2571       .addReg(SrcReg, 0, SubReg)
2572       .addReg(SrcReg, RegState::Implicit);
2573     MI.eraseFromParent();
2574     return true;
2575   }
2576 
2577   BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_ON))
2578     .addReg(IdxReg)
2579     .addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE);
2580   BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), DstReg)
2581     .addReg(SrcReg, 0, SubReg)
2582     .addReg(SrcReg, RegState::Implicit)
2583     .addReg(AMDGPU::M0, RegState::Implicit);
2584   BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_OFF));
2585 
2586   MI.eraseFromParent();
2587   return true;
2588 }
2589 
2590 // TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
2591 bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
2592   MachineInstr &MI) const {
2593   Register DstReg = MI.getOperand(0).getReg();
2594   Register VecReg = MI.getOperand(1).getReg();
2595   Register ValReg = MI.getOperand(2).getReg();
2596   Register IdxReg = MI.getOperand(3).getReg();
2597 
2598   LLT VecTy = MRI->getType(DstReg);
2599   LLT ValTy = MRI->getType(ValReg);
2600   unsigned VecSize = VecTy.getSizeInBits();
2601   unsigned ValSize = ValTy.getSizeInBits();
2602 
2603   const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
2604   const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
2605   const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
2606 
2607   assert(VecTy.getElementType() == ValTy);
2608 
  // The index must be scalar. If it wasn't, RegBankSelect should have moved
  // this into a waterfall loop.
2611   if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
2612     return false;
2613 
2614   const TargetRegisterClass *VecRC = TRI.getRegClassForTypeOnBank(VecTy, *VecRB,
2615                                                                   *MRI);
2616   const TargetRegisterClass *ValRC = TRI.getRegClassForTypeOnBank(ValTy, *ValRB,
2617                                                                   *MRI);
2618 
2619   if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
2620       !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
2621       !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
2622       !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
2623     return false;
2624 
2625   if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
2626     return false;
2627 
2628   unsigned SubReg;
2629   std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg,
2630                                                      ValSize / 8);
2631 
2632   const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
2633                          STI.useVGPRIndexMode();
2634 
2635   MachineBasicBlock *BB = MI.getParent();
2636   const DebugLoc &DL = MI.getDebugLoc();
2637 
2638   if (IndexMode) {
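    // VGPR indexing mode: enable destination indexing here; it is disabled
    // again after the indirect write below.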
2639     BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_ON))
2640       .addReg(IdxReg)
2641       .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE);
2642   } else {
2643     BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2644       .addReg(IdxReg);
2645   }
2646 
2647   const MCInstrDesc &RegWriteOp
2648     = TII.getIndirectRegWritePseudo(VecSize, ValSize,
2649                                     VecRB->getID() == AMDGPU::SGPRRegBankID);
2650   BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
2651     .addReg(VecReg)
2652     .addReg(ValReg)
2653     .addImm(SubReg);
2654 
2655   if (IndexMode)
2656     BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_OFF));
2657 
2658   MI.eraseFromParent();
2659   return true;
2660 }
2661 
2662 static bool isZeroOrUndef(int X) {
2663   return X == 0 || X == -1;
2664 }
2665 
2666 static bool isOneOrUndef(int X) {
2667   return X == 1 || X == -1;
2668 }
2669 
2670 static bool isZeroOrOneOrUndef(int X) {
2671   return X == 0 || X == 1 || X == -1;
2672 }
2673 
2674 // Normalize a VOP3P shuffle mask to refer to the low/high half of a single
2675 // 32-bit register.
2676 static Register normalizeVOP3PMask(int NewMask[2], Register Src0, Register Src1,
2677                                    ArrayRef<int> Mask) {
2678   NewMask[0] = Mask[0];
2679   NewMask[1] = Mask[1];
2680   if (isZeroOrOneOrUndef(Mask[0]) && isZeroOrOneOrUndef(Mask[1]))
2681     return Src0;
2682 
2683   assert(NewMask[0] == 2 || NewMask[0] == 3 || NewMask[0] == -1);
2684   assert(NewMask[1] == 2 || NewMask[1] == 3 || NewMask[1] == -1);
2685 
  // Shift the mask inputs to be 0/1.
2687   NewMask[0] = NewMask[0] == -1 ? -1 : NewMask[0] - 2;
2688   NewMask[1] = NewMask[1] == -1 ? -1 : NewMask[1] - 2;
2689   return Src1;
2690 }
2691 
2692 // This is only legal with VOP3P instructions as an aid to op_sel matching.
2693 bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR(
2694   MachineInstr &MI) const {
2695   Register DstReg = MI.getOperand(0).getReg();
2696   Register Src0Reg = MI.getOperand(1).getReg();
2697   Register Src1Reg = MI.getOperand(2).getReg();
2698   ArrayRef<int> ShufMask = MI.getOperand(3).getShuffleMask();
2699 
2700   const LLT V2S16 = LLT::vector(2, 16);
2701   if (MRI->getType(DstReg) != V2S16 || MRI->getType(Src0Reg) != V2S16)
2702     return false;
2703 
2704   if (!AMDGPU::isLegalVOP3PShuffleMask(ShufMask))
2705     return false;
2706 
2707   assert(ShufMask.size() == 2);
2708   assert(STI.hasSDWA() && "no target has VOP3P but not SDWA");
2709 
2710   MachineBasicBlock *MBB = MI.getParent();
2711   const DebugLoc &DL = MI.getDebugLoc();
2712 
2713   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2714   const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2715   const TargetRegisterClass &RC = IsVALU ?
2716     AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
2717 
  // Handle the degenerate case, which should have been folded out.
2719   if (ShufMask[0] == -1 && ShufMask[1] == -1) {
2720     BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), DstReg);
2721 
2722     MI.eraseFromParent();
2723     return RBI.constrainGenericRegister(DstReg, RC, *MRI);
2724   }
2725 
2726   // A legal VOP3P mask only reads one of the sources.
2727   int Mask[2];
2728   Register SrcVec = normalizeVOP3PMask(Mask, Src0Reg, Src1Reg, ShufMask);
2729 
2730   if (!RBI.constrainGenericRegister(DstReg, RC, *MRI) ||
2731       !RBI.constrainGenericRegister(SrcVec, RC, *MRI))
2732     return false;
2733 
2734   // TODO: This also should have been folded out
2735   if (isZeroOrUndef(Mask[0]) && isOneOrUndef(Mask[1])) {
2736     BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), DstReg)
2737       .addReg(SrcVec);
2738 
2739     MI.eraseFromParent();
2740     return true;
2741   }
2742 
2743   if (Mask[0] == 1 && Mask[1] == -1) {
2744     if (IsVALU) {
2745       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
2746         .addImm(16)
2747         .addReg(SrcVec);
2748     } else {
2749       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
2750         .addReg(SrcVec)
2751         .addImm(16);
2752     }
2753   } else if (Mask[0] == -1 && Mask[1] == 0) {
2754     if (IsVALU) {
2755       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), DstReg)
2756         .addImm(16)
2757         .addReg(SrcVec);
2758     } else {
2759       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHL_B32), DstReg)
2760         .addReg(SrcVec)
2761         .addImm(16);
2762     }
2763   } else if (Mask[0] == 0 && Mask[1] == 0) {
2764     if (IsVALU) {
2765       // Write low half of the register into the high half.
2766       MachineInstr *MovSDWA =
2767         BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2768         .addImm(0)                             // $src0_modifiers
2769         .addReg(SrcVec)                        // $src0
2770         .addImm(0)                             // $clamp
2771         .addImm(AMDGPU::SDWA::WORD_1)          // $dst_sel
2772         .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2773         .addImm(AMDGPU::SDWA::WORD_0)          // $src0_sel
2774         .addReg(SrcVec, RegState::Implicit);
2775       MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2776     } else {
2777       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg)
2778         .addReg(SrcVec)
2779         .addReg(SrcVec);
2780     }
2781   } else if (Mask[0] == 1 && Mask[1] == 1) {
2782     if (IsVALU) {
2783       // Write high half of the register into the low half.
2784       MachineInstr *MovSDWA =
2785         BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2786         .addImm(0)                             // $src0_modifiers
2787         .addReg(SrcVec)                        // $src0
2788         .addImm(0)                             // $clamp
2789         .addImm(AMDGPU::SDWA::WORD_0)          // $dst_sel
2790         .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2791         .addImm(AMDGPU::SDWA::WORD_1)          // $src0_sel
2792         .addReg(SrcVec, RegState::Implicit);
2793       MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2794     } else {
2795       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_HH_B32_B16), DstReg)
2796         .addReg(SrcVec)
2797         .addReg(SrcVec);
2798     }
2799   } else if (Mask[0] == 1 && Mask[1] == 0) {
2800     if (IsVALU) {
2801       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_ALIGNBIT_B32), DstReg)
2802         .addReg(SrcVec)
2803         .addReg(SrcVec)
2804         .addImm(16);
2805     } else {
2806       Register TmpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2807       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg)
2808         .addReg(SrcVec)
2809         .addImm(16);
2810       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg)
2811         .addReg(TmpReg)
2812         .addReg(SrcVec);
2813     }
2814   } else
2815     llvm_unreachable("all shuffle masks should be handled");
2816 
2817   MI.eraseFromParent();
2818   return true;
2819 }
2820 
2821 bool AMDGPUInstructionSelector::select(MachineInstr &I) {
2822   if (I.isPHI())
2823     return selectPHI(I);
2824 
2825   if (!I.isPreISelOpcode()) {
2826     if (I.isCopy())
2827       return selectCOPY(I);
2828     return true;
2829   }
2830 
2831   switch (I.getOpcode()) {
2832   case TargetOpcode::G_AND:
2833   case TargetOpcode::G_OR:
2834   case TargetOpcode::G_XOR:
2835     if (selectImpl(I, *CoverageInfo))
2836       return true;
2837     return selectG_AND_OR_XOR(I);
2838   case TargetOpcode::G_ADD:
2839   case TargetOpcode::G_SUB:
2840     if (selectImpl(I, *CoverageInfo))
2841       return true;
2842     return selectG_ADD_SUB(I);
2843   case TargetOpcode::G_UADDO:
2844   case TargetOpcode::G_USUBO:
2845   case TargetOpcode::G_UADDE:
2846   case TargetOpcode::G_USUBE:
2847     return selectG_UADDO_USUBO_UADDE_USUBE(I);
2848   case TargetOpcode::G_INTTOPTR:
2849   case TargetOpcode::G_BITCAST:
2850   case TargetOpcode::G_PTRTOINT:
2851     return selectCOPY(I);
2852   case TargetOpcode::G_CONSTANT:
2853   case TargetOpcode::G_FCONSTANT:
2854     return selectG_CONSTANT(I);
2855   case TargetOpcode::G_FNEG:
2856     if (selectImpl(I, *CoverageInfo))
2857       return true;
2858     return selectG_FNEG(I);
2859   case TargetOpcode::G_FABS:
2860     if (selectImpl(I, *CoverageInfo))
2861       return true;
2862     return selectG_FABS(I);
2863   case TargetOpcode::G_EXTRACT:
2864     return selectG_EXTRACT(I);
2865   case TargetOpcode::G_MERGE_VALUES:
2866   case TargetOpcode::G_BUILD_VECTOR:
2867   case TargetOpcode::G_CONCAT_VECTORS:
2868     return selectG_MERGE_VALUES(I);
2869   case TargetOpcode::G_UNMERGE_VALUES:
2870     return selectG_UNMERGE_VALUES(I);
2871   case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2872     return selectG_BUILD_VECTOR_TRUNC(I);
2873   case TargetOpcode::G_PTR_ADD:
2874     return selectG_PTR_ADD(I);
2875   case TargetOpcode::G_IMPLICIT_DEF:
2876     return selectG_IMPLICIT_DEF(I);
2877   case TargetOpcode::G_FREEZE:
2878     return selectCOPY(I);
2879   case TargetOpcode::G_INSERT:
2880     return selectG_INSERT(I);
2881   case TargetOpcode::G_INTRINSIC:
2882     return selectG_INTRINSIC(I);
2883   case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
2884     return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
2885   case TargetOpcode::G_ICMP:
2886     if (selectG_ICMP(I))
2887       return true;
2888     return selectImpl(I, *CoverageInfo);
2889   case TargetOpcode::G_LOAD:
2890   case TargetOpcode::G_ATOMIC_CMPXCHG:
2891   case TargetOpcode::G_ATOMICRMW_XCHG:
2892   case TargetOpcode::G_ATOMICRMW_ADD:
2893   case TargetOpcode::G_ATOMICRMW_SUB:
2894   case TargetOpcode::G_ATOMICRMW_AND:
2895   case TargetOpcode::G_ATOMICRMW_OR:
2896   case TargetOpcode::G_ATOMICRMW_XOR:
2897   case TargetOpcode::G_ATOMICRMW_MIN:
2898   case TargetOpcode::G_ATOMICRMW_MAX:
2899   case TargetOpcode::G_ATOMICRMW_UMIN:
2900   case TargetOpcode::G_ATOMICRMW_UMAX:
2901   case TargetOpcode::G_ATOMICRMW_FADD:
2902   case AMDGPU::G_AMDGPU_ATOMIC_INC:
2903   case AMDGPU::G_AMDGPU_ATOMIC_DEC:
2904     return selectG_LOAD_ATOMICRMW(I);
2905   case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
2906     return selectG_AMDGPU_ATOMIC_CMPXCHG(I);
2907   case TargetOpcode::G_SELECT:
2908     return selectG_SELECT(I);
2909   case TargetOpcode::G_STORE:
2910     return selectG_STORE(I);
2911   case TargetOpcode::G_TRUNC:
2912     return selectG_TRUNC(I);
2913   case TargetOpcode::G_SEXT:
2914   case TargetOpcode::G_ZEXT:
2915   case TargetOpcode::G_ANYEXT:
2916   case TargetOpcode::G_SEXT_INREG:
2917     if (selectImpl(I, *CoverageInfo))
2918       return true;
2919     return selectG_SZA_EXT(I);
2920   case TargetOpcode::G_BRCOND:
2921     return selectG_BRCOND(I);
2922   case TargetOpcode::G_FRAME_INDEX:
2923   case TargetOpcode::G_GLOBAL_VALUE:
2924     return selectG_FRAME_INDEX_GLOBAL_VALUE(I);
2925   case TargetOpcode::G_PTRMASK:
2926     return selectG_PTRMASK(I);
2927   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2928     return selectG_EXTRACT_VECTOR_ELT(I);
2929   case TargetOpcode::G_INSERT_VECTOR_ELT:
2930     return selectG_INSERT_VECTOR_ELT(I);
2931   case TargetOpcode::G_SHUFFLE_VECTOR:
2932     return selectG_SHUFFLE_VECTOR(I);
2933   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
2934   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
2935     const AMDGPU::ImageDimIntrinsicInfo *Intr
2936       = AMDGPU::getImageDimIntrinsicInfo(I.getIntrinsicID());
2937     assert(Intr && "not an image intrinsic with image pseudo");
2938     return selectImageIntrinsic(I, Intr);
2939   }
2940   default:
2941     return selectImpl(I, *CoverageInfo);
2942   }
2943   return false;
2944 }
2945 
2946 InstructionSelector::ComplexRendererFns
2947 AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
  }};
}
2953 
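// Shared helper for the VOP3 source-modifier selectors: peel G_FNEG/G_FABS
// (looking through copies) off the root value, accumulating the corresponding
// SISrcMods bits, and return the underlying source register. If modifiers were
// folded from an SGPR value, a copy to a VGPR is inserted to avoid violating
// the constant bus restriction.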
2954 std::pair<Register, unsigned>
2955 AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root) const {
2956   Register Src = Root.getReg();
2957   Register OrigSrc = Src;
2958   unsigned Mods = 0;
2959   MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
2960 
2961   if (MI && MI->getOpcode() == AMDGPU::G_FNEG) {
2962     Src = MI->getOperand(1).getReg();
2963     Mods |= SISrcMods::NEG;
2964     MI = getDefIgnoringCopies(Src, *MRI);
2965   }
2966 
2967   if (MI && MI->getOpcode() == AMDGPU::G_FABS) {
2968     Src = MI->getOperand(1).getReg();
2969     Mods |= SISrcMods::ABS;
2970   }
2971 
2972   if (Mods != 0 &&
2973       RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
2974     MachineInstr *UseMI = Root.getParent();
2975 
2976     // If we looked through copies to find source modifiers on an SGPR operand,
2977     // we now have an SGPR register source. To avoid potentially violating the
2978     // constant bus restriction, we need to insert a copy to a VGPR.
2979     Register VGPRSrc = MRI->cloneVirtualRegister(OrigSrc);
2980     BuildMI(*UseMI->getParent(), UseMI, UseMI->getDebugLoc(),
2981             TII.get(AMDGPU::COPY), VGPRSrc)
2982       .addReg(Src);
2983     Src = VGPRSrc;
2984   }
2985 
2986   return std::make_pair(Src, Mods);
2987 }
2988 
2989 ///
2990 /// This will select either an SGPR or VGPR operand and will save us from
2991 /// having to write an extra tablegen pattern.
2992 InstructionSelector::ComplexRendererFns
2993 AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
2994   return {{
2995       [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
2996   }};
2997 }
2998 
2999 InstructionSelector::ComplexRendererFns
3000 AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
3001   Register Src;
3002   unsigned Mods;
3003   std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3004 
3005   return {{
3006       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3007       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
3008       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },    // clamp
3009       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }     // omod
3010   }};
3011 }
3012 
3013 InstructionSelector::ComplexRendererFns
3014 AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
3015   return {{
3016       [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
3017       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3018       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // omod
3019   }};
3020 }
3021 
3022 InstructionSelector::ComplexRendererFns
3023 AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
3024   Register Src;
3025   unsigned Mods;
3026   std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3027 
3028   return {{
3029       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3030       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }  // src_mods
3031   }};
3032 }
3033 
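// Match a source with no modifiers: bail out (so other patterns can try) if
// the value is produced by G_FNEG or G_FABS.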
3034 InstructionSelector::ComplexRendererFns
3035 AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
3036   Register Reg = Root.getReg();
3037   const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3038   if (Def && (Def->getOpcode() == AMDGPU::G_FNEG ||
3039               Def->getOpcode() == AMDGPU::G_FABS))
3040     return {};
3041   return {{
3042       [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3043   }};
3044 }
3045 
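// Compute packed (VOP3P) source modifiers: fold a v2f16 G_FNEG into
// NEG | NEG_HI and always set OP_SEL_1, since packed instructions have no abs
// modifiers.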
3046 std::pair<Register, unsigned>
3047 AMDGPUInstructionSelector::selectVOP3PModsImpl(
3048   Register Src, const MachineRegisterInfo &MRI) const {
3049   unsigned Mods = 0;
3050   MachineInstr *MI = MRI.getVRegDef(Src);
3051 
3052   if (MI && MI->getOpcode() == AMDGPU::G_FNEG &&
3053       // It's possible to see an f32 fneg here, but unlikely.
3054       // TODO: Treat f32 fneg as only high bit.
3055       MRI.getType(Src) == LLT::vector(2, 16)) {
3056     Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
3057     Src = MI->getOperand(1).getReg();
3058     MI = MRI.getVRegDef(Src);
3059   }
3060 
3061   // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
3062 
3063   // Packed instructions do not have abs modifiers.
3064   Mods |= SISrcMods::OP_SEL_1;
3065 
3066   return std::make_pair(Src, Mods);
3067 }
3068 
3069 InstructionSelector::ComplexRendererFns
3070 AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
3071   MachineRegisterInfo &MRI
3072     = Root.getParent()->getParent()->getParent()->getRegInfo();
3073 
3074   Register Src;
3075   unsigned Mods;
3076   std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI);
3077 
3078   return {{
3079       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3080       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }  // src_mods
3081   }};
3082 }
3083 
3084 InstructionSelector::ComplexRendererFns
3085 AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const {
3086   Register Src;
3087   unsigned Mods;
3088   std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3089   if (!TM.Options.NoNaNsFPMath && !isKnownNeverNaN(Src, *MRI))
3090     return None;
3091 
3092   return {{
3093       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3094       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }  // src_mods
3095   }};
3096 }
3097 
3098 InstructionSelector::ComplexRendererFns
3099 AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
3100   // FIXME: Handle op_sel
3101   return {{
3102       [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
3103       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // src_mods
3104   }};
3105 }
3106 
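// Match an SMRD address of the form (SGPR base + constant) where the constant
// can be encoded directly in the instruction's immediate offset field.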
3107 InstructionSelector::ComplexRendererFns
3108 AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
3109   SmallVector<GEPInfo, 4> AddrInfo;
3110   getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
3111 
3112   if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3113     return None;
3114 
3115   const GEPInfo &GEPInfo = AddrInfo[0];
3116   Optional<int64_t> EncodedImm =
3117       AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm, false);
3118   if (!EncodedImm)
3119     return None;
3120 
3121   unsigned PtrReg = GEPInfo.SgprParts[0];
3122   return {{
3123     [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3124     [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
3125   }};
3126 }
3127 
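// Same as selectSmrdImm, but for the 32-bit literal offset encoding used on
// subtargets that support it.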
3128 InstructionSelector::ComplexRendererFns
3129 AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
3130   SmallVector<GEPInfo, 4> AddrInfo;
3131   getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
3132 
3133   if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3134     return None;
3135 
3136   const GEPInfo &GEPInfo = AddrInfo[0];
3137   Register PtrReg = GEPInfo.SgprParts[0];
3138   Optional<int64_t> EncodedImm =
3139       AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
3140   if (!EncodedImm)
3141     return None;
3142 
3143   return {{
3144     [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3145     [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
3146   }};
3147 }
3148 
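// Match an SMRD address with a 32-bit unsigned constant offset, materializing
// the offset into an SGPR; the _IMM forms are tried first.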
3149 InstructionSelector::ComplexRendererFns
3150 AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
3151   MachineInstr *MI = Root.getParent();
3152   MachineBasicBlock *MBB = MI->getParent();
3153 
3154   SmallVector<GEPInfo, 4> AddrInfo;
3155   getAddrModeInfo(*MI, *MRI, AddrInfo);
3156 
  // FIXME: We should shrink the GEP if the offset is known to be <= 32 bits;
  // then we can select all ptr + 32-bit offsets, not just immediate offsets.
3159   if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3160     return None;
3161 
3162   const GEPInfo &GEPInfo = AddrInfo[0];
3163   // SGPR offset is unsigned.
3164   if (!GEPInfo.Imm || GEPInfo.Imm < 0 || !isUInt<32>(GEPInfo.Imm))
3165     return None;
3166 
  // If we make it this far we have a load with a 32-bit immediate offset.
  // It is OK to select this using an SGPR offset, because we have already
  // failed trying to select this load into one of the _IMM variants since
  // the _IMM patterns are considered before the _SGPR patterns.
3171   Register PtrReg = GEPInfo.SgprParts[0];
3172   Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3173   BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg)
3174           .addImm(GEPInfo.Imm);
3175   return {{
3176     [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3177     [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); }
3178   }};
3179 }
3180 
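// Try to fold a constant offset from a G_PTR_ADD into a FLAT instruction's
// immediate offset field; fall back to a zero offset when the subtarget or
// the offset value does not allow it.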
3181 template <bool Signed>
3182 InstructionSelector::ComplexRendererFns
3183 AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const {
3184   MachineInstr *MI = Root.getParent();
3185 
3186   InstructionSelector::ComplexRendererFns Default = {{
3187       [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
3188       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },  // offset
3189       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // slc
3190     }};
3191 
3192   if (!STI.hasFlatInstOffsets())
3193     return Default;
3194 
3195   const MachineInstr *OpDef = MRI->getVRegDef(Root.getReg());
3196   if (!OpDef || OpDef->getOpcode() != AMDGPU::G_PTR_ADD)
3197     return Default;
3198 
3199   Optional<int64_t> Offset =
3200     getConstantVRegVal(OpDef->getOperand(2).getReg(), *MRI);
3201   if (!Offset.hasValue())
3202     return Default;
3203 
3204   unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
3205   if (!TII.isLegalFLATOffset(Offset.getValue(), AddrSpace, Signed))
3206     return Default;
3207 
3208   Register BasePtr = OpDef->getOperand(1).getReg();
3209 
3210   return {{
3211       [=](MachineInstrBuilder &MIB) { MIB.addReg(BasePtr); },
3212       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset.getValue()); },
3213       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // slc
3214     }};
3215 }
3216 
3217 InstructionSelector::ComplexRendererFns
3218 AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
3219   return selectFlatOffsetImpl<false>(Root);
3220 }
3221 
3222 InstructionSelector::ComplexRendererFns
3223 AMDGPUInstructionSelector::selectFlatOffsetSigned(MachineOperand &Root) const {
3224   return selectFlatOffsetImpl<true>(Root);
3225 }
3226 
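// Return true if the access refers to a stack pseudo source value, i.e. it is
// known to be relative to the stack pointer.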
3227 static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) {
3228   auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>();
3229   return PSV && PSV->isStack();
3230 }
3231 
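// Select a MUBUF OFFEN access to private (scratch) memory. A known constant
// address is split into a VGPR holding the high bits plus a legal immediate
// offset; otherwise we try to fold a frame index and/or constant offset into
// the vaddr/offset operands.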
3232 InstructionSelector::ComplexRendererFns
3233 AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
3234   MachineInstr *MI = Root.getParent();
3235   MachineBasicBlock *MBB = MI->getParent();
3236   MachineFunction *MF = MBB->getParent();
3237   const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3238 
3239   int64_t Offset = 0;
3240   if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
3241       Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) {
3242     Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3243 
3244     // TODO: Should this be inside the render function? The iterator seems to
3245     // move.
3246     BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
3247             HighBits)
3248       .addImm(Offset & ~4095);
3249 
3250     return {{[=](MachineInstrBuilder &MIB) { // rsrc
3251                MIB.addReg(Info->getScratchRSrcReg());
3252              },
3253              [=](MachineInstrBuilder &MIB) { // vaddr
3254                MIB.addReg(HighBits);
3255              },
3256              [=](MachineInstrBuilder &MIB) { // soffset
3257                const MachineMemOperand *MMO = *MI->memoperands_begin();
3258                const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();
3259 
3260                if (isStackPtrRelative(PtrInfo))
3261                  MIB.addReg(Info->getStackPtrOffsetReg());
3262                else
3263                  MIB.addImm(0);
3264              },
3265              [=](MachineInstrBuilder &MIB) { // offset
3266                MIB.addImm(Offset & 4095);
3267              }}};
3268   }
3269 
3270   assert(Offset == 0 || Offset == -1);
3271 
3272   // Try to fold a frame index directly into the MUBUF vaddr field, and any
3273   // offsets.
3274   Optional<int> FI;
3275   Register VAddr = Root.getReg();
3276   if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) {
3277     if (isBaseWithConstantOffset(Root, *MRI)) {
3278       const MachineOperand &LHS = RootDef->getOperand(1);
3279       const MachineOperand &RHS = RootDef->getOperand(2);
3280       const MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg());
3281       const MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg());
3282       if (LHSDef && RHSDef) {
3283         int64_t PossibleOffset =
3284             RHSDef->getOperand(1).getCImm()->getSExtValue();
3285         if (SIInstrInfo::isLegalMUBUFImmOffset(PossibleOffset) &&
3286             (!STI.privateMemoryResourceIsRangeChecked() ||
3287              KnownBits->signBitIsZero(LHS.getReg()))) {
3288           if (LHSDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
3289             FI = LHSDef->getOperand(1).getIndex();
3290           else
3291             VAddr = LHS.getReg();
3292           Offset = PossibleOffset;
3293         }
3294       }
3295     } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
3296       FI = RootDef->getOperand(1).getIndex();
3297     }
3298   }
3299 
3300   return {{[=](MachineInstrBuilder &MIB) { // rsrc
3301              MIB.addReg(Info->getScratchRSrcReg());
3302            },
3303            [=](MachineInstrBuilder &MIB) { // vaddr
3304              if (FI.hasValue())
3305                MIB.addFrameIndex(FI.getValue());
3306              else
3307                MIB.addReg(VAddr);
3308            },
3309            [=](MachineInstrBuilder &MIB) { // soffset
3310              // If we don't know this private access is a local stack object, it
3311              // needs to be relative to the entry point's scratch wave offset.
3312              // TODO: Should split large offsets that don't fit like above.
3313              // TODO: Don't use scratch wave offset just because the offset
3314              // didn't fit.
3315              if (!Info->isEntryFunction() && FI.hasValue())
3316                MIB.addReg(Info->getStackPtrOffsetReg());
3317              else
3318                MIB.addImm(0);
3319            },
3320            [=](MachineInstrBuilder &MIB) { // offset
3321              MIB.addImm(Offset);
3322            }}};
3323 }
3324 
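// Check that a DS offset both fits in the given number of bits and, on
// subtargets where the hardware bounds check is affected by the base address,
// that the base is known to be non-negative.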
3325 bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
3326                                                 int64_t Offset,
3327                                                 unsigned OffsetBits) const {
3328   if ((OffsetBits == 16 && !isUInt<16>(Offset)) ||
3329       (OffsetBits == 8 && !isUInt<8>(Offset)))
3330     return false;
3331 
3332   if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
3333     return true;
3334 
  // On Southern Islands, instructions with a negative base value and an offset
  // don't seem to work.
3337   return KnownBits->signBitIsZero(Base);
3338 }
3339 
3340 InstructionSelector::ComplexRendererFns
3341 AMDGPUInstructionSelector::selectMUBUFScratchOffset(
3342     MachineOperand &Root) const {
3343   MachineInstr *MI = Root.getParent();
3344   MachineBasicBlock *MBB = MI->getParent();
3345 
3346   int64_t Offset = 0;
3347   if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
3348       !SIInstrInfo::isLegalMUBUFImmOffset(Offset))
3349     return {};
3350 
3351   const MachineFunction *MF = MBB->getParent();
3352   const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3353   const MachineMemOperand *MMO = *MI->memoperands_begin();
3354   const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();
3355 
3356   return {{
3357       [=](MachineInstrBuilder &MIB) { // rsrc
3358         MIB.addReg(Info->getScratchRSrcReg());
3359       },
3360       [=](MachineInstrBuilder &MIB) { // soffset
3361         if (isStackPtrRelative(PtrInfo))
3362           MIB.addReg(Info->getStackPtrOffsetReg());
3363         else
3364           MIB.addImm(0);
3365       },
3366       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
3367   }};
3368 }
3369 
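// Select the base register and a 16-bit byte offset for single-address DS
// instructions, folding in a constant addend when it is legal.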
3370 std::pair<Register, unsigned>
3371 AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
3372   const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
3373   if (!RootDef)
3374     return std::make_pair(Root.getReg(), 0);
3375 
3376   int64_t ConstAddr = 0;
3377 
3378   Register PtrBase;
3379   int64_t Offset;
3380   std::tie(PtrBase, Offset) =
3381     getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
3382 
3383   if (Offset) {
3384     if (isDSOffsetLegal(PtrBase, Offset, 16)) {
3385       // (add n0, c0)
3386       return std::make_pair(PtrBase, Offset);
3387     }
3388   } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
    // TODO

3392   } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
3393     // TODO
3394 
3395   }
3396 
3397   return std::make_pair(Root.getReg(), 0);
3398 }
3399 
3400 InstructionSelector::ComplexRendererFns
3401 AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
3402   Register Reg;
3403   unsigned Offset;
3404   std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
3405   return {{
3406       [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3407       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
3408     }};
3409 }
3410 
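// Select the base register and the two dword offsets (offset0/offset1) used by
// 64-bit DS accesses that are split into two 4-byte slots.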
3411 InstructionSelector::ComplexRendererFns
3412 AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
3413   Register Reg;
3414   unsigned Offset;
3415   std::tie(Reg, Offset) = selectDS64Bit4ByteAlignedImpl(Root);
3416   return {{
3417       [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3418       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset + 1); }
3420     }};
3421 }
3422 
3423 std::pair<Register, unsigned>
3424 AMDGPUInstructionSelector::selectDS64Bit4ByteAlignedImpl(MachineOperand &Root) const {
3425   const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
3426   if (!RootDef)
3427     return std::make_pair(Root.getReg(), 0);
3428 
3429   int64_t ConstAddr = 0;
3430 
3431   Register PtrBase;
3432   int64_t Offset;
3433   std::tie(PtrBase, Offset) =
3434     getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
3435 
3436   if (Offset) {
3437     int64_t DWordOffset0 = Offset / 4;
3438     int64_t DWordOffset1 = DWordOffset0 + 1;
3439     if (isDSOffsetLegal(PtrBase, DWordOffset1, 8)) {
3440       // (add n0, c0)
3441       return std::make_pair(PtrBase, DWordOffset0);
3442     }
3443   } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
3444     // TODO
3445 
3446   } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
3447     // TODO
3448 
3449   }
3450 
3451   return std::make_pair(Root.getReg(), 0);
3452 }
3453 
3454 /// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
3455 /// the base value with the constant offset. There may be intervening copies
3456 /// between \p Root and the identified constant. Returns \p Root, 0 if this does
3457 /// not match the pattern.
3458 std::pair<Register, int64_t>
3459 AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
3460   Register Root, const MachineRegisterInfo &MRI) const {
3461   MachineInstr *RootI = MRI.getVRegDef(Root);
3462   if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
3463     return {Root, 0};
3464 
3465   MachineOperand &RHS = RootI->getOperand(2);
3466   Optional<ValueAndVReg> MaybeOffset
3467     = getConstantVRegValWithLookThrough(RHS.getReg(), MRI, true);
3468   if (!MaybeOffset)
3469     return {Root, 0};
3470   return {RootI->getOperand(1).getReg(), MaybeOffset->Value};
3471 }
3472 
3473 static void addZeroImm(MachineInstrBuilder &MIB) {
3474   MIB.addImm(0);
3475 }
3476 
3477 /// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
3478 /// BasePtr is not valid, a null base pointer will be used.
3479 static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
3480                           uint32_t FormatLo, uint32_t FormatHi,
3481                           Register BasePtr) {
3482   Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
3483   Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
3484   Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3485   Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
3486 
3487   B.buildInstr(AMDGPU::S_MOV_B32)
3488     .addDef(RSrc2)
3489     .addImm(FormatLo);
3490   B.buildInstr(AMDGPU::S_MOV_B32)
3491     .addDef(RSrc3)
3492     .addImm(FormatHi);
3493 
  // Build the register half holding the constants before building the full
  // 128-bit register. If we are building multiple resource descriptors, this
  // will allow CSEing of the 2-component register.
3497   B.buildInstr(AMDGPU::REG_SEQUENCE)
3498     .addDef(RSrcHi)
3499     .addReg(RSrc2)
3500     .addImm(AMDGPU::sub0)
3501     .addReg(RSrc3)
3502     .addImm(AMDGPU::sub1);
3503 
3504   Register RSrcLo = BasePtr;
3505   if (!BasePtr) {
3506     RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3507     B.buildInstr(AMDGPU::S_MOV_B64)
3508       .addDef(RSrcLo)
3509       .addImm(0);
3510   }
3511 
3512   B.buildInstr(AMDGPU::REG_SEQUENCE)
3513     .addDef(RSrc)
3514     .addReg(RSrcLo)
3515     .addImm(AMDGPU::sub0_sub1)
3516     .addReg(RSrcHi)
3517     .addImm(AMDGPU::sub2_sub3);
3518 
3519   return RSrc;
3520 }
3521 
3522 static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
3523                                 const SIInstrInfo &TII, Register BasePtr) {
3524   uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
3525 
3526   // FIXME: Why are half the "default" bits ignored based on the addressing
3527   // mode?
3528   return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
3529 }
3530 
3531 static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
3532                                const SIInstrInfo &TII, Register BasePtr) {
3533   uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
3534 
3535   // FIXME: Why are half the "default" bits ignored based on the addressing
3536   // mode?
3537   return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
3538 }
3539 
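// Decompose a pointer into the pieces of a MUBUF address: the base (N0), an
// optional constant offset, and, when the base is itself a G_PTR_ADD, its two
// addends (N2, N3).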
3540 AMDGPUInstructionSelector::MUBUFAddressData
3541 AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
3542   MUBUFAddressData Data;
3543   Data.N0 = Src;
3544 
3545   Register PtrBase;
3546   int64_t Offset;
3547 
3548   std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI);
3549   if (isUInt<32>(Offset)) {
3550     Data.N0 = PtrBase;
3551     Data.Offset = Offset;
3552   }
3553 
3554   if (MachineInstr *InputAdd
3555       = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
3556     Data.N2 = InputAdd->getOperand(1).getReg();
3557     Data.N3 = InputAdd->getOperand(2).getReg();
3558 
    // FIXME: Need to fix extra SGPR->VGPR copies inserted
    // FIXME: We don't know that this was defined by operand 0.
3561     //
3562     // TODO: Remove this when we have copy folding optimizations after
3563     // RegBankSelect.
3564     Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
3565     Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
3566   }
3567 
3568   return Data;
3569 }
3570 
/// Return whether the addr64 MUBUF mode should be used for the given address.
3572 bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
3573   // (ptr_add N2, N3) -> addr64, or
3574   // (ptr_add (ptr_add N2, N3), C1) -> addr64
3575   if (Addr.N2)
3576     return true;
3577 
3578   const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
3579   return N0Bank->getID() == AMDGPU::VGPRRegBankID;
3580 }
3581 
3582 /// Split an immediate offset \p ImmOffset depending on whether it fits in the
3583 /// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
3584 /// component.
3585 void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
3586   MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
3587   if (SIInstrInfo::isLegalMUBUFImmOffset(ImmOffset))
3588     return;
3589 
3590   // Illegal offset, store it in soffset.
3591   SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3592   B.buildInstr(AMDGPU::S_MOV_B32)
3593     .addDef(SOffset)
3594     .addImm(ImmOffset);
3595   ImmOffset = 0;
3596 }
3597 
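// Select the MUBUF addr64 addressing mode: based on the register banks of the
// address components, decide which value becomes the 64-bit vaddr and which
// becomes the SRD base pointer, build the resource descriptor, and split off
// any offset that does not fit the immediate field.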
3598 bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
3599   MachineOperand &Root, Register &VAddr, Register &RSrcReg,
3600   Register &SOffset, int64_t &Offset) const {
3601   // FIXME: Predicates should stop this from reaching here.
3602   // addr64 bit was removed for volcanic islands.
3603   if (!STI.hasAddr64() || STI.useFlatForGlobal())
3604     return false;
3605 
3606   MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
3607   if (!shouldUseAddr64(AddrData))
3608     return false;
3609 
3610   Register N0 = AddrData.N0;
3611   Register N2 = AddrData.N2;
3612   Register N3 = AddrData.N3;
3613   Offset = AddrData.Offset;
3614 
3615   // Base pointer for the SRD.
3616   Register SRDPtr;
3617 
3618   if (N2) {
3619     if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
3620       assert(N3);
3621       if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
3622         // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
3623         // addr64, and construct the default resource from a 0 address.
3624         VAddr = N0;
3625       } else {
3626         SRDPtr = N3;
3627         VAddr = N2;
3628       }
3629     } else {
3630       // N2 is not divergent.
3631       SRDPtr = N2;
3632       VAddr = N3;
3633     }
3634   } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
3635     // Use the default null pointer in the resource
3636     VAddr = N0;
3637   } else {
3638     // N0 -> offset, or
3639     // (N0 + C1) -> offset
3640     SRDPtr = N0;
3641   }
3642 
3643   MachineIRBuilder B(*Root.getParent());
3644   RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
3645   splitIllegalMUBUFOffset(B, SOffset, Offset);
3646   return true;
3647 }
3648 
3649 bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
3650   MachineOperand &Root, Register &RSrcReg, Register &SOffset,
3651   int64_t &Offset) const {
3652   MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
3653   if (shouldUseAddr64(AddrData))
3654     return false;
3655 
3656   // N0 -> offset, or
3657   // (N0 + C1) -> offset
3658   Register SRDPtr = AddrData.N0;
3659   Offset = AddrData.Offset;
3660 
3661   // TODO: Look through extensions for 32-bit soffset.
3662   MachineIRBuilder B(*Root.getParent());
3663 
3664   RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
3665   splitIllegalMUBUFOffset(B, SOffset, Offset);
3666   return true;
3667 }
3668 
3669 InstructionSelector::ComplexRendererFns
3670 AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
3671   Register VAddr;
3672   Register RSrcReg;
3673   Register SOffset;
3674   int64_t Offset = 0;
3675 
3676   if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
3677     return {};
3678 
3679   // FIXME: Use defaulted operands for trailing 0s and remove from the complex
3680   // pattern.
3681   return {{
3682       [=](MachineInstrBuilder &MIB) {  // rsrc
3683         MIB.addReg(RSrcReg);
3684       },
3685       [=](MachineInstrBuilder &MIB) { // vaddr
3686         MIB.addReg(VAddr);
3687       },
3688       [=](MachineInstrBuilder &MIB) { // soffset
3689         if (SOffset)
3690           MIB.addReg(SOffset);
3691         else
3692           MIB.addImm(0);
3693       },
3694       [=](MachineInstrBuilder &MIB) { // offset
3695         MIB.addImm(Offset);
3696       },
3697       addZeroImm, //  glc
3698       addZeroImm, //  slc
3699       addZeroImm, //  tfe
3700       addZeroImm, //  dlc
3701       addZeroImm  //  swz
3702     }};
3703 }
3704 
3705 InstructionSelector::ComplexRendererFns
3706 AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
3707   Register RSrcReg;
3708   Register SOffset;
3709   int64_t Offset = 0;
3710 
3711   if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
3712     return {};
3713 
3714   return {{
3715       [=](MachineInstrBuilder &MIB) {  // rsrc
3716         MIB.addReg(RSrcReg);
3717       },
3718       [=](MachineInstrBuilder &MIB) { // soffset
3719         if (SOffset)
3720           MIB.addReg(SOffset);
3721         else
3722           MIB.addImm(0);
3723       },
3724       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
3725       addZeroImm, //  glc
3726       addZeroImm, //  slc
3727       addZeroImm, //  tfe
3728       addZeroImm, //  dlc
3729       addZeroImm  //  swz
3730     }};
3731 }
3732 
3733 InstructionSelector::ComplexRendererFns
3734 AMDGPUInstructionSelector::selectMUBUFAddr64Atomic(MachineOperand &Root) const {
3735   Register VAddr;
3736   Register RSrcReg;
3737   Register SOffset;
3738   int64_t Offset = 0;
3739 
3740   if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
3741     return {};
3742 
3743   // FIXME: Use defaulted operands for trailing 0s and remove from the complex
3744   // pattern.
3745   return {{
3746       [=](MachineInstrBuilder &MIB) {  // rsrc
3747         MIB.addReg(RSrcReg);
3748       },
3749       [=](MachineInstrBuilder &MIB) { // vaddr
3750         MIB.addReg(VAddr);
3751       },
3752       [=](MachineInstrBuilder &MIB) { // soffset
3753         if (SOffset)
3754           MIB.addReg(SOffset);
3755         else
3756           MIB.addImm(0);
3757       },
3758       [=](MachineInstrBuilder &MIB) { // offset
3759         MIB.addImm(Offset);
3760       },
3761       addZeroImm //  slc
3762     }};
3763 }
3764 
3765 InstructionSelector::ComplexRendererFns
3766 AMDGPUInstructionSelector::selectMUBUFOffsetAtomic(MachineOperand &Root) const {
3767   Register RSrcReg;
3768   Register SOffset;
3769   int64_t Offset = 0;
3770 
3771   if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
3772     return {};
3773 
3774   return {{
3775       [=](MachineInstrBuilder &MIB) {  // rsrc
3776         MIB.addReg(RSrcReg);
3777       },
3778       [=](MachineInstrBuilder &MIB) { // soffset
3779         if (SOffset)
3780           MIB.addReg(SOffset);
3781         else
3782           MIB.addImm(0);
3783       },
3784       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
3785       addZeroImm //  slc
3786     }};
3787 }
3788 
/// Get an immediate that must be 32 bits wide and is treated as zero extended.
3790 static Optional<uint64_t> getConstantZext32Val(Register Reg,
3791                                                const MachineRegisterInfo &MRI) {
3792   // getConstantVRegVal sexts any values, so see if that matters.
3793   Optional<int64_t> OffsetVal = getConstantVRegVal(Reg, MRI);
3794   if (!OffsetVal || !isInt<32>(*OffsetVal))
3795     return None;
3796   return Lo_32(*OffsetVal);
3797 }
3798 
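// Match a buffer SMRD offset that is a known constant which can be encoded in
// the instruction's immediate field.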
3799 InstructionSelector::ComplexRendererFns
3800 AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
3801   Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
3802   if (!OffsetVal)
3803     return {};
3804 
3805   Optional<int64_t> EncodedImm =
3806       AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
3807   if (!EncodedImm)
3808     return {};
3809 
3810   return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }  }};
3811 }
3812 
3813 InstructionSelector::ComplexRendererFns
3814 AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
3815   assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
3816 
3817   Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
3818   if (!OffsetVal)
3819     return {};
3820 
3821   Optional<int64_t> EncodedImm
3822     = AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
3823   if (!EncodedImm)
3824     return {};
3825 
3826   return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }  }};
3827 }
3828 
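// Custom operand renderers used by the TableGen'erated patterns. Each one
// turns a matched G_CONSTANT (or a plain immediate operand) into the immediate
// form the target instruction expects.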
3829 void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
3830                                                  const MachineInstr &MI,
3831                                                  int OpIdx) const {
3832   assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
3833          "Expected G_CONSTANT");
3834   MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
3835 }
3836 
3837 void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
3838                                                 const MachineInstr &MI,
3839                                                 int OpIdx) const {
3840   assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
3841          "Expected G_CONSTANT");
3842   MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
3843 }
3844 
3845 void AMDGPUInstructionSelector::renderBitcastImm(MachineInstrBuilder &MIB,
3846                                                  const MachineInstr &MI,
3847                                                  int OpIdx) const {
3848   assert(OpIdx == -1);
3849 
3850   const MachineOperand &Op = MI.getOperand(1);
3851   if (MI.getOpcode() == TargetOpcode::G_FCONSTANT)
3852     MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
3853   else {
3854     assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
3855     MIB.addImm(Op.getCImm()->getSExtValue());
3856   }
3857 }
3858 
3859 void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
3860                                                 const MachineInstr &MI,
3861                                                 int OpIdx) const {
3862   assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
3863          "Expected G_CONSTANT");
3864   MIB.addImm(MI.getOperand(1).getCImm()->getValue().countPopulation());
3865 }
3866 
/// This only really exists to satisfy the DAG type-checking machinery, so it
/// is a no-op here.
3869 void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
3870                                                 const MachineInstr &MI,
3871                                                 int OpIdx) const {
3872   MIB.addImm(MI.getOperand(OpIdx).getImm());
3873 }
3874 
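// The renderExtract* helpers unpack the individual cache policy bits
// (glc/slc/dlc/swz) from a combined immediate operand.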
3875 void AMDGPUInstructionSelector::renderExtractGLC(MachineInstrBuilder &MIB,
3876                                                  const MachineInstr &MI,
3877                                                  int OpIdx) const {
3878   assert(OpIdx >= 0 && "expected to match an immediate operand");
3879   MIB.addImm(MI.getOperand(OpIdx).getImm() & 1);
3880 }
3881 
3882 void AMDGPUInstructionSelector::renderExtractSLC(MachineInstrBuilder &MIB,
3883                                                  const MachineInstr &MI,
3884                                                  int OpIdx) const {
3885   assert(OpIdx >= 0 && "expected to match an immediate operand");
3886   MIB.addImm((MI.getOperand(OpIdx).getImm() >> 1) & 1);
3887 }
3888 
3889 void AMDGPUInstructionSelector::renderExtractDLC(MachineInstrBuilder &MIB,
3890                                                  const MachineInstr &MI,
3891                                                  int OpIdx) const {
3892   assert(OpIdx >= 0 && "expected to match an immediate operand");
3893   MIB.addImm((MI.getOperand(OpIdx).getImm() >> 2) & 1);
3894 }
3895 
3896 void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
3897                                                  const MachineInstr &MI,
3898                                                  int OpIdx) const {
3899   assert(OpIdx >= 0 && "expected to match an immediate operand");
3900   MIB.addImm((MI.getOperand(OpIdx).getImm() >> 3) & 1);
3901 }
3902 
3903 bool AMDGPUInstructionSelector::isInlineImmediate16(int64_t Imm) const {
3904   return AMDGPU::isInlinableLiteral16(Imm, STI.hasInv2PiInlineImm());
3905 }
3906 
3907 bool AMDGPUInstructionSelector::isInlineImmediate32(int64_t Imm) const {
3908   return AMDGPU::isInlinableLiteral32(Imm, STI.hasInv2PiInlineImm());
3909 }
3910 
3911 bool AMDGPUInstructionSelector::isInlineImmediate64(int64_t Imm) const {
3912   return AMDGPU::isInlinableLiteral64(Imm, STI.hasInv2PiInlineImm());
3913 }
3914 
3915 bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
3916   return TII.isInlineConstant(Imm);
3917 }
3918