1 //===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the InstructionSelector class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPUInstructionSelector.h"
15 #include "AMDGPU.h"
16 #include "AMDGPUGlobalISelUtils.h"
17 #include "AMDGPUInstrInfo.h"
18 #include "AMDGPURegisterBankInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
22 #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
24 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
25 #include "llvm/IR/DiagnosticInfo.h"
26 
27 #define DEBUG_TYPE "amdgpu-isel"
28 
29 using namespace llvm;
30 using namespace MIPatternMatch;
31 
32 static cl::opt<bool> AllowRiskySelect(
33   "amdgpu-global-isel-risky-select",
34   cl::desc("Allow GlobalISel to select cases that are likely to not work yet"),
35   cl::init(false),
36   cl::ReallyHidden);
37 
38 #define GET_GLOBALISEL_IMPL
39 #define AMDGPUSubtarget GCNSubtarget
40 #include "AMDGPUGenGlobalISel.inc"
41 #undef GET_GLOBALISEL_IMPL
42 #undef AMDGPUSubtarget
43 
44 AMDGPUInstructionSelector::AMDGPUInstructionSelector(
45     const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
46     const AMDGPUTargetMachine &TM)
47     : InstructionSelector(), TII(*STI.getInstrInfo()),
48       TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
49       STI(STI),
50       EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG),
51 #define GET_GLOBALISEL_PREDICATES_INIT
52 #include "AMDGPUGenGlobalISel.inc"
53 #undef GET_GLOBALISEL_PREDICATES_INIT
54 #define GET_GLOBALISEL_TEMPORARIES_INIT
55 #include "AMDGPUGenGlobalISel.inc"
56 #undef GET_GLOBALISEL_TEMPORARIES_INIT
57 {
58 }
59 
60 const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
61 
62 void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits &KB,
63                                         CodeGenCoverage &CoverageInfo) {
64   MRI = &MF.getRegInfo();
65   Subtarget = &MF.getSubtarget<GCNSubtarget>();
66   InstructionSelector::setupMF(MF, KB, CoverageInfo);
67 }
68 
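// Return true if Reg holds a wave-size boolean value: either it is assigned
// to the VCC register bank, or it has an s1 type within the boolean register
// class.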
69 bool AMDGPUInstructionSelector::isVCC(Register Reg,
70                                       const MachineRegisterInfo &MRI) const {
71   // The verifier is oblivious to s1 being a valid value for wavesize registers.
72   if (Reg.isPhysical())
73     return false;
74 
75   auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
76   const TargetRegisterClass *RC =
77       RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
78   if (RC) {
79     const LLT Ty = MRI.getType(Reg);
80     return RC->hasSuperClassEq(TRI.getBoolRC()) &&
81            Ty.isValid() && Ty.getSizeInBits() == 1;
82   }
83 
84   const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
85   return RB->getID() == AMDGPU::VCCRegBankID;
86 }
87 
88 bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
89                                                         unsigned NewOpc) const {
90   MI.setDesc(TII.get(NewOpc));
91   MI.RemoveOperand(1); // Remove intrinsic ID.
92   MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
93 
94   MachineOperand &Dst = MI.getOperand(0);
95   MachineOperand &Src = MI.getOperand(1);
96 
97   // TODO: This should be legalized to s32 if needed
98   if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
99     return false;
100 
101   const TargetRegisterClass *DstRC
102     = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
103   const TargetRegisterClass *SrcRC
104     = TRI.getConstrainedRegClassForOperand(Src, *MRI);
105   if (!DstRC || DstRC != SrcRC)
106     return false;
107 
108   return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
109          RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
110 }
111 
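// Select a generic COPY. Copies that produce a VCC-bank boolean need special
// handling: an SCC source is simply reclassified, while a non-boolean source
// is masked to its low bit and compared against zero to form a wave mask.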
112 bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
113   const DebugLoc &DL = I.getDebugLoc();
114   MachineBasicBlock *BB = I.getParent();
115   I.setDesc(TII.get(TargetOpcode::COPY));
116 
117   const MachineOperand &Src = I.getOperand(1);
118   MachineOperand &Dst = I.getOperand(0);
119   Register DstReg = Dst.getReg();
120   Register SrcReg = Src.getReg();
121 
122   if (isVCC(DstReg, *MRI)) {
123     if (SrcReg == AMDGPU::SCC) {
124       const TargetRegisterClass *RC
125         = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
126       if (!RC)
127         return true;
128       return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
129     }
130 
131     if (!isVCC(SrcReg, *MRI)) {
132       // TODO: Should probably leave the copy and let copyPhysReg expand it.
133       if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
134         return false;
135 
136       const TargetRegisterClass *SrcRC
137         = TRI.getConstrainedRegClassForOperand(Src, *MRI);
138 
139       Optional<ValueAndVReg> ConstVal =
140           getConstantVRegValWithLookThrough(SrcReg, *MRI, true, true);
141       if (ConstVal) {
142         unsigned MovOpc =
143             STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
144         BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
145             .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
146       } else {
147         Register MaskedReg = MRI->createVirtualRegister(SrcRC);
148 
149         // We can't trust the high bits at this point, so clear them.
150 
151         // TODO: Skip masking high bits if def is known boolean.
152 
153         unsigned AndOpc =
154             TRI.isSGPRClass(SrcRC) ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
155         BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
156             .addImm(1)
157             .addReg(SrcReg);
158         BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
159             .addImm(0)
160             .addReg(MaskedReg);
161       }
162 
163       if (!MRI->getRegClassOrNull(SrcReg))
164         MRI->setRegClass(SrcReg, SrcRC);
165       I.eraseFromParent();
166       return true;
167     }
168 
169     const TargetRegisterClass *RC =
170       TRI.getConstrainedRegClassForOperand(Dst, *MRI);
171     if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
172       return false;
173 
174     return true;
175   }
176 
177   for (const MachineOperand &MO : I.operands()) {
178     if (MO.getReg().isPhysical())
179       continue;
180 
181     const TargetRegisterClass *RC =
182             TRI.getConstrainedRegClassForOperand(MO, *MRI);
183     if (!RC)
184       continue;
185     RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
186   }
187   return true;
188 }
189 
190 bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
191   const Register DefReg = I.getOperand(0).getReg();
192   const LLT DefTy = MRI->getType(DefReg);
193   if (DefTy == LLT::scalar(1)) {
194     if (!AllowRiskySelect) {
195       LLVM_DEBUG(dbgs() << "Skipping risky boolean phi\n");
196       return false;
197     }
198 
199     LLVM_DEBUG(dbgs() << "Selecting risky boolean phi\n");
200   }
201 
  // TODO: Verify this doesn't have invalid operands (e.g. a VGPR to SGPR copy)
203 
204   const RegClassOrRegBank &RegClassOrBank =
205     MRI->getRegClassOrRegBank(DefReg);
206 
207   const TargetRegisterClass *DefRC
208     = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
209   if (!DefRC) {
210     if (!DefTy.isValid()) {
211       LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
212       return false;
213     }
214 
215     const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
216     DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, *MRI);
217     if (!DefRC) {
218       LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
219       return false;
220     }
221   }
222 
223   // TODO: Verify that all registers have the same bank
224   I.setDesc(TII.get(TargetOpcode::PHI));
225   return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
226 }
227 
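// Return the 32-bit half (sub0 or sub1) of the 64-bit operand MO. Register
// operands are copied through a subregister COPY into a fresh register of
// SubRC; immediate operands are split into the corresponding 32-bit immediate.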
228 MachineOperand
229 AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
230                                            const TargetRegisterClass &SubRC,
231                                            unsigned SubIdx) const {
232 
233   MachineInstr *MI = MO.getParent();
234   MachineBasicBlock *BB = MO.getParent()->getParent();
235   Register DstReg = MRI->createVirtualRegister(&SubRC);
236 
237   if (MO.isReg()) {
238     unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
239     Register Reg = MO.getReg();
240     BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
241             .addReg(Reg, 0, ComposedSubIdx);
242 
243     return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
244                                      MO.isKill(), MO.isDead(), MO.isUndef(),
245                                      MO.isEarlyClobber(), 0, MO.isDebug(),
246                                      MO.isInternalRead());
247   }
248 
249   assert(MO.isImm());
250 
251   APInt Imm(64, MO.getImm());
252 
253   switch (SubIdx) {
254   default:
    llvm_unreachable("do not know how to split immediate with this sub index.");
256   case AMDGPU::sub0:
257     return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
258   case AMDGPU::sub1:
259     return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
260   }
261 }
262 
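// Map G_AND/G_OR/G_XOR to the equivalent 32- or 64-bit scalar ALU opcode.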
263 static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
264   switch (Opc) {
265   case AMDGPU::G_AND:
266     return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
267   case AMDGPU::G_OR:
268     return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
269   case AMDGPU::G_XOR:
270     return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
271   default:
272     llvm_unreachable("not a bit op");
273   }
274 }
275 
276 bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
277   Register DstReg = I.getOperand(0).getReg();
278   unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
279 
280   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
281   if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
282       DstRB->getID() != AMDGPU::VCCRegBankID)
283     return false;
284 
285   bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
286                             STI.isWave64());
287   I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));
288 
289   // Dead implicit-def of scc
290   I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
291                                          true, // isImp
292                                          false, // isKill
293                                          true)); // isDead
294   return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
295 }
296 
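// Select scalar G_ADD/G_SUB (and G_PTR_ADD). 32-bit cases map to a single SALU
// or VALU instruction; 64-bit adds are split into a low add that defines a
// carry and a high add that consumes it, recombined with a REG_SEQUENCE.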
297 bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
298   MachineBasicBlock *BB = I.getParent();
299   MachineFunction *MF = BB->getParent();
300   Register DstReg = I.getOperand(0).getReg();
301   const DebugLoc &DL = I.getDebugLoc();
302   LLT Ty = MRI->getType(DstReg);
303   if (Ty.isVector())
304     return false;
305 
306   unsigned Size = Ty.getSizeInBits();
307   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
308   const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
309   const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;
310 
311   if (Size == 32) {
312     if (IsSALU) {
313       const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
314       MachineInstr *Add =
315         BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
316         .add(I.getOperand(1))
317         .add(I.getOperand(2));
318       I.eraseFromParent();
319       return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
320     }
321 
322     if (STI.hasAddNoCarry()) {
323       const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
324       I.setDesc(TII.get(Opc));
325       I.addOperand(*MF, MachineOperand::CreateImm(0));
326       I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
327       return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
328     }
329 
330     const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;
331 
332     Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
333     MachineInstr *Add
334       = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
335       .addDef(UnusedCarry, RegState::Dead)
336       .add(I.getOperand(1))
337       .add(I.getOperand(2))
338       .addImm(0);
339     I.eraseFromParent();
340     return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
341   }
342 
343   assert(!Sub && "illegal sub should not reach here");
344 
345   const TargetRegisterClass &RC
346     = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
347   const TargetRegisterClass &HalfRC
348     = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;
349 
350   MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
351   MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
352   MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
353   MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));
354 
355   Register DstLo = MRI->createVirtualRegister(&HalfRC);
356   Register DstHi = MRI->createVirtualRegister(&HalfRC);
357 
358   if (IsSALU) {
359     BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
360       .add(Lo1)
361       .add(Lo2);
362     BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
363       .add(Hi1)
364       .add(Hi2);
365   } else {
366     const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
367     Register CarryReg = MRI->createVirtualRegister(CarryRC);
368     BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
369       .addDef(CarryReg)
370       .add(Lo1)
371       .add(Lo2)
372       .addImm(0);
373     MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
374       .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
375       .add(Hi1)
376       .add(Hi2)
377       .addReg(CarryReg, RegState::Kill)
378       .addImm(0);
379 
380     if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
381       return false;
382   }
383 
384   BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
385     .addReg(DstLo)
386     .addImm(AMDGPU::sub0)
387     .addReg(DstHi)
388     .addImm(AMDGPU::sub1);
389 
391   if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
392     return false;
393 
394   I.eraseFromParent();
395   return true;
396 }
397 
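// Select the overflow add/sub operations. A VCC-bank carry-out selects the
// VALU carry forms directly; an SGPR carry-out lowers to S_ADD/S_SUB (or the
// carry-in variants) with the carry copied to and from SCC.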
398 bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
399   MachineInstr &I) const {
400   MachineBasicBlock *BB = I.getParent();
401   MachineFunction *MF = BB->getParent();
402   const DebugLoc &DL = I.getDebugLoc();
403   Register Dst0Reg = I.getOperand(0).getReg();
404   Register Dst1Reg = I.getOperand(1).getReg();
405   const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
406                      I.getOpcode() == AMDGPU::G_UADDE;
407   const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
408                           I.getOpcode() == AMDGPU::G_USUBE;
409 
410   if (isVCC(Dst1Reg, *MRI)) {
411     unsigned NoCarryOpc =
412         IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
413     unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
414     I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
415     I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
416     I.addOperand(*MF, MachineOperand::CreateImm(0));
417     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
418   }
419 
420   Register Src0Reg = I.getOperand(2).getReg();
421   Register Src1Reg = I.getOperand(3).getReg();
422 
423   if (HasCarryIn) {
424     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
425       .addReg(I.getOperand(4).getReg());
426   }
427 
428   unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
429   unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
430 
431   BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
432     .add(I.getOperand(2))
433     .add(I.getOperand(3));
434   BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
435     .addReg(AMDGPU::SCC);
436 
437   if (!MRI->getRegClassOrNull(Dst1Reg))
438     MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
439 
440   if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
441       !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
442       !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
443     return false;
444 
445   if (HasCarryIn &&
446       !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
447                                     AMDGPU::SReg_32RegClass, *MRI))
448     return false;
449 
450   I.eraseFromParent();
451   return true;
452 }
453 
454 // TODO: We should probably legalize these to only using 32-bit results.
455 bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
456   MachineBasicBlock *BB = I.getParent();
457   Register DstReg = I.getOperand(0).getReg();
458   Register SrcReg = I.getOperand(1).getReg();
459   LLT DstTy = MRI->getType(DstReg);
460   LLT SrcTy = MRI->getType(SrcReg);
461   const unsigned SrcSize = SrcTy.getSizeInBits();
462   unsigned DstSize = DstTy.getSizeInBits();
463 
464   // TODO: Should handle any multiple of 32 offset.
465   unsigned Offset = I.getOperand(2).getImm();
466   if (Offset % 32 != 0 || DstSize > 128)
467     return false;
468 
469   // 16-bit operations really use 32-bit registers.
470   // FIXME: Probably should not allow 16-bit G_EXTRACT results.
471   if (DstSize == 16)
472     DstSize = 32;
473 
474   const TargetRegisterClass *DstRC =
475     TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
476   if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
477     return false;
478 
479   const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
480   const TargetRegisterClass *SrcRC =
481     TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
482   if (!SrcRC)
483     return false;
484   unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
485                                                          DstSize / 32);
486   SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
487   if (!SrcRC)
488     return false;
489 
490   SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
491                                     *SrcRC, I.getOperand(1));
492   const DebugLoc &DL = I.getDebugLoc();
493   BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
494     .addReg(SrcReg, 0, SubReg);
495 
496   I.eraseFromParent();
497   return true;
498 }
499 
500 bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
501   MachineBasicBlock *BB = MI.getParent();
502   Register DstReg = MI.getOperand(0).getReg();
503   LLT DstTy = MRI->getType(DstReg);
504   LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());
505 
506   const unsigned SrcSize = SrcTy.getSizeInBits();
507   if (SrcSize < 32)
508     return selectImpl(MI, *CoverageInfo);
509 
510   const DebugLoc &DL = MI.getDebugLoc();
511   const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
512   const unsigned DstSize = DstTy.getSizeInBits();
513   const TargetRegisterClass *DstRC =
514     TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
515   if (!DstRC)
516     return false;
517 
518   ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
519   MachineInstrBuilder MIB =
520     BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
521   for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
522     MachineOperand &Src = MI.getOperand(I + 1);
523     MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
524     MIB.addImm(SubRegs[I]);
525 
526     const TargetRegisterClass *SrcRC
527       = TRI.getConstrainedRegClassForOperand(Src, *MRI);
528     if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
529       return false;
530   }
531 
532   if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
533     return false;
534 
535   MI.eraseFromParent();
536   return true;
537 }
538 
539 bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
540   MachineBasicBlock *BB = MI.getParent();
541   const int NumDst = MI.getNumOperands() - 1;
542 
543   MachineOperand &Src = MI.getOperand(NumDst);
544 
545   Register SrcReg = Src.getReg();
546   Register DstReg0 = MI.getOperand(0).getReg();
547   LLT DstTy = MRI->getType(DstReg0);
548   LLT SrcTy = MRI->getType(SrcReg);
549 
550   const unsigned DstSize = DstTy.getSizeInBits();
551   const unsigned SrcSize = SrcTy.getSizeInBits();
552   const DebugLoc &DL = MI.getDebugLoc();
553   const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
554 
555   const TargetRegisterClass *SrcRC =
556     TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
557   if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
558     return false;
559 
560   // Note we could have mixed SGPR and VGPR destination banks for an SGPR
561   // source, and this relies on the fact that the same subregister indices are
562   // used for both.
563   ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
564   for (int I = 0, E = NumDst; I != E; ++I) {
565     MachineOperand &Dst = MI.getOperand(I);
566     BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
567       .addReg(SrcReg, 0, SubRegs[I]);
568 
569     // Make sure the subregister index is valid for the source register.
570     SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
571     if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
572       return false;
573 
574     const TargetRegisterClass *DstRC =
575       TRI.getConstrainedRegClassForOperand(Dst, *MRI);
576     if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
577       return false;
578   }
579 
580   MI.eraseFromParent();
581   return true;
582 }
583 
584 bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC(
585   MachineInstr &MI) const {
586   if (selectImpl(MI, *CoverageInfo))
587     return true;
588 
589   const LLT S32 = LLT::scalar(32);
590   const LLT V2S16 = LLT::vector(2, 16);
591 
592   Register Dst = MI.getOperand(0).getReg();
593   if (MRI->getType(Dst) != V2S16)
594     return false;
595 
596   const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
597   if (DstBank->getID() != AMDGPU::SGPRRegBankID)
598     return false;
599 
600   Register Src0 = MI.getOperand(1).getReg();
601   Register Src1 = MI.getOperand(2).getReg();
602   if (MRI->getType(Src0) != S32)
603     return false;
604 
605   const DebugLoc &DL = MI.getDebugLoc();
606   MachineBasicBlock *BB = MI.getParent();
607 
608   auto ConstSrc1 =
609       getConstantVRegValWithLookThrough(Src1, *MRI, true, true, true);
610   if (ConstSrc1) {
611     auto ConstSrc0 =
612         getConstantVRegValWithLookThrough(Src0, *MRI, true, true, true);
613     if (ConstSrc0) {
614       const int64_t K0 = ConstSrc0->Value.getSExtValue();
615       const int64_t K1 = ConstSrc1->Value.getSExtValue();
616       uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
617       uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
618 
619       BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst)
620         .addImm(Lo16 | (Hi16 << 16));
621       MI.eraseFromParent();
622       return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
623     }
624   }
625 
626   // TODO: This should probably be a combine somewhere
  // (build_vector_trunc $src0, undef) -> (copy $src0)
628   MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
629   if (Src1Def && Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
630     MI.setDesc(TII.get(AMDGPU::COPY));
631     MI.RemoveOperand(2);
632     return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI) &&
633            RBI.constrainGenericRegister(Src0, AMDGPU::SReg_32RegClass, *MRI);
634   }
635 
636   Register ShiftSrc0;
637   Register ShiftSrc1;
638 
639   // With multiple uses of the shift, this will duplicate the shift and
640   // increase register pressure.
641   //
  // (build_vector_trunc (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16))
643   //  => (S_PACK_HH_B32_B16 $src0, $src1)
644   // (build_vector_trunc $src0, (lshr_oneuse SReg_32:$src1, 16))
645   //  => (S_PACK_LH_B32_B16 $src0, $src1)
646   // (build_vector_trunc $src0, $src1)
647   //  => (S_PACK_LL_B32_B16 $src0, $src1)
648 
649   bool Shift0 = mi_match(
650       Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16))));
651 
652   bool Shift1 = mi_match(
653       Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16))));
654 
655   unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
656   if (Shift0 && Shift1) {
657     Opc = AMDGPU::S_PACK_HH_B32_B16;
658     MI.getOperand(1).setReg(ShiftSrc0);
659     MI.getOperand(2).setReg(ShiftSrc1);
660   } else if (Shift1) {
661     Opc = AMDGPU::S_PACK_LH_B32_B16;
662     MI.getOperand(2).setReg(ShiftSrc1);
663   } else if (Shift0 && ConstSrc1 && ConstSrc1->Value == 0) {
664     // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
665     auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
666       .addReg(ShiftSrc0)
667       .addImm(16);
668 
669     MI.eraseFromParent();
670     return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
671   }
672 
673   MI.setDesc(TII.get(Opc));
674   return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
675 }
676 
677 bool AMDGPUInstructionSelector::selectG_PTR_ADD(MachineInstr &I) const {
678   return selectG_ADD_SUB(I);
679 }
680 
681 bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
682   const MachineOperand &MO = I.getOperand(0);
683 
684   // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
685   // regbank check here is to know why getConstrainedRegClassForOperand failed.
686   const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
687   if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
688       (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
689     I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
690     return true;
691   }
692 
693   return false;
694 }
695 
696 bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
697   MachineBasicBlock *BB = I.getParent();
698 
699   Register DstReg = I.getOperand(0).getReg();
700   Register Src0Reg = I.getOperand(1).getReg();
701   Register Src1Reg = I.getOperand(2).getReg();
702   LLT Src1Ty = MRI->getType(Src1Reg);
703 
704   unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
705   unsigned InsSize = Src1Ty.getSizeInBits();
706 
707   int64_t Offset = I.getOperand(3).getImm();
708 
709   // FIXME: These cases should have been illegal and unnecessary to check here.
710   if (Offset % 32 != 0 || InsSize % 32 != 0)
711     return false;
712 
713   // Currently not handled by getSubRegFromChannel.
714   if (InsSize > 128)
715     return false;
716 
717   unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
718   if (SubReg == AMDGPU::NoSubRegister)
719     return false;
720 
721   const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
722   const TargetRegisterClass *DstRC =
723     TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
724   if (!DstRC)
725     return false;
726 
727   const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
728   const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
729   const TargetRegisterClass *Src0RC =
730     TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank, *MRI);
731   const TargetRegisterClass *Src1RC =
732     TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank, *MRI);
733 
734   // Deal with weird cases where the class only partially supports the subreg
735   // index.
736   Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
737   if (!Src0RC || !Src1RC)
738     return false;
739 
740   if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
741       !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
742       !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
743     return false;
744 
745   const DebugLoc &DL = I.getDebugLoc();
746   BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
747     .addReg(Src0Reg)
748     .addReg(Src1Reg)
749     .addImm(SubReg);
750 
751   I.eraseFromParent();
752   return true;
753 }
754 
755 bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
756   if (STI.getLDSBankCount() != 16)
757     return selectImpl(MI, *CoverageInfo);
758 
759   Register Dst = MI.getOperand(0).getReg();
760   Register Src0 = MI.getOperand(2).getReg();
761   Register M0Val = MI.getOperand(6).getReg();
762   if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
763       !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
764       !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
765     return false;
766 
767   // This requires 2 instructions. It is possible to write a pattern to support
768   // this, but the generated isel emitter doesn't correctly deal with multiple
769   // output instructions using the same physical register input. The copy to m0
770   // is incorrectly placed before the second instruction.
771   //
772   // TODO: Match source modifiers.
773 
774   Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
775   const DebugLoc &DL = MI.getDebugLoc();
776   MachineBasicBlock *MBB = MI.getParent();
777 
778   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
779     .addReg(M0Val);
780   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
781     .addImm(2)
782     .addImm(MI.getOperand(4).getImm())  // $attr
783     .addImm(MI.getOperand(3).getImm()); // $attrchan
784 
785   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
786     .addImm(0)                          // $src0_modifiers
787     .addReg(Src0)                       // $src0
788     .addImm(MI.getOperand(4).getImm())  // $attr
789     .addImm(MI.getOperand(3).getImm())  // $attrchan
790     .addImm(0)                          // $src2_modifiers
791     .addReg(InterpMov)                  // $src2 - 2 f16 values selected by high
792     .addImm(MI.getOperand(5).getImm())  // $high
793     .addImm(0)                          // $clamp
794     .addImm(0);                         // $omod
795 
796   MI.eraseFromParent();
797   return true;
798 }
799 
800 // Writelane is special in that it can use SGPR and M0 (which would normally
801 // count as using the constant bus twice - but in this case it is allowed since
802 // the lane selector doesn't count as a use of the constant bus). However, it is
803 // still required to abide by the 1 SGPR rule. Fix this up if we might have
804 // multiple SGPRs.
805 bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
806   // With a constant bus limit of at least 2, there's no issue.
807   if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
808     return selectImpl(MI, *CoverageInfo);
809 
810   MachineBasicBlock *MBB = MI.getParent();
811   const DebugLoc &DL = MI.getDebugLoc();
812   Register VDst = MI.getOperand(0).getReg();
813   Register Val = MI.getOperand(2).getReg();
814   Register LaneSelect = MI.getOperand(3).getReg();
815   Register VDstIn = MI.getOperand(4).getReg();
816 
817   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);
818 
819   Optional<ValueAndVReg> ConstSelect =
820     getConstantVRegValWithLookThrough(LaneSelect, *MRI, true, true);
821   if (ConstSelect) {
822     // The selector has to be an inline immediate, so we can use whatever for
823     // the other operands.
824     MIB.addReg(Val);
825     MIB.addImm(ConstSelect->Value.getSExtValue() &
826                maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
827   } else {
828     Optional<ValueAndVReg> ConstVal =
829       getConstantVRegValWithLookThrough(Val, *MRI, true, true);
830 
831     // If the value written is an inline immediate, we can get away without a
832     // copy to m0.
833     if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
834                                                  STI.hasInv2PiInlineImm())) {
835       MIB.addImm(ConstVal->Value.getSExtValue());
836       MIB.addReg(LaneSelect);
837     } else {
838       MIB.addReg(Val);
839 
840       // If the lane selector was originally in a VGPR and copied with
841       // readfirstlane, there's a hazard to read the same SGPR from the
842       // VALU. Constrain to a different SGPR to help avoid needing a nop later.
843       RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);
844 
845       BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
846         .addReg(LaneSelect);
847       MIB.addReg(AMDGPU::M0);
848     }
849   }
850 
851   MIB.addReg(VDstIn);
852 
853   MI.eraseFromParent();
854   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
855 }
856 
857 // We need to handle this here because tablegen doesn't support matching
858 // instructions with multiple outputs.
859 bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
860   Register Dst0 = MI.getOperand(0).getReg();
861   Register Dst1 = MI.getOperand(1).getReg();
862 
863   LLT Ty = MRI->getType(Dst0);
864   unsigned Opc;
865   if (Ty == LLT::scalar(32))
866     Opc = AMDGPU::V_DIV_SCALE_F32_e64;
867   else if (Ty == LLT::scalar(64))
868     Opc = AMDGPU::V_DIV_SCALE_F64_e64;
869   else
870     return false;
871 
872   // TODO: Match source modifiers.
873 
874   const DebugLoc &DL = MI.getDebugLoc();
875   MachineBasicBlock *MBB = MI.getParent();
876 
877   Register Numer = MI.getOperand(3).getReg();
878   Register Denom = MI.getOperand(4).getReg();
879   unsigned ChooseDenom = MI.getOperand(5).getImm();
880 
881   Register Src0 = ChooseDenom != 0 ? Numer : Denom;
882 
883   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
884     .addDef(Dst1)
885     .addImm(0)     // $src0_modifiers
886     .addUse(Src0)  // $src0
887     .addImm(0)     // $src1_modifiers
888     .addUse(Denom) // $src1
889     .addImm(0)     // $src2_modifiers
890     .addUse(Numer) // $src2
891     .addImm(0)     // $clamp
892     .addImm(0);    // $omod
893 
894   MI.eraseFromParent();
895   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
896 }
897 
898 bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
899   unsigned IntrinsicID = I.getIntrinsicID();
900   switch (IntrinsicID) {
901   case Intrinsic::amdgcn_if_break: {
902     MachineBasicBlock *BB = I.getParent();
903 
    // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
905     // SelectionDAG uses for wave32 vs wave64.
906     BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
907       .add(I.getOperand(0))
908       .add(I.getOperand(2))
909       .add(I.getOperand(3));
910 
911     Register DstReg = I.getOperand(0).getReg();
912     Register Src0Reg = I.getOperand(2).getReg();
913     Register Src1Reg = I.getOperand(3).getReg();
914 
915     I.eraseFromParent();
916 
917     for (Register Reg : { DstReg, Src0Reg, Src1Reg })
918       MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
919 
920     return true;
921   }
922   case Intrinsic::amdgcn_interp_p1_f16:
923     return selectInterpP1F16(I);
924   case Intrinsic::amdgcn_wqm:
925     return constrainCopyLikeIntrin(I, AMDGPU::WQM);
926   case Intrinsic::amdgcn_softwqm:
927     return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
928   case Intrinsic::amdgcn_wwm:
929     return constrainCopyLikeIntrin(I, AMDGPU::WWM);
930   case Intrinsic::amdgcn_writelane:
931     return selectWritelane(I);
932   case Intrinsic::amdgcn_div_scale:
933     return selectDivScale(I);
934   case Intrinsic::amdgcn_icmp:
935     return selectIntrinsicIcmp(I);
936   case Intrinsic::amdgcn_ballot:
937     return selectBallot(I);
938   case Intrinsic::amdgcn_reloc_constant:
939     return selectRelocConstant(I);
940   case Intrinsic::amdgcn_groupstaticsize:
941     return selectGroupStaticSize(I);
942   case Intrinsic::returnaddress:
943     return selectReturnAddress(I);
944   default:
945     return selectImpl(I, *CoverageInfo);
946   }
947 }
948 
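// Return the VALU (VOP3) compare opcode for an integer predicate on a 32- or
// 64-bit operand, or -1 if the size is unsupported.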
949 static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size) {
950   if (Size != 32 && Size != 64)
951     return -1;
952   switch (P) {
953   default:
954     llvm_unreachable("Unknown condition code!");
955   case CmpInst::ICMP_NE:
956     return Size == 32 ? AMDGPU::V_CMP_NE_U32_e64 : AMDGPU::V_CMP_NE_U64_e64;
957   case CmpInst::ICMP_EQ:
958     return Size == 32 ? AMDGPU::V_CMP_EQ_U32_e64 : AMDGPU::V_CMP_EQ_U64_e64;
959   case CmpInst::ICMP_SGT:
960     return Size == 32 ? AMDGPU::V_CMP_GT_I32_e64 : AMDGPU::V_CMP_GT_I64_e64;
961   case CmpInst::ICMP_SGE:
962     return Size == 32 ? AMDGPU::V_CMP_GE_I32_e64 : AMDGPU::V_CMP_GE_I64_e64;
963   case CmpInst::ICMP_SLT:
964     return Size == 32 ? AMDGPU::V_CMP_LT_I32_e64 : AMDGPU::V_CMP_LT_I64_e64;
965   case CmpInst::ICMP_SLE:
966     return Size == 32 ? AMDGPU::V_CMP_LE_I32_e64 : AMDGPU::V_CMP_LE_I64_e64;
967   case CmpInst::ICMP_UGT:
968     return Size == 32 ? AMDGPU::V_CMP_GT_U32_e64 : AMDGPU::V_CMP_GT_U64_e64;
969   case CmpInst::ICMP_UGE:
970     return Size == 32 ? AMDGPU::V_CMP_GE_U32_e64 : AMDGPU::V_CMP_GE_U64_e64;
971   case CmpInst::ICMP_ULT:
972     return Size == 32 ? AMDGPU::V_CMP_LT_U32_e64 : AMDGPU::V_CMP_LT_U64_e64;
973   case CmpInst::ICMP_ULE:
974     return Size == 32 ? AMDGPU::V_CMP_LE_U32_e64 : AMDGPU::V_CMP_LE_U64_e64;
975   }
976 }
977 
978 int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
979                                               unsigned Size) const {
980   if (Size == 64) {
981     if (!STI.hasScalarCompareEq64())
982       return -1;
983 
984     switch (P) {
985     case CmpInst::ICMP_NE:
986       return AMDGPU::S_CMP_LG_U64;
987     case CmpInst::ICMP_EQ:
988       return AMDGPU::S_CMP_EQ_U64;
989     default:
990       return -1;
991     }
992   }
993 
994   if (Size != 32)
995     return -1;
996 
997   switch (P) {
998   case CmpInst::ICMP_NE:
999     return AMDGPU::S_CMP_LG_U32;
1000   case CmpInst::ICMP_EQ:
1001     return AMDGPU::S_CMP_EQ_U32;
1002   case CmpInst::ICMP_SGT:
1003     return AMDGPU::S_CMP_GT_I32;
1004   case CmpInst::ICMP_SGE:
1005     return AMDGPU::S_CMP_GE_I32;
1006   case CmpInst::ICMP_SLT:
1007     return AMDGPU::S_CMP_LT_I32;
1008   case CmpInst::ICMP_SLE:
1009     return AMDGPU::S_CMP_LE_I32;
1010   case CmpInst::ICMP_UGT:
1011     return AMDGPU::S_CMP_GT_U32;
1012   case CmpInst::ICMP_UGE:
1013     return AMDGPU::S_CMP_GE_U32;
1014   case CmpInst::ICMP_ULT:
1015     return AMDGPU::S_CMP_LT_U32;
1016   case CmpInst::ICMP_ULE:
1017     return AMDGPU::S_CMP_LE_U32;
1018   default:
1019     llvm_unreachable("Unknown condition code!");
1020   }
1021 }
1022 
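// Select G_ICMP: an SGPR destination uses S_CMP and copies SCC into the
// result; a VCC destination uses the corresponding VALU compare.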
1023 bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const {
1024   MachineBasicBlock *BB = I.getParent();
1025   const DebugLoc &DL = I.getDebugLoc();
1026 
1027   Register SrcReg = I.getOperand(2).getReg();
1028   unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1029 
1030   auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
1031 
1032   Register CCReg = I.getOperand(0).getReg();
1033   if (!isVCC(CCReg, *MRI)) {
1034     int Opcode = getS_CMPOpcode(Pred, Size);
1035     if (Opcode == -1)
1036       return false;
1037     MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
1038             .add(I.getOperand(2))
1039             .add(I.getOperand(3));
1040     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
1041       .addReg(AMDGPU::SCC);
1042     bool Ret =
1043         constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
1044         RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
1045     I.eraseFromParent();
1046     return Ret;
1047   }
1048 
1049   int Opcode = getV_CMPOpcode(Pred, Size);
1050   if (Opcode == -1)
1051     return false;
1052 
1053   MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode),
1054             I.getOperand(0).getReg())
1055             .add(I.getOperand(2))
1056             .add(I.getOperand(3));
1057   RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
1058                                *TRI.getBoolRC(), *MRI);
1059   bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1060   I.eraseFromParent();
1061   return Ret;
1062 }
1063 
1064 bool AMDGPUInstructionSelector::selectIntrinsicIcmp(MachineInstr &I) const {
1065   Register Dst = I.getOperand(0).getReg();
1066   if (isVCC(Dst, *MRI))
1067     return false;
1068 
1069   if (MRI->getType(Dst).getSizeInBits() != STI.getWavefrontSize())
1070     return false;
1071 
1072   MachineBasicBlock *BB = I.getParent();
1073   const DebugLoc &DL = I.getDebugLoc();
1074   Register SrcReg = I.getOperand(2).getReg();
1075   unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1076   auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
1077 
1078   int Opcode = getV_CMPOpcode(Pred, Size);
1079   if (Opcode == -1)
1080     return false;
1081 
1082   MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst)
1083                            .add(I.getOperand(2))
1084                            .add(I.getOperand(3));
1085   RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(), *TRI.getBoolRC(),
1086                                *MRI);
1087   bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1088   I.eraseFromParent();
1089   return Ret;
1090 }
1091 
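// amdgcn.ballot: a constant-false argument folds to 0 and a constant-true
// argument folds to a copy of EXEC; otherwise the source lane mask is copied
// directly into the result.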
1092 bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1093   MachineBasicBlock *BB = I.getParent();
1094   const DebugLoc &DL = I.getDebugLoc();
1095   Register DstReg = I.getOperand(0).getReg();
1096   const unsigned Size = MRI->getType(DstReg).getSizeInBits();
1097   const bool Is64 = Size == 64;
1098 
1099   if (Size != STI.getWavefrontSize())
1100     return false;
1101 
1102   Optional<ValueAndVReg> Arg =
1103       getConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI, true);
1104 
1105   if (Arg.hasValue()) {
1106     const int64_t Value = Arg.getValue().Value.getSExtValue();
1107     if (Value == 0) {
1108       unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1109       BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
1110     } else if (Value == -1) { // all ones
1111       Register SrcReg = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
1112       BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
1113     } else
1114       return false;
1115   } else {
1116     Register SrcReg = I.getOperand(2).getReg();
1117     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
1118   }
1119 
1120   I.eraseFromParent();
1121   return true;
1122 }
1123 
1124 bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1125   Register DstReg = I.getOperand(0).getReg();
1126   const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1127   const TargetRegisterClass *DstRC =
1128     TRI.getRegClassForSizeOnBank(32, *DstBank, *MRI);
1129   if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
1130     return false;
1131 
1132   const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1133 
1134   Module *M = MF->getFunction().getParent();
1135   const MDNode *Metadata = I.getOperand(2).getMetadata();
1136   auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
1137   auto RelocSymbol = cast<GlobalVariable>(
1138     M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
1139 
1140   MachineBasicBlock *BB = I.getParent();
1141   BuildMI(*BB, &I, I.getDebugLoc(),
1142           TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
1143     .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);
1144 
1145   I.eraseFromParent();
1146   return true;
1147 }
1148 
1149 bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
1150   Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();
1151 
1152   Register DstReg = I.getOperand(0).getReg();
1153   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1154   unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
1155     AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1156 
1157   MachineBasicBlock *MBB = I.getParent();
1158   const DebugLoc &DL = I.getDebugLoc();
1159 
1160   auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg);
1161 
1162   if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
1163     const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1164     MIB.addImm(MFI->getLDSSize());
1165   } else {
1166     Module *M = MF->getFunction().getParent();
1167     const GlobalValue *GV
1168       = Intrinsic::getDeclaration(M, Intrinsic::amdgcn_groupstaticsize);
1169     MIB.addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
1170   }
1171 
1172   I.eraseFromParent();
1173   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1174 }
1175 
1176 bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
1177   MachineBasicBlock *MBB = I.getParent();
1178   MachineFunction &MF = *MBB->getParent();
1179   const DebugLoc &DL = I.getDebugLoc();
1180 
1181   MachineOperand &Dst = I.getOperand(0);
1182   Register DstReg = Dst.getReg();
1183   unsigned Depth = I.getOperand(2).getImm();
1184 
1185   const TargetRegisterClass *RC
1186     = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
1187   if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
1188       !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
1189     return false;
1190 
  // Nonzero depths and entry functions (kernels and shaders) have no return
  // address to report; just return 0.
1192   if (Depth != 0 ||
1193       MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1194     BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
1195       .addImm(0);
1196     I.eraseFromParent();
1197     return true;
1198   }
1199 
1200   MachineFrameInfo &MFI = MF.getFrameInfo();
1201   // There is a call to @llvm.returnaddress in this function
1202   MFI.setReturnAddressIsTaken(true);
1203 
1204   // Get the return address reg and mark it as an implicit live-in
1205   Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
1206   Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
1207                                              AMDGPU::SReg_64RegClass);
1208   BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1209     .addReg(LiveIn);
1210   I.eraseFromParent();
1211   return true;
1212 }
1213 
1214 bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
  // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1216   // SelectionDAG uses for wave32 vs wave64.
1217   MachineBasicBlock *BB = MI.getParent();
1218   BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1219       .add(MI.getOperand(1));
1220 
1221   Register Reg = MI.getOperand(1).getReg();
1222   MI.eraseFromParent();
1223 
1224   if (!MRI->getRegClassOrNull(Reg))
1225     MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1226   return true;
1227 }
1228 
1229 bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1230   MachineInstr &MI, Intrinsic::ID IntrID) const {
1231   MachineBasicBlock *MBB = MI.getParent();
1232   MachineFunction *MF = MBB->getParent();
1233   const DebugLoc &DL = MI.getDebugLoc();
1234 
1235   unsigned IndexOperand = MI.getOperand(7).getImm();
1236   bool WaveRelease = MI.getOperand(8).getImm() != 0;
1237   bool WaveDone = MI.getOperand(9).getImm() != 0;
1238 
1239   if (WaveDone && !WaveRelease)
1240     report_fatal_error("ds_ordered_count: wave_done requires wave_release");
1241 
1242   unsigned OrderedCountIndex = IndexOperand & 0x3f;
1243   IndexOperand &= ~0x3f;
1244   unsigned CountDw = 0;
1245 
1246   if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
1247     CountDw = (IndexOperand >> 24) & 0xf;
1248     IndexOperand &= ~(0xf << 24);
1249 
1250     if (CountDw < 1 || CountDw > 4) {
1251       report_fatal_error(
1252         "ds_ordered_count: dword count must be between 1 and 4");
1253     }
1254   }
1255 
1256   if (IndexOperand)
1257     report_fatal_error("ds_ordered_count: bad index operand");
1258 
1259   unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1260   unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);
1261 
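  // Encode the DS_ORDERED_COUNT offset field: the low byte holds the ordered
  // count index scaled by 4, and the high byte packs wave_release, wave_done,
  // the shader type, the add/swap selector, and (on GFX10+) the dword count
  // minus one.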
1262   unsigned Offset0 = OrderedCountIndex << 2;
1263   unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) |
1264                      (Instruction << 4);
1265 
1266   if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
1267     Offset1 |= (CountDw - 1) << 6;
1268 
1269   unsigned Offset = Offset0 | (Offset1 << 8);
1270 
1271   Register M0Val = MI.getOperand(2).getReg();
1272   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1273     .addReg(M0Val);
1274 
1275   Register DstReg = MI.getOperand(0).getReg();
1276   Register ValReg = MI.getOperand(3).getReg();
1277   MachineInstrBuilder DS =
1278     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
1279       .addReg(ValReg)
1280       .addImm(Offset)
1281       .cloneMemRefs(MI);
1282 
1283   if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
1284     return false;
1285 
1286   bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
1287   MI.eraseFromParent();
1288   return Ret;
1289 }
1290 
1291 static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1292   switch (IntrID) {
1293   case Intrinsic::amdgcn_ds_gws_init:
1294     return AMDGPU::DS_GWS_INIT;
1295   case Intrinsic::amdgcn_ds_gws_barrier:
1296     return AMDGPU::DS_GWS_BARRIER;
1297   case Intrinsic::amdgcn_ds_gws_sema_v:
1298     return AMDGPU::DS_GWS_SEMA_V;
1299   case Intrinsic::amdgcn_ds_gws_sema_br:
1300     return AMDGPU::DS_GWS_SEMA_BR;
1301   case Intrinsic::amdgcn_ds_gws_sema_p:
1302     return AMDGPU::DS_GWS_SEMA_P;
1303   case Intrinsic::amdgcn_ds_gws_sema_release_all:
1304     return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1305   default:
1306     llvm_unreachable("not a gws intrinsic");
1307   }
1308 }
1309 
1310 bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
1311                                                      Intrinsic::ID IID) const {
1312   if (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1313       !STI.hasGWSSemaReleaseAll())
1314     return false;
1315 
1316   // intrinsic ID, vsrc, offset
1317   const bool HasVSrc = MI.getNumOperands() == 3;
1318   assert(HasVSrc || MI.getNumOperands() == 2);
1319 
1320   Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
1321   const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
1322   if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
1323     return false;
1324 
1325   MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1326   assert(OffsetDef);
1327 
1328   unsigned ImmOffset;
1329 
1330   MachineBasicBlock *MBB = MI.getParent();
1331   const DebugLoc &DL = MI.getDebugLoc();
1332 
1333   MachineInstr *Readfirstlane = nullptr;
1334 
1335   // If we legalized the VGPR input, strip out the readfirstlane to analyze the
1336   // incoming offset, in case there's an add of a constant. We'll have to put it
1337   // back later.
1338   if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
1339     Readfirstlane = OffsetDef;
1340     BaseOffset = OffsetDef->getOperand(1).getReg();
1341     OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1342   }
1343 
1344   if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
1345     // If we have a constant offset, try to use the 0 in m0 as the base.
1346     // TODO: Look into changing the default m0 initialization value. If the
1347     // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
1348     // the immediate offset.
1349 
1350     ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
1351     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1352       .addImm(0);
1353   } else {
1354     std::tie(BaseOffset, ImmOffset) =
1355         AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset);
1356 
1357     if (Readfirstlane) {
1358       // We have the constant offset now, so put the readfirstlane back on the
1359       // variable component.
1360       if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
1361         return false;
1362 
1363       Readfirstlane->getOperand(1).setReg(BaseOffset);
1364       BaseOffset = Readfirstlane->getOperand(0).getReg();
1365     } else {
1366       if (!RBI.constrainGenericRegister(BaseOffset,
1367                                         AMDGPU::SReg_32RegClass, *MRI))
1368         return false;
1369     }
1370 
1371     Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1372     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
1373       .addReg(BaseOffset)
1374       .addImm(16);
1375 
1376     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1377       .addReg(M0Base);
1378   }
1379 
1380   // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
1381   // offset field) % 64. Some versions of the programming guide omit the m0
1382   // part, or claim it's from offset 0.
1383   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));
1384 
1385   if (HasVSrc) {
1386     Register VSrc = MI.getOperand(1).getReg();
1387     MIB.addReg(VSrc);
1388     if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
1389       return false;
1390   }
1391 
1392   MIB.addImm(ImmOffset)
1393      .cloneMemRefs(MI);
1394 
1395   MI.eraseFromParent();
1396   return true;
1397 }
1398 
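// ds_append/ds_consume take their base address in m0 plus an immediate offset.
// Fold a constant offset out of the pointer when it remains legal; otherwise
// use the original pointer with a zero offset.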
1399 bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
1400                                                       bool IsAppend) const {
1401   Register PtrBase = MI.getOperand(2).getReg();
1402   LLT PtrTy = MRI->getType(PtrBase);
1403   bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
1404 
1405   unsigned Offset;
1406   std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
1407 
1408   // TODO: Should this try to look through readfirstlane like GWS?
1409   if (!isDSOffsetLegal(PtrBase, Offset)) {
1410     PtrBase = MI.getOperand(2).getReg();
1411     Offset = 0;
1412   }
1413 
1414   MachineBasicBlock *MBB = MI.getParent();
1415   const DebugLoc &DL = MI.getDebugLoc();
1416   const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
1417 
1418   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1419     .addReg(PtrBase);
1420   if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
1421     return false;
1422 
1423   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
1424     .addImm(Offset)
1425     .addImm(IsGDS ? -1 : 0)
1426     .cloneMemRefs(MI);
1427   MI.eraseFromParent();
1428   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1429 }
1430 
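// When optimizing, s_barrier is replaced with a WAVE_BARRIER (a code motion
// barrier only) if the entire workgroup fits within a single wave.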
1431 bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
1432   if (TM.getOptLevel() > CodeGenOpt::None) {
1433     unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second;
1434     if (WGSize <= STI.getWavefrontSize()) {
1435       MachineBasicBlock *MBB = MI.getParent();
1436       const DebugLoc &DL = MI.getDebugLoc();
1437       BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::WAVE_BARRIER));
1438       MI.eraseFromParent();
1439       return true;
1440     }
1441   }
1442   return selectImpl(MI, *CoverageInfo);
1443 }
1444 
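// Decode the texfailctrl immediate: bit 0 enables TFE and bit 1 enables LWE.
// Any set bit marks the operation as a tex-fail variant; returns false if
// unknown bits remain.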
1445 static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
1446                          bool &IsTexFail) {
1447   if (TexFailCtrl)
1448     IsTexFail = true;
1449 
1450   TFE = (TexFailCtrl & 0x1) ? 1 : 0;
1451   TexFailCtrl &= ~(uint64_t)0x1;
1452   LWE = (TexFailCtrl & 0x2) ? 1 : 0;
1453   TexFailCtrl &= ~(uint64_t)0x2;
1454 
1455   return TexFailCtrl == 0;
1456 }
1457 
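     // Unpack the cachepolicy immediate into the requested GLC/SLC/DLC bits.
     // Returns false if any unsupported or unknown bits are set.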
1458 static bool parseCachePolicy(uint64_t Value,
1459                              bool *GLC, bool *SLC, bool *DLC) {
1460   if (GLC) {
1461     *GLC = (Value & 0x1) ? 1 : 0;
1462     Value &= ~(uint64_t)0x1;
1463   }
1464   if (SLC) {
1465     *SLC = (Value & 0x2) ? 1 : 0;
1466     Value &= ~(uint64_t)0x2;
1467   }
1468   if (DLC) {
1469     *DLC = (Value & 0x4) ? 1 : 0;
1470     Value &= ~(uint64_t)0x4;
1471   }
1472 
1473   return Value == 0;
1474 }
1475 
1476 bool AMDGPUInstructionSelector::selectImageIntrinsic(
1477   MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
1478   MachineBasicBlock *MBB = MI.getParent();
1479   const DebugLoc &DL = MI.getDebugLoc();
1480 
1481   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
1482     AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1483 
1484   const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
1485   const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
1486       AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
1487   const AMDGPU::MIMGMIPMappingInfo *MIPMappingInfo =
1488       AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode);
1489   unsigned IntrOpcode = Intr->BaseOpcode;
1490   const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
1491 
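       // Intrinsic operands start after the explicit defs and the intrinsic ID
       // operand.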
1492   const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;
1493 
1494   Register VDataIn, VDataOut;
1495   LLT VDataTy;
1496   int NumVDataDwords = -1;
1497   bool IsD16 = false;
1498 
1499   bool Unorm;
1500   if (!BaseOpcode->Sampler)
1501     Unorm = true;
1502   else
1503     Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0;
1504 
1505   bool TFE;
1506   bool LWE;
1507   bool IsTexFail = false;
1508   if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(),
1509                     TFE, LWE, IsTexFail))
1510     return false;
1511 
1512   const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm();
1513   const bool IsA16 = (Flags & 1) != 0;
1514   const bool IsG16 = (Flags & 2) != 0;
1515 
1516   // A16 implies 16-bit gradients.
1517   if (IsA16 && !IsG16)
1518     return false;
1519 
1520   unsigned DMask = 0;
1521   unsigned DMaskLanes = 0;
1522 
1523   if (BaseOpcode->Atomic) {
1524     VDataOut = MI.getOperand(0).getReg();
1525     VDataIn = MI.getOperand(2).getReg();
1526     LLT Ty = MRI->getType(VDataIn);
1527 
1528     // Be careful to allow atomic swap on 16-bit element vectors.
1529     const bool Is64Bit = BaseOpcode->AtomicX2 ?
1530       Ty.getSizeInBits() == 128 :
1531       Ty.getSizeInBits() == 64;
1532 
1533     if (BaseOpcode->AtomicX2) {
1534       assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
1535 
1536       DMask = Is64Bit ? 0xf : 0x3;
1537       NumVDataDwords = Is64Bit ? 4 : 2;
1538     } else {
1539       DMask = Is64Bit ? 0x3 : 0x1;
1540       NumVDataDwords = Is64Bit ? 2 : 1;
1541     }
1542   } else {
1543     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
1544     DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);
1545 
1546     // One memoperand is mandatory, except for getresinfo.
1547     // FIXME: Check this in verifier.
1548     if (!MI.memoperands_empty()) {
1549       const MachineMemOperand *MMO = *MI.memoperands_begin();
1550 
1551       // Infer d16 from the memory size, as the register type will be mangled by
1552       // unpacked subtargets, or by TFE.
1553       IsD16 = ((8 * MMO->getSize()) / DMaskLanes) < 32;
1554     }
1555 
1556     if (BaseOpcode->Store) {
1557       VDataIn = MI.getOperand(1).getReg();
1558       VDataTy = MRI->getType(VDataIn);
1559       NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
1560     } else {
1561       VDataOut = MI.getOperand(0).getReg();
1562       VDataTy = MRI->getType(VDataOut);
1563       NumVDataDwords = DMaskLanes;
1564 
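           // Packed d16 returns two components per dword.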
1565       if (IsD16 && !STI.hasUnpackedD16VMem())
1566         NumVDataDwords = (DMaskLanes + 1) / 2;
1567     }
1568   }
1569 
1570   // Optimize _L to _LZ when _L is zero
1571   if (LZMappingInfo) {
1572     // The legalizer replaced the register with an immediate 0 if we need to
1573     // change the opcode.
1574     const MachineOperand &Lod = MI.getOperand(ArgOffset + Intr->LodIndex);
1575     if (Lod.isImm()) {
1576       assert(Lod.getImm() == 0);
1577       IntrOpcode = LZMappingInfo->LZ;  // set new opcode to _lz variant of _l
1578     }
1579   }
1580 
1581   // Optimize _mip away, when 'lod' is zero
1582   if (MIPMappingInfo) {
1583     const MachineOperand &Lod = MI.getOperand(ArgOffset + Intr->MipIndex);
1584     if (Lod.isImm()) {
1585       assert(Lod.getImm() == 0);
1586       IntrOpcode = MIPMappingInfo->NONMIP;  // set new opcode to variant without _mip
1587     }
1588   }
1589 
1590   // Set G16 opcode
1591   if (IsG16 && !IsA16) {
1592     const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
1593         AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
1594     assert(G16MappingInfo);
1595     IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
1596   }
1597 
1598   // TODO: Check this in verifier.
1599   assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
1600 
1601   bool GLC = false;
1602   bool SLC = false;
1603   bool DLC = false;
1604   if (BaseOpcode->Atomic) {
1605     GLC = true; // TODO no-return optimization
1606     if (!parseCachePolicy(
1607             MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm(), nullptr,
1608             &SLC, IsGFX10Plus ? &DLC : nullptr))
1609       return false;
1610   } else {
1611     if (!parseCachePolicy(
1612             MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm(), &GLC,
1613             &SLC, IsGFX10Plus ? &DLC : nullptr))
1614       return false;
1615   }
1616 
1617   int NumVAddrRegs = 0;
1618   int NumVAddrDwords = 0;
1619   for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
1620     // Skip the $noregs and 0s inserted during legalization.
1621     MachineOperand &AddrOp = MI.getOperand(ArgOffset + I);
1622     if (!AddrOp.isReg())
1623       continue; // XXX - Break?
1624 
1625     Register Addr = AddrOp.getReg();
1626     if (!Addr)
1627       break;
1628 
1629     ++NumVAddrRegs;
1630     NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
1631   }
1632 
1633   // The legalizer preprocessed the intrinsic arguments. If we aren't using
1634   // NSA, these should have been packed into a single value in the first
1635   // address register.
1636   const bool UseNSA = NumVAddrRegs != 1 && NumVAddrDwords == NumVAddrRegs;
1637   if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
1638     LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
1639     return false;
1640   }
1641 
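       // A tfe/lwe result takes one extra vdata dword to hold the fail status.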
1642   if (IsTexFail)
1643     ++NumVDataDwords;
1644 
1645   int Opcode = -1;
1646   if (IsGFX10Plus) {
1647     Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
1648                                    UseNSA ? AMDGPU::MIMGEncGfx10NSA
1649                                           : AMDGPU::MIMGEncGfx10Default,
1650                                    NumVDataDwords, NumVAddrDwords);
1651   } else {
1652     if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
1653       Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
1654                                      NumVDataDwords, NumVAddrDwords);
1655     if (Opcode == -1)
1656       Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
1657                                      NumVDataDwords, NumVAddrDwords);
1658   }
1659   assert(Opcode != -1);
1660 
1661   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
1662     .cloneMemRefs(MI);
1663 
1664   if (VDataOut) {
1665     if (BaseOpcode->AtomicX2) {
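           // The AtomicX2 (cmpswap) destination is twice the data size; the
           // original value is returned in the low half, so write to a wide
           // temporary and copy out the low subregister.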
1666       const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
1667 
1668       Register TmpReg = MRI->createVirtualRegister(
1669         Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
1670       unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
1671 
1672       MIB.addDef(TmpReg);
1673       BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
1674         .addReg(TmpReg, RegState::Kill, SubReg);
1675 
1676     } else {
1677       MIB.addDef(VDataOut); // vdata output
1678     }
1679   }
1680 
1681   if (VDataIn)
1682     MIB.addReg(VDataIn); // vdata input
1683 
1684   for (int I = 0; I != NumVAddrRegs; ++I) {
1685     MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I);
1686     if (SrcOp.isReg()) {
1687       assert(SrcOp.getReg() != 0);
1688       MIB.addReg(SrcOp.getReg());
1689     }
1690   }
1691 
1692   MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg());
1693   if (BaseOpcode->Sampler)
1694     MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg());
1695 
1696   MIB.addImm(DMask); // dmask
1697 
1698   if (IsGFX10Plus)
1699     MIB.addImm(DimInfo->Encoding);
1700   MIB.addImm(Unorm);
1701   if (IsGFX10Plus)
1702     MIB.addImm(DLC);
1703 
1704   MIB.addImm(GLC);
1705   MIB.addImm(SLC);
1706   MIB.addImm(IsA16 &&  // a16 or r128
1707              STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
1708   if (IsGFX10Plus)
1709     MIB.addImm(IsA16 ? -1 : 0);
1710 
1711   MIB.addImm(TFE); // tfe
1712   MIB.addImm(LWE); // lwe
1713   if (!IsGFX10Plus)
1714     MIB.addImm(DimInfo->DA ? -1 : 0);
1715   if (BaseOpcode->HasD16)
1716     MIB.addImm(IsD16 ? -1 : 0);
1717 
1718   MI.eraseFromParent();
1719   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1720 }
1721 
1722 bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
1723     MachineInstr &I) const {
1724   unsigned IntrinsicID = I.getIntrinsicID();
1725   switch (IntrinsicID) {
1726   case Intrinsic::amdgcn_end_cf:
1727     return selectEndCfIntrinsic(I);
1728   case Intrinsic::amdgcn_ds_ordered_add:
1729   case Intrinsic::amdgcn_ds_ordered_swap:
1730     return selectDSOrderedIntrinsic(I, IntrinsicID);
1731   case Intrinsic::amdgcn_ds_gws_init:
1732   case Intrinsic::amdgcn_ds_gws_barrier:
1733   case Intrinsic::amdgcn_ds_gws_sema_v:
1734   case Intrinsic::amdgcn_ds_gws_sema_br:
1735   case Intrinsic::amdgcn_ds_gws_sema_p:
1736   case Intrinsic::amdgcn_ds_gws_sema_release_all:
1737     return selectDSGWSIntrinsic(I, IntrinsicID);
1738   case Intrinsic::amdgcn_ds_append:
1739     return selectDSAppendConsume(I, true);
1740   case Intrinsic::amdgcn_ds_consume:
1741     return selectDSAppendConsume(I, false);
1742   case Intrinsic::amdgcn_s_barrier:
1743     return selectSBarrier(I);
1744   case Intrinsic::amdgcn_global_atomic_fadd:
1745     return selectGlobalAtomicFaddIntrinsic(I);
1746   default: {
1747     return selectImpl(I, *CoverageInfo);
1748   }
1749   }
1750 }
1751 
1752 bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
1753   if (selectImpl(I, *CoverageInfo))
1754     return true;
1755 
1756   MachineBasicBlock *BB = I.getParent();
1757   const DebugLoc &DL = I.getDebugLoc();
1758 
1759   Register DstReg = I.getOperand(0).getReg();
1760   unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
1761   assert(Size <= 32 || Size == 64);
1762   const MachineOperand &CCOp = I.getOperand(1);
1763   Register CCReg = CCOp.getReg();
1764   if (!isVCC(CCReg, *MRI)) {
1765     unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
1766                                          AMDGPU::S_CSELECT_B32;
1767     MachineInstr *CopySCC =
1768         BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC).addReg(CCReg);
1769 
1770     // The generic constrainSelectedInstRegOperands doesn't work for the scc
1771     // register bank, because it does not cover the register class we use to
1772     // represent it. So we need to set the register class manually here.
1773     if (!MRI->getRegClassOrNull(CCReg))
1774         MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
1775     MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
1776             .add(I.getOperand(2))
1777             .add(I.getOperand(3));
1778 
1779     bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI) |
1780                constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
1781     I.eraseFromParent();
1782     return Ret;
1783   }
1784 
1785   // Wide VGPR select should have been split in RegBankSelect.
1786   if (Size > 32)
1787     return false;
1788 
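       // The condition is in VCC: select per lane with V_CNDMASK_B32.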
1789   MachineInstr *Select =
1790       BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1791               .addImm(0)
1792               .add(I.getOperand(3))
1793               .addImm(0)
1794               .add(I.getOperand(2))
1795               .add(I.getOperand(1));
1796 
1797   bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
1798   I.eraseFromParent();
1799   return Ret;
1800 }
1801 
1802 static int sizeToSubRegIndex(unsigned Size) {
1803   switch (Size) {
1804   case 32:
1805     return AMDGPU::sub0;
1806   case 64:
1807     return AMDGPU::sub0_sub1;
1808   case 96:
1809     return AMDGPU::sub0_sub1_sub2;
1810   case 128:
1811     return AMDGPU::sub0_sub1_sub2_sub3;
1812   case 256:
1813     return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
1814   default:
1815     if (Size < 32)
1816       return AMDGPU::sub0;
1817     if (Size > 256)
1818       return -1;
1819     return sizeToSubRegIndex(PowerOf2Ceil(Size));
1820   }
1821 }
1822 
1823 bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
1824   Register DstReg = I.getOperand(0).getReg();
1825   Register SrcReg = I.getOperand(1).getReg();
1826   const LLT DstTy = MRI->getType(DstReg);
1827   const LLT SrcTy = MRI->getType(SrcReg);
1828   const LLT S1 = LLT::scalar(1);
1829 
1830   const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
1831   const RegisterBank *DstRB;
1832   if (DstTy == S1) {
1833     // This is a special case. We don't treat s1 for legalization artifacts as
1834     // vcc booleans.
1835     DstRB = SrcRB;
1836   } else {
1837     DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1838     if (SrcRB != DstRB)
1839       return false;
1840   }
1841 
1842   const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
1843 
1844   unsigned DstSize = DstTy.getSizeInBits();
1845   unsigned SrcSize = SrcTy.getSizeInBits();
1846 
1847   const TargetRegisterClass *SrcRC
1848     = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, *MRI);
1849   const TargetRegisterClass *DstRC
1850     = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, *MRI);
1851   if (!SrcRC || !DstRC)
1852     return false;
1853 
1854   if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
1855       !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
1856     LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
1857     return false;
1858   }
1859 
1860   if (DstTy == LLT::vector(2, 16) && SrcTy == LLT::vector(2, 32)) {
1861     MachineBasicBlock *MBB = I.getParent();
1862     const DebugLoc &DL = I.getDebugLoc();
1863 
1864     Register LoReg = MRI->createVirtualRegister(DstRC);
1865     Register HiReg = MRI->createVirtualRegister(DstRC);
1866     BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
1867       .addReg(SrcReg, 0, AMDGPU::sub0);
1868     BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
1869       .addReg(SrcReg, 0, AMDGPU::sub1);
1870 
1871     if (IsVALU && STI.hasSDWA()) {
1872       // Write the low 16-bits of the high element into the high 16-bits of the
1873       // low element.
1874       MachineInstr *MovSDWA =
1875         BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
1876         .addImm(0)                             // $src0_modifiers
1877         .addReg(HiReg)                         // $src0
1878         .addImm(0)                             // $clamp
1879         .addImm(AMDGPU::SDWA::WORD_1)          // $dst_sel
1880         .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
1881         .addImm(AMDGPU::SDWA::WORD_0)          // $src0_sel
1882         .addReg(LoReg, RegState::Implicit);
1883       MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
1884     } else {
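           // No SDWA: shift the high element into the top half, mask the low
           // element to 16 bits, and OR the two halves together.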
1885       Register TmpReg0 = MRI->createVirtualRegister(DstRC);
1886       Register TmpReg1 = MRI->createVirtualRegister(DstRC);
1887       Register ImmReg = MRI->createVirtualRegister(DstRC);
1888       if (IsVALU) {
1889         BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
1890           .addImm(16)
1891           .addReg(HiReg);
1892       } else {
1893         BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
1894           .addReg(HiReg)
1895           .addImm(16);
1896       }
1897 
1898       unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
1899       unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
1900       unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
1901 
1902       BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
1903         .addImm(0xffff);
1904       BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
1905         .addReg(LoReg)
1906         .addReg(ImmReg);
1907       BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
1908         .addReg(TmpReg0)
1909         .addReg(TmpReg1);
1910     }
1911 
1912     I.eraseFromParent();
1913     return true;
1914   }
1915 
1916   if (!DstTy.isScalar())
1917     return false;
1918 
1919   if (SrcSize > 32) {
1920     int SubRegIdx = sizeToSubRegIndex(DstSize);
1921     if (SubRegIdx == -1)
1922       return false;
1923 
1924     // Deal with weird cases where the class only partially supports the subreg
1925     // index.
1926     const TargetRegisterClass *SrcWithSubRC
1927       = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
1928     if (!SrcWithSubRC)
1929       return false;
1930 
1931     if (SrcWithSubRC != SrcRC) {
1932       if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
1933         return false;
1934     }
1935 
1936     I.getOperand(1).setSubReg(SubRegIdx);
1937   }
1938 
1939   I.setDesc(TII.get(TargetOpcode::COPY));
1940   return true;
1941 }
1942 
1943 /// \returns true if a bitmask for \p Size bits will be an inline immediate.
1944 static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
1945   Mask = maskTrailingOnes<unsigned>(Size);
1946   int SignedMask = static_cast<int>(Mask);
1947   return SignedMask >= -16 && SignedMask <= 64;
1948 }
1949 
1950 // Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
1951 const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
1952   Register Reg, const MachineRegisterInfo &MRI,
1953   const TargetRegisterInfo &TRI) const {
1954   const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
1955   if (auto *RB = RegClassOrBank.dyn_cast<const RegisterBank *>())
1956     return RB;
1957 
1958   // Ignore the type, since we don't use vcc in artifacts.
1959   if (auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>())
1960     return &RBI.getRegBankFromRegClass(*RC, LLT());
1961   return nullptr;
1962 }
1963 
1964 bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
1965   bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
1966   bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
1967   const DebugLoc &DL = I.getDebugLoc();
1968   MachineBasicBlock &MBB = *I.getParent();
1969   const Register DstReg = I.getOperand(0).getReg();
1970   const Register SrcReg = I.getOperand(1).getReg();
1971 
1972   const LLT DstTy = MRI->getType(DstReg);
1973   const LLT SrcTy = MRI->getType(SrcReg);
1974   const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
1975     I.getOperand(2).getImm() : SrcTy.getSizeInBits();
1976   const unsigned DstSize = DstTy.getSizeInBits();
1977   if (!DstTy.isScalar())
1978     return false;
1979 
1980   // Artifact casts should never use vcc.
1981   const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
1982 
1983   // FIXME: This should probably be illegal and split earlier.
1984   if (I.getOpcode() == AMDGPU::G_ANYEXT) {
1985     if (DstSize <= 32)
1986       return selectCOPY(I);
1987 
1988     const TargetRegisterClass *SrcRC =
1989         TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank, *MRI);
1990     const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1991     const TargetRegisterClass *DstRC =
1992         TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
1993 
1994     Register UndefReg = MRI->createVirtualRegister(SrcRC);
1995     BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
1996     BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
1997       .addReg(SrcReg)
1998       .addImm(AMDGPU::sub0)
1999       .addReg(UndefReg)
2000       .addImm(AMDGPU::sub1);
2001     I.eraseFromParent();
2002 
2003     return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
2004            RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
2005   }
2006 
2007   if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
2008     // 64-bit should have been split up in RegBankSelect
2009 
2010     // Try to use an and with a mask if it will save code size.
2011     unsigned Mask;
2012     if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2013       MachineInstr *ExtI =
2014       BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
2015         .addImm(Mask)
2016         .addReg(SrcReg);
2017       I.eraseFromParent();
2018       return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2019     }
2020 
2021     const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2022     MachineInstr *ExtI =
2023       BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
2024       .addReg(SrcReg)
2025       .addImm(0) // Offset
2026       .addImm(SrcSize); // Width
2027     I.eraseFromParent();
2028     return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2029   }
2030 
2031   if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
2032     const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
2033       AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
2034     if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
2035       return false;
2036 
2037     if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
2038       const unsigned SextOpc = SrcSize == 8 ?
2039         AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
2040       BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
2041         .addReg(SrcReg);
2042       I.eraseFromParent();
2043       return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2044     }
2045 
2046     const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
2047     const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2048 
2049     // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16] = width.
2050     if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
2051       // We need a 64-bit register source, but the high bits don't matter.
2052       Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
2053       Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2054       unsigned SubReg = InReg ? AMDGPU::sub0 : 0;
2055 
2056       BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2057       BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
2058         .addReg(SrcReg, 0, SubReg)
2059         .addImm(AMDGPU::sub0)
2060         .addReg(UndefReg)
2061         .addImm(AMDGPU::sub1);
2062 
2063       BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
2064         .addReg(ExtReg)
2065         .addImm(SrcSize << 16);
2066 
2067       I.eraseFromParent();
2068       return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
2069     }
2070 
2071     unsigned Mask;
2072     if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2073       BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
2074         .addReg(SrcReg)
2075         .addImm(Mask);
2076     } else {
2077       BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
2078         .addReg(SrcReg)
2079         .addImm(SrcSize << 16);
2080     }
2081 
2082     I.eraseFromParent();
2083     return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2084   }
2085 
2086   return false;
2087 }
2088 
2089 bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
2090   MachineBasicBlock *BB = I.getParent();
2091   MachineOperand &ImmOp = I.getOperand(1);
2092   Register DstReg = I.getOperand(0).getReg();
2093   unsigned Size = MRI->getType(DstReg).getSizeInBits();
2094 
2095   // The AMDGPU backend only supports Imm operands and not CImm or FPImm.
2096   if (ImmOp.isFPImm()) {
2097     const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt();
2098     ImmOp.ChangeToImmediate(Imm.getZExtValue());
2099   } else if (ImmOp.isCImm()) {
2100     ImmOp.ChangeToImmediate(ImmOp.getCImm()->getSExtValue());
2101   } else {
2102     llvm_unreachable("Not supported by g_constants");
2103   }
2104 
2105   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2106   const bool IsSgpr = DstRB->getID() == AMDGPU::SGPRRegBankID;
2107 
2108   unsigned Opcode;
2109   if (DstRB->getID() == AMDGPU::VCCRegBankID) {
2110     Opcode = STI.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
2111   } else {
2112     Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
2113 
2114     // We should never produce s1 values on banks other than VCC. If the user of
2115     // this already constrained the register, we may incorrectly think it's VCC
2116     // if it wasn't originally.
2117     if (Size == 1)
2118       return false;
2119   }
2120 
2121   if (Size != 64) {
2122     I.setDesc(TII.get(Opcode));
2123     I.addImplicitDefUseOperands(*MF);
2124     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2125   }
2126 
2127   const DebugLoc &DL = I.getDebugLoc();
2128 
2129   APInt Imm(Size, I.getOperand(1).getImm());
2130 
2131   MachineInstr *ResInst;
2132   if (IsSgpr && TII.isInlineConstant(Imm)) {
2133     ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
2134       .addImm(I.getOperand(1).getImm());
2135   } else {
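         // Materialize the 64-bit constant as two 32-bit halves and recombine
         // them with a REG_SEQUENCE.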
2136     const TargetRegisterClass *RC = IsSgpr ?
2137       &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass;
2138     Register LoReg = MRI->createVirtualRegister(RC);
2139     Register HiReg = MRI->createVirtualRegister(RC);
2140 
2141     BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg)
2142       .addImm(Imm.trunc(32).getZExtValue());
2143 
2144     BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg)
2145       .addImm(Imm.ashr(32).getZExtValue());
2146 
2147     ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2148       .addReg(LoReg)
2149       .addImm(AMDGPU::sub0)
2150       .addReg(HiReg)
2151       .addImm(AMDGPU::sub1);
2152   }
2153 
2154   // We can't call constrainSelectedInstRegOperands here, because it doesn't
2155   // work for target-independent opcodes.
2156   I.eraseFromParent();
2157   const TargetRegisterClass *DstRC =
2158     TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI);
2159   if (!DstRC)
2160     return true;
2161   return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI);
2162 }
2163 
2164 bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2165   // Only manually handle the f64 SGPR case.
2166   //
2167   // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2168   // the bit ops theoretically have a second result due to the implicit def of
2169   // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2170   // that is easy by disabling the check. The result works, but uses a
2171   // nonsensical sreg32orlds_and_sreg_1 regclass.
2172   //
2173   // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32
2174   // results to the variadic REG_SEQUENCE operands.
2175 
2176   Register Dst = MI.getOperand(0).getReg();
2177   const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2178   if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2179       MRI->getType(Dst) != LLT::scalar(64))
2180     return false;
2181 
2182   Register Src = MI.getOperand(1).getReg();
2183   MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
2184   if (Fabs)
2185     Src = Fabs->getOperand(1).getReg();
2186 
2187   if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2188       !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2189     return false;
2190 
2191   MachineBasicBlock *BB = MI.getParent();
2192   const DebugLoc &DL = MI.getDebugLoc();
2193   Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2194   Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2195   Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2196   Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2197 
2198   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2199     .addReg(Src, 0, AMDGPU::sub0);
2200   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2201     .addReg(Src, 0, AMDGPU::sub1);
2202   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2203     .addImm(0x80000000);
2204 
2205   // Set or toggle sign bit.
2206   unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2207   BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
2208     .addReg(HiReg)
2209     .addReg(ConstReg);
2210   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2211     .addReg(LoReg)
2212     .addImm(AMDGPU::sub0)
2213     .addReg(OpReg)
2214     .addImm(AMDGPU::sub1);
2215   MI.eraseFromParent();
2216   return true;
2217 }
2218 
2219 // FIXME: This is a workaround for the same tablegen problems as G_FNEG
2220 bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2221   Register Dst = MI.getOperand(0).getReg();
2222   const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2223   if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2224       MRI->getType(Dst) != LLT::scalar(64))
2225     return false;
2226 
2227   Register Src = MI.getOperand(1).getReg();
2228   MachineBasicBlock *BB = MI.getParent();
2229   const DebugLoc &DL = MI.getDebugLoc();
2230   Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2231   Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2232   Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2233   Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2234 
2235   if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2236       !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2237     return false;
2238 
2239   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2240     .addReg(Src, 0, AMDGPU::sub0);
2241   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2242     .addReg(Src, 0, AMDGPU::sub1);
2243   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2244     .addImm(0x7fffffff);
2245 
2246   // Clear sign bit.
2247   // TODO: Should this use S_BITSET0_*?
2248   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
2249     .addReg(HiReg)
2250     .addReg(ConstReg);
2251   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2252     .addReg(LoReg)
2253     .addImm(AMDGPU::sub0)
2254     .addReg(OpReg)
2255     .addImm(AMDGPU::sub1);
2256 
2257   MI.eraseFromParent();
2258   return true;
2259 }
2260 
2261 static bool isConstant(const MachineInstr &MI) {
2262   return MI.getOpcode() == TargetOpcode::G_CONSTANT;
2263 }
2264 
2265 void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
2266     const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
2267 
2268   const MachineInstr *PtrMI = MRI.getUniqueVRegDef(Load.getOperand(1).getReg());
2269 
2270   assert(PtrMI);
2271 
2272   if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
2273     return;
2274 
2275   GEPInfo GEPInfo(*PtrMI);
2276 
2277   for (unsigned i = 1; i != 3; ++i) {
2278     const MachineOperand &GEPOp = PtrMI->getOperand(i);
2279     const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
2280     assert(OpDef);
2281     if (i == 2 && isConstant(*OpDef)) {
2282       // TODO: Could handle constant base + variable offset, but a combine
2283       // probably should have commuted it.
2284       assert(GEPInfo.Imm == 0);
2285       GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
2286       continue;
2287     }
2288     const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
2289     if (OpBank->getID() == AMDGPU::SGPRRegBankID)
2290       GEPInfo.SgprParts.push_back(GEPOp.getReg());
2291     else
2292       GEPInfo.VgprParts.push_back(GEPOp.getReg());
2293   }
2294 
2295   AddrInfo.push_back(GEPInfo);
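       // Recurse through chained pointer adds to gather the rest of the address.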
2296   getAddrModeInfo(*PtrMI, MRI, AddrInfo);
2297 }
2298 
2299 bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
2300   return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
2301 }
2302 
2303 bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
2304   if (!MI.hasOneMemOperand())
2305     return false;
2306 
2307   const MachineMemOperand *MMO = *MI.memoperands_begin();
2308   const Value *Ptr = MMO->getValue();
2309 
2310   // UndefValue means this is a load of a kernel input.  These are uniform.
2311   // Sometimes LDS instructions have constant pointers.
2312   // If Ptr is null, then that means this mem operand contains a
2313   // PseudoSourceValue like GOT.
2314   if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
2315       isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
2316     return true;
2317 
2318   if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
2319     return true;
2320 
2321   const Instruction *I = dyn_cast<Instruction>(Ptr);
2322   return I && I->getMetadata("amdgpu.uniform");
2323 }
2324 
2325 bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
2326   for (const GEPInfo &GEPInfo : AddrInfo) {
2327     if (!GEPInfo.VgprParts.empty())
2328       return true;
2329   }
2330   return false;
2331 }
2332 
2333 void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
2334   const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
2335   unsigned AS = PtrTy.getAddressSpace();
2336   if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
2337       STI.ldsRequiresM0Init()) {
2338     MachineBasicBlock *BB = I.getParent();
2339 
2340     // If DS instructions require M0 initialization, insert it before selecting.
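         // Writing -1 (all ones) selects the maximum LDS size bound.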
2341     BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2342       .addImm(-1);
2343   }
2344 }
2345 
2346 bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
2347   MachineInstr &I) const {
2348   initM0(I);
2349   return selectImpl(I, *CoverageInfo);
2350 }
2351 
2352 // TODO: No rtn optimization.
2353 bool AMDGPUInstructionSelector::selectG_AMDGPU_ATOMIC_CMPXCHG(
2354   MachineInstr &MI) const {
2355   Register PtrReg = MI.getOperand(1).getReg();
2356   const LLT PtrTy = MRI->getType(PtrReg);
2357   if (PtrTy.getAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
2358       STI.useFlatForGlobal())
2359     return selectImpl(MI, *CoverageInfo);
2360 
2361   Register DstReg = MI.getOperand(0).getReg();
2362   const LLT Ty = MRI->getType(DstReg);
2363   const bool Is64 = Ty.getSizeInBits() == 64;
2364   const unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
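       // The buffer cmpswap destination holds the data/compare pair; the old
       // value is returned in the low half, so that subregister is copied out
       // below.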
2365   Register TmpReg = MRI->createVirtualRegister(
2366     Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
2367 
2368   const DebugLoc &DL = MI.getDebugLoc();
2369   MachineBasicBlock *BB = MI.getParent();
2370 
2371   Register VAddr, RSrcReg, SOffset;
2372   int64_t Offset = 0;
2373 
2374   unsigned Opcode;
2375   if (selectMUBUFOffsetImpl(MI.getOperand(1), RSrcReg, SOffset, Offset)) {
2376     Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN :
2377                              AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN;
2378   } else if (selectMUBUFAddr64Impl(MI.getOperand(1), VAddr,
2379                                    RSrcReg, SOffset, Offset)) {
2380     Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN :
2381                     AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN;
2382   } else
2383     return selectImpl(MI, *CoverageInfo);
2384 
2385   auto MIB = BuildMI(*BB, &MI, DL, TII.get(Opcode), TmpReg)
2386     .addReg(MI.getOperand(2).getReg());
2387 
2388   if (VAddr)
2389     MIB.addReg(VAddr);
2390 
2391   MIB.addReg(RSrcReg);
2392   if (SOffset)
2393     MIB.addReg(SOffset);
2394   else
2395     MIB.addImm(0);
2396 
2397   MIB.addImm(Offset);
2398   MIB.addImm(1); // glc
2399   MIB.addImm(0); // slc
2400   MIB.cloneMemRefs(MI);
2401 
2402   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), DstReg)
2403     .addReg(TmpReg, RegState::Kill, SubReg);
2404 
2405   MI.eraseFromParent();
2406 
2407   MRI->setRegClass(
2408     DstReg, Is64 ? &AMDGPU::VReg_64RegClass : &AMDGPU::VGPR_32RegClass);
2409   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2410 }
2411 
2412 bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
2413   MachineBasicBlock *BB = I.getParent();
2414   MachineOperand &CondOp = I.getOperand(0);
2415   Register CondReg = CondOp.getReg();
2416   const DebugLoc &DL = I.getDebugLoc();
2417 
2418   unsigned BrOpcode;
2419   Register CondPhysReg;
2420   const TargetRegisterClass *ConstrainRC;
2421 
2422   // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
2423   // whether the branch is uniform when selecting the instruction. In
2424   // GlobalISel, we should push that decision into RegBankSelect. Assume for now
2425   // RegBankSelect knows what it's doing if the branch condition is scc, even
2426   // though it currently does not.
2427   if (!isVCC(CondReg, *MRI)) {
2428     if (MRI->getType(CondReg) != LLT::scalar(32))
2429       return false;
2430 
2431     CondPhysReg = AMDGPU::SCC;
2432     BrOpcode = AMDGPU::S_CBRANCH_SCC1;
2433     ConstrainRC = &AMDGPU::SReg_32RegClass;
2434   } else {
2435     // FIXME: Do we have to insert an and with exec here, like in SelectionDAG?
2436     // Based on the register bank, we sort of know that a VCC producer ands
2437     // inactive lanes with 0. What if there was a logical operation with vcc
2438     // producers in different blocks/with different exec masks?
2439     // FIXME: Should scc->vcc copies and with exec?
2440     CondPhysReg = TRI.getVCC();
2441     BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
2442     ConstrainRC = TRI.getBoolRC();
2443   }
2444 
2445   if (!MRI->getRegClassOrNull(CondReg))
2446     MRI->setRegClass(CondReg, ConstrainRC);
2447 
2448   BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
2449     .addReg(CondReg);
2450   BuildMI(*BB, &I, DL, TII.get(BrOpcode))
2451     .addMBB(I.getOperand(1).getMBB());
2452 
2453   I.eraseFromParent();
2454   return true;
2455 }
2456 
2457 bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
2458   MachineInstr &I) const {
2459   Register DstReg = I.getOperand(0).getReg();
2460   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2461   const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2462   I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
2463   if (IsVGPR)
2464     I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
2465 
2466   return RBI.constrainGenericRegister(
2467     DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
2468 }
2469 
2470 bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
2471   Register DstReg = I.getOperand(0).getReg();
2472   Register SrcReg = I.getOperand(1).getReg();
2473   Register MaskReg = I.getOperand(2).getReg();
2474   LLT Ty = MRI->getType(DstReg);
2475   LLT MaskTy = MRI->getType(MaskReg);
2476 
2477   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2478   const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2479   const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
2480   const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2481   if (DstRB != SrcRB) // Should only happen for hand written MIR.
2482     return false;
2483 
2484   unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2485   const TargetRegisterClass &RegRC
2486     = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
2487 
2488   const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB,
2489                                                                   *MRI);
2490   const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB,
2491                                                                   *MRI);
2492   const TargetRegisterClass *MaskRC =
2493       TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB, *MRI);
2494 
2495   if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
2496       !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2497       !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
2498     return false;
2499 
2500   MachineBasicBlock *BB = I.getParent();
2501   const DebugLoc &DL = I.getDebugLoc();
2502   if (Ty.getSizeInBits() == 32) {
2503     assert(MaskTy.getSizeInBits() == 32 &&
2504            "ptrmask should have been narrowed during legalize");
2505 
2506     BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
2507       .addReg(SrcReg)
2508       .addReg(MaskReg);
2509     I.eraseFromParent();
2510     return true;
2511   }
2512 
2513   Register HiReg = MRI->createVirtualRegister(&RegRC);
2514   Register LoReg = MRI->createVirtualRegister(&RegRC);
2515 
2516   // Extract the subregisters from the source pointer.
2517   BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
2518     .addReg(SrcReg, 0, AMDGPU::sub0);
2519   BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
2520     .addReg(SrcReg, 0, AMDGPU::sub1);
2521 
2522   Register MaskedLo, MaskedHi;
2523 
2524   // Try to avoid emitting a bit operation when we only need to touch half of
2525   // the 64-bit pointer.
2526   APInt MaskOnes = KnownBits->getKnownOnes(MaskReg).zextOrSelf(64);
2527 
2528   const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
2529   const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
2530   if ((MaskOnes & MaskLo32) == MaskLo32) {
2531     // If all the bits in the low half are 1, we only need a copy for it.
2532     MaskedLo = LoReg;
2533   } else {
2534     // Extract the mask subregister and apply the and.
2535     Register MaskLo = MRI->createVirtualRegister(&RegRC);
2536     MaskedLo = MRI->createVirtualRegister(&RegRC);
2537 
2538     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
2539       .addReg(MaskReg, 0, AMDGPU::sub0);
2540     BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
2541       .addReg(LoReg)
2542       .addReg(MaskLo);
2543   }
2544 
2545   if ((MaskOnes & MaskHi32) == MaskHi32) {
2546     // If all the bits in the high half are 1, we only need a copy for it.
2547     MaskedHi = HiReg;
2548   } else {
2549     Register MaskHi = MRI->createVirtualRegister(&RegRC);
2550     MaskedHi = MRI->createVirtualRegister(&RegRC);
2551 
2552     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
2553       .addReg(MaskReg, 0, AMDGPU::sub1);
2554     BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
2555       .addReg(HiReg)
2556       .addReg(MaskHi);
2557   }
2558 
2559   BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2560     .addReg(MaskedLo)
2561     .addImm(AMDGPU::sub0)
2562     .addReg(MaskedHi)
2563     .addImm(AMDGPU::sub1);
2564   I.eraseFromParent();
2565   return true;
2566 }
2567 
2568 /// Return the register to use for the index value, and the subregister to use
2569 /// for the indirectly accessed register.
2570 static std::pair<Register, unsigned>
2571 computeIndirectRegIndex(MachineRegisterInfo &MRI,
2572                         const SIRegisterInfo &TRI,
2573                         const TargetRegisterClass *SuperRC,
2574                         Register IdxReg,
2575                         unsigned EltSize) {
2576   Register IdxBaseReg;
2577   int Offset;
2578 
2579   std::tie(IdxBaseReg, Offset) = AMDGPU::getBaseWithConstantOffset(MRI, IdxReg);
2580   if (IdxBaseReg == AMDGPU::NoRegister) {
2581     // This will happen if the index is a known constant. This should ordinarily
2582     // be legalized out, but handle it as a register just in case.
2583     assert(Offset == 0);
2584     IdxBaseReg = IdxReg;
2585   }
2586 
2587   ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
2588 
2589   // Skip out of bounds offsets, or else we would end up using an undefined
2590   // register.
2591   if (static_cast<unsigned>(Offset) >= SubRegs.size())
2592     return std::make_pair(IdxReg, SubRegs[0]);
2593   return std::make_pair(IdxBaseReg, SubRegs[Offset]);
2594 }
2595 
2596 bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
2597   MachineInstr &MI) const {
2598   Register DstReg = MI.getOperand(0).getReg();
2599   Register SrcReg = MI.getOperand(1).getReg();
2600   Register IdxReg = MI.getOperand(2).getReg();
2601 
2602   LLT DstTy = MRI->getType(DstReg);
2603   LLT SrcTy = MRI->getType(SrcReg);
2604 
2605   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2606   const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2607   const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
2608 
2609   // The index must be scalar. If it wasn't, RegBankSelect should have moved
2610   // this into a waterfall loop.
2611   if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
2612     return false;
2613 
2614   const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB,
2615                                                                   *MRI);
2616   const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(DstTy, *DstRB,
2617                                                                   *MRI);
2618   if (!SrcRC || !DstRC)
2619     return false;
2620   if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2621       !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
2622       !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
2623     return false;
2624 
2625   MachineBasicBlock *BB = MI.getParent();
2626   const DebugLoc &DL = MI.getDebugLoc();
2627   const bool Is64 = DstTy.getSizeInBits() == 64;
2628 
2629   unsigned SubReg;
2630   std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, SrcRC, IdxReg,
2631                                                      DstTy.getSizeInBits() / 8);
2632 
2633   if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
2634     if (DstTy.getSizeInBits() != 32 && !Is64)
2635       return false;
2636 
2637     BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2638       .addReg(IdxReg);
2639 
2640     unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
2641     BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
2642       .addReg(SrcReg, 0, SubReg)
2643       .addReg(SrcReg, RegState::Implicit);
2644     MI.eraseFromParent();
2645     return true;
2646   }
2647 
2648   if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
2649     return false;
2650 
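       // Without VGPR index mode the index lives in M0 and V_MOVRELS reads the
       // source register relative to it.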
2651   if (!STI.useVGPRIndexMode()) {
2652     BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2653       .addReg(IdxReg);
2654     BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
2655       .addReg(SrcReg, 0, SubReg)
2656       .addReg(SrcReg, RegState::Implicit);
2657     MI.eraseFromParent();
2658     return true;
2659   }
2660 
2661   const MCInstrDesc &GPRIDXDesc =
2662       TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*SrcRC), true);
2663   BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
2664       .addReg(SrcReg)
2665       .addReg(IdxReg)
2666       .addImm(SubReg);
2667 
2668   MI.eraseFromParent();
2669   return true;
2670 }
2671 
2672 // TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
2673 bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
2674   MachineInstr &MI) const {
2675   Register DstReg = MI.getOperand(0).getReg();
2676   Register VecReg = MI.getOperand(1).getReg();
2677   Register ValReg = MI.getOperand(2).getReg();
2678   Register IdxReg = MI.getOperand(3).getReg();
2679 
2680   LLT VecTy = MRI->getType(DstReg);
2681   LLT ValTy = MRI->getType(ValReg);
2682   unsigned VecSize = VecTy.getSizeInBits();
2683   unsigned ValSize = ValTy.getSizeInBits();
2684 
2685   const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
2686   const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
2687   const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
2688 
2689   assert(VecTy.getElementType() == ValTy);
2690 
2691   // The index must be scalar. If it wasn't, RegBankSelect should have moved
2692   // this into a waterfall loop.
2693   if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
2694     return false;
2695 
2696   const TargetRegisterClass *VecRC = TRI.getRegClassForTypeOnBank(VecTy, *VecRB,
2697                                                                   *MRI);
2698   const TargetRegisterClass *ValRC = TRI.getRegClassForTypeOnBank(ValTy, *ValRB,
2699                                                                   *MRI);
2700 
2701   if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
2702       !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
2703       !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
2704       !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
2705     return false;
2706 
2707   if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
2708     return false;
2709 
2710   unsigned SubReg;
2711   std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg,
2712                                                      ValSize / 8);
2713 
2714   const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
2715                          STI.useVGPRIndexMode();
2716 
2717   MachineBasicBlock *BB = MI.getParent();
2718   const DebugLoc &DL = MI.getDebugLoc();
2719 
2720   if (!IndexMode) {
2721     BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2722       .addReg(IdxReg);
2723 
2724     const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo(
2725         VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID);
2726     BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
2727         .addReg(VecReg)
2728         .addReg(ValReg)
2729         .addImm(SubReg);
2730     MI.eraseFromParent();
2731     return true;
2732   }
2733 
2734   const MCInstrDesc &GPRIDXDesc =
2735       TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
2736   BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
2737       .addReg(VecReg)
2738       .addReg(ValReg)
2739       .addReg(IdxReg)
2740       .addImm(SubReg);
2741 
2742   MI.eraseFromParent();
2743   return true;
2744 }
2745 
2746 static bool isZeroOrUndef(int X) {
2747   return X == 0 || X == -1;
2748 }
2749 
2750 static bool isOneOrUndef(int X) {
2751   return X == 1 || X == -1;
2752 }
2753 
2754 static bool isZeroOrOneOrUndef(int X) {
2755   return X == 0 || X == 1 || X == -1;
2756 }
2757 
2758 // Normalize a VOP3P shuffle mask to refer to the low/high half of a single
2759 // 32-bit register.
2760 static Register normalizeVOP3PMask(int NewMask[2], Register Src0, Register Src1,
2761                                    ArrayRef<int> Mask) {
2762   NewMask[0] = Mask[0];
2763   NewMask[1] = Mask[1];
2764   if (isZeroOrOneOrUndef(Mask[0]) && isZeroOrOneOrUndef(Mask[1]))
2765     return Src0;
2766 
2767   assert(NewMask[0] == 2 || NewMask[0] == 3 || NewMask[0] == -1);
2768   assert(NewMask[1] == 2 || NewMask[1] == 3 || NewMask[1] == -1);
2769 
2770   // Shift the mask inputs to be 0/1.
2771   NewMask[0] = NewMask[0] == -1 ? -1 : NewMask[0] - 2;
2772   NewMask[1] = NewMask[1] == -1 ? -1 : NewMask[1] - 2;
2773   return Src1;
2774 }
2775 
2776 // This is only legal with VOP3P instructions as an aid to op_sel matching.
2777 bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR(
2778   MachineInstr &MI) const {
2779   Register DstReg = MI.getOperand(0).getReg();
2780   Register Src0Reg = MI.getOperand(1).getReg();
2781   Register Src1Reg = MI.getOperand(2).getReg();
2782   ArrayRef<int> ShufMask = MI.getOperand(3).getShuffleMask();
2783 
2784   const LLT V2S16 = LLT::vector(2, 16);
2785   if (MRI->getType(DstReg) != V2S16 || MRI->getType(Src0Reg) != V2S16)
2786     return false;
2787 
2788   if (!AMDGPU::isLegalVOP3PShuffleMask(ShufMask))
2789     return false;
2790 
2791   assert(ShufMask.size() == 2);
2792   assert(STI.hasSDWA() && "no target has VOP3P but not SDWA");
2793 
2794   MachineBasicBlock *MBB = MI.getParent();
2795   const DebugLoc &DL = MI.getDebugLoc();
2796 
2797   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2798   const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2799   const TargetRegisterClass &RC = IsVALU ?
2800     AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
2801 
2802   // Handle the degenerate case which should have folded out.
2803   if (ShufMask[0] == -1 && ShufMask[1] == -1) {
2804     BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), DstReg);
2805 
2806     MI.eraseFromParent();
2807     return RBI.constrainGenericRegister(DstReg, RC, *MRI);
2808   }
2809 
2810   // A legal VOP3P mask only reads one of the sources.
2811   int Mask[2];
2812   Register SrcVec = normalizeVOP3PMask(Mask, Src0Reg, Src1Reg, ShufMask);
2813 
2814   if (!RBI.constrainGenericRegister(DstReg, RC, *MRI) ||
2815       !RBI.constrainGenericRegister(SrcVec, RC, *MRI))
2816     return false;
2817 
2818   // TODO: This also should have been folded out
2819   if (isZeroOrUndef(Mask[0]) && isOneOrUndef(Mask[1])) {
2820     BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), DstReg)
2821       .addReg(SrcVec);
2822 
2823     MI.eraseFromParent();
2824     return true;
2825   }
2826 
2827   if (Mask[0] == 1 && Mask[1] == -1) {
2828     if (IsVALU) {
2829       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
2830         .addImm(16)
2831         .addReg(SrcVec);
2832     } else {
2833       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
2834         .addReg(SrcVec)
2835         .addImm(16);
2836     }
2837   } else if (Mask[0] == -1 && Mask[1] == 0) {
2838     if (IsVALU) {
2839       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), DstReg)
2840         .addImm(16)
2841         .addReg(SrcVec);
2842     } else {
2843       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHL_B32), DstReg)
2844         .addReg(SrcVec)
2845         .addImm(16);
2846     }
2847   } else if (Mask[0] == 0 && Mask[1] == 0) {
2848     if (IsVALU) {
2849       // Write low half of the register into the high half.
2850       MachineInstr *MovSDWA =
2851         BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2852         .addImm(0)                             // $src0_modifiers
2853         .addReg(SrcVec)                        // $src0
2854         .addImm(0)                             // $clamp
2855         .addImm(AMDGPU::SDWA::WORD_1)          // $dst_sel
2856         .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2857         .addImm(AMDGPU::SDWA::WORD_0)          // $src0_sel
2858         .addReg(SrcVec, RegState::Implicit);
2859       MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2860     } else {
2861       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg)
2862         .addReg(SrcVec)
2863         .addReg(SrcVec);
2864     }
2865   } else if (Mask[0] == 1 && Mask[1] == 1) {
2866     if (IsVALU) {
2867       // Write high half of the register into the low half.
2868       MachineInstr *MovSDWA =
2869         BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2870         .addImm(0)                             // $src0_modifiers
2871         .addReg(SrcVec)                        // $src0
2872         .addImm(0)                             // $clamp
2873         .addImm(AMDGPU::SDWA::WORD_0)          // $dst_sel
2874         .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2875         .addImm(AMDGPU::SDWA::WORD_1)          // $src0_sel
2876         .addReg(SrcVec, RegState::Implicit);
2877       MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2878     } else {
2879       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_HH_B32_B16), DstReg)
2880         .addReg(SrcVec)
2881         .addReg(SrcVec);
2882     }
2883   } else if (Mask[0] == 1 && Mask[1] == 0) {
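         // Swap the two halves: a 16-bit rotate via alignbit on the VALU path,
         // or shift the high half down and repack on the SALU path.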
2884     if (IsVALU) {
2885       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_ALIGNBIT_B32_e64), DstReg)
2886         .addReg(SrcVec)
2887         .addReg(SrcVec)
2888         .addImm(16);
2889     } else {
2890       Register TmpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2891       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg)
2892         .addReg(SrcVec)
2893         .addImm(16);
2894       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg)
2895         .addReg(TmpReg)
2896         .addReg(SrcVec);
2897     }
2898   } else
2899     llvm_unreachable("all shuffle masks should be handled");
2900 
2901   MI.eraseFromParent();
2902   return true;
2903 }
2904 
2905 bool AMDGPUInstructionSelector::selectAMDGPU_BUFFER_ATOMIC_FADD(
2906   MachineInstr &MI) const {
2907 
2908   MachineBasicBlock *MBB = MI.getParent();
2909   const DebugLoc &DL = MI.getDebugLoc();
2910 
2911   if (!MRI->use_nodbg_empty(MI.getOperand(0).getReg())) {
2912     Function &F = MBB->getParent()->getFunction();
2913     DiagnosticInfoUnsupported
2914       NoFpRet(F, "return versions of fp atomics not supported",
2915               MI.getDebugLoc(), DS_Error);
2916     F.getContext().diagnose(NoFpRet);
2917     return false;
2918   }
2919 
  // FIXME: This is only needed because tablegen requires the number of dst
  // operands in the match and replace patterns to be the same. Otherwise
  // these patterns could be exported from the SelectionDAG path.
2923   MachineOperand &VDataIn = MI.getOperand(1);
2924   MachineOperand &VIndex = MI.getOperand(3);
2925   MachineOperand &VOffset = MI.getOperand(4);
2926   MachineOperand &SOffset = MI.getOperand(5);
2927   int16_t Offset = MI.getOperand(6).getImm();
2928 
2929   bool HasVOffset = !isOperandImmEqual(VOffset, 0, *MRI);
2930   bool HasVIndex = !isOperandImmEqual(VIndex, 0, *MRI);
2931 
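  // Buffer atomics come in four addressing variants depending on which VGPR
  // address components are present: _OFFSET (neither), _IDXEN (vindex only),
  // _OFFEN (voffset only) and _BOTHEN (both).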
2932   unsigned Opcode;
2933   if (HasVOffset) {
2934     Opcode = HasVIndex ? AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN
2935                        : AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFEN;
2936   } else {
2937     Opcode = HasVIndex ? AMDGPU::BUFFER_ATOMIC_ADD_F32_IDXEN
2938                        : AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFSET;
2939   }
2940 
2941   if (MRI->getType(VDataIn.getReg()).isVector()) {
2942     switch (Opcode) {
2943     case AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN:
2944       Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_BOTHEN;
2945       break;
2946     case AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFEN:
2947       Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_OFFEN;
2948       break;
2949     case AMDGPU::BUFFER_ATOMIC_ADD_F32_IDXEN:
2950       Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_IDXEN;
2951       break;
2952     case AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFSET:
2953       Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_OFFSET;
2954       break;
2955     }
2956   }
2957 
2958   auto I = BuildMI(*MBB, MI, DL, TII.get(Opcode));
2959   I.add(VDataIn);
2960 
2961   if (Opcode == AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN ||
2962       Opcode == AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_BOTHEN) {
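    // The _BOTHEN forms take vindex and voffset packed into a single 64-bit
    // VGPR pair: vindex in sub0, voffset in sub1.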
2963     Register IdxReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass);
2964     BuildMI(*MBB, &*I, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
2965       .addReg(VIndex.getReg())
2966       .addImm(AMDGPU::sub0)
2967       .addReg(VOffset.getReg())
2968       .addImm(AMDGPU::sub1);
2969 
2970     I.addReg(IdxReg);
2971   } else if (HasVIndex) {
2972     I.add(VIndex);
2973   } else if (HasVOffset) {
2974     I.add(VOffset);
2975   }
2976 
2977   I.add(MI.getOperand(2)); // rsrc
2978   I.add(SOffset);
2979   I.addImm(Offset);
2980   renderExtractSLC(I, MI, 7);
2981   I.cloneMemRefs(MI);
2982 
2983   MI.eraseFromParent();
2984 
2985   return true;
2986 }
2987 
2988 bool AMDGPUInstructionSelector::selectGlobalAtomicFaddIntrinsic(
  MachineInstr &MI) const {
2990 
2991   MachineBasicBlock *MBB = MI.getParent();
2992   const DebugLoc &DL = MI.getDebugLoc();
2993 
2994   if (!MRI->use_nodbg_empty(MI.getOperand(0).getReg())) {
2995     Function &F = MBB->getParent()->getFunction();
2996     DiagnosticInfoUnsupported
2997       NoFpRet(F, "return versions of fp atomics not supported",
2998               MI.getDebugLoc(), DS_Error);
2999     F.getContext().diagnose(NoFpRet);
3000     return false;
3001   }
3002 
  // FIXME: This is only needed because tablegen requires the number of dst
  // operands in the match and replace patterns to be the same. Otherwise
  // these patterns could be exported from the SelectionDAG path.
3006   auto Addr = selectFlatOffsetImpl<true>(MI.getOperand(2));
3007 
3008   Register Data = MI.getOperand(3).getReg();
3009   const unsigned Opc = MRI->getType(Data).isVector() ?
3010     AMDGPU::GLOBAL_ATOMIC_PK_ADD_F16 : AMDGPU::GLOBAL_ATOMIC_ADD_F32;
3011   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
3012     .addReg(Addr.first)
3013     .addReg(Data)
3014     .addImm(Addr.second)
3015     .addImm(0) // SLC
3016     .cloneMemRefs(MI);
3017 
3018   MI.eraseFromParent();
3019   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3020 }
3021 
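// The BVH intersect-ray pseudo carries the concrete target opcode as an
// immediate in operand 1 (set up before selection); selection just installs
// that opcode, drops the immediate, and re-adds the implicit operands.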
bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const {
3023   MI.setDesc(TII.get(MI.getOperand(1).getImm()));
3024   MI.RemoveOperand(1);
3025   MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3026   return true;
3027 }
3028 
3029 bool AMDGPUInstructionSelector::select(MachineInstr &I) {
3030   if (I.isPHI())
3031     return selectPHI(I);
3032 
3033   if (!I.isPreISelOpcode()) {
3034     if (I.isCopy())
3035       return selectCOPY(I);
3036     return true;
3037   }
3038 
3039   switch (I.getOpcode()) {
3040   case TargetOpcode::G_AND:
3041   case TargetOpcode::G_OR:
3042   case TargetOpcode::G_XOR:
3043     if (selectImpl(I, *CoverageInfo))
3044       return true;
3045     return selectG_AND_OR_XOR(I);
3046   case TargetOpcode::G_ADD:
3047   case TargetOpcode::G_SUB:
3048     if (selectImpl(I, *CoverageInfo))
3049       return true;
3050     return selectG_ADD_SUB(I);
3051   case TargetOpcode::G_UADDO:
3052   case TargetOpcode::G_USUBO:
3053   case TargetOpcode::G_UADDE:
3054   case TargetOpcode::G_USUBE:
3055     return selectG_UADDO_USUBO_UADDE_USUBE(I);
3056   case TargetOpcode::G_INTTOPTR:
3057   case TargetOpcode::G_BITCAST:
3058   case TargetOpcode::G_PTRTOINT:
3059     return selectCOPY(I);
3060   case TargetOpcode::G_CONSTANT:
3061   case TargetOpcode::G_FCONSTANT:
3062     return selectG_CONSTANT(I);
3063   case TargetOpcode::G_FNEG:
3064     if (selectImpl(I, *CoverageInfo))
3065       return true;
3066     return selectG_FNEG(I);
3067   case TargetOpcode::G_FABS:
3068     if (selectImpl(I, *CoverageInfo))
3069       return true;
3070     return selectG_FABS(I);
3071   case TargetOpcode::G_EXTRACT:
3072     return selectG_EXTRACT(I);
3073   case TargetOpcode::G_MERGE_VALUES:
3074   case TargetOpcode::G_BUILD_VECTOR:
3075   case TargetOpcode::G_CONCAT_VECTORS:
3076     return selectG_MERGE_VALUES(I);
3077   case TargetOpcode::G_UNMERGE_VALUES:
3078     return selectG_UNMERGE_VALUES(I);
3079   case TargetOpcode::G_BUILD_VECTOR_TRUNC:
3080     return selectG_BUILD_VECTOR_TRUNC(I);
3081   case TargetOpcode::G_PTR_ADD:
3082     return selectG_PTR_ADD(I);
3083   case TargetOpcode::G_IMPLICIT_DEF:
3084     return selectG_IMPLICIT_DEF(I);
3085   case TargetOpcode::G_FREEZE:
3086     return selectCOPY(I);
3087   case TargetOpcode::G_INSERT:
3088     return selectG_INSERT(I);
3089   case TargetOpcode::G_INTRINSIC:
3090     return selectG_INTRINSIC(I);
3091   case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
3092     return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
3093   case TargetOpcode::G_ICMP:
3094     if (selectG_ICMP(I))
3095       return true;
3096     return selectImpl(I, *CoverageInfo);
3097   case TargetOpcode::G_LOAD:
3098   case TargetOpcode::G_STORE:
3099   case TargetOpcode::G_ATOMIC_CMPXCHG:
3100   case TargetOpcode::G_ATOMICRMW_XCHG:
3101   case TargetOpcode::G_ATOMICRMW_ADD:
3102   case TargetOpcode::G_ATOMICRMW_SUB:
3103   case TargetOpcode::G_ATOMICRMW_AND:
3104   case TargetOpcode::G_ATOMICRMW_OR:
3105   case TargetOpcode::G_ATOMICRMW_XOR:
3106   case TargetOpcode::G_ATOMICRMW_MIN:
3107   case TargetOpcode::G_ATOMICRMW_MAX:
3108   case TargetOpcode::G_ATOMICRMW_UMIN:
3109   case TargetOpcode::G_ATOMICRMW_UMAX:
3110   case TargetOpcode::G_ATOMICRMW_FADD:
3111   case AMDGPU::G_AMDGPU_ATOMIC_INC:
3112   case AMDGPU::G_AMDGPU_ATOMIC_DEC:
3113   case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
3114   case AMDGPU::G_AMDGPU_ATOMIC_FMAX:
3115     return selectG_LOAD_STORE_ATOMICRMW(I);
3116   case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
3117     return selectG_AMDGPU_ATOMIC_CMPXCHG(I);
3118   case TargetOpcode::G_SELECT:
3119     return selectG_SELECT(I);
3120   case TargetOpcode::G_TRUNC:
3121     return selectG_TRUNC(I);
3122   case TargetOpcode::G_SEXT:
3123   case TargetOpcode::G_ZEXT:
3124   case TargetOpcode::G_ANYEXT:
3125   case TargetOpcode::G_SEXT_INREG:
3126     if (selectImpl(I, *CoverageInfo))
3127       return true;
3128     return selectG_SZA_EXT(I);
3129   case TargetOpcode::G_BRCOND:
3130     return selectG_BRCOND(I);
3131   case TargetOpcode::G_GLOBAL_VALUE:
3132     return selectG_GLOBAL_VALUE(I);
3133   case TargetOpcode::G_PTRMASK:
3134     return selectG_PTRMASK(I);
3135   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3136     return selectG_EXTRACT_VECTOR_ELT(I);
3137   case TargetOpcode::G_INSERT_VECTOR_ELT:
3138     return selectG_INSERT_VECTOR_ELT(I);
3139   case TargetOpcode::G_SHUFFLE_VECTOR:
3140     return selectG_SHUFFLE_VECTOR(I);
3141   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3142   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
3143     const AMDGPU::ImageDimIntrinsicInfo *Intr
3144       = AMDGPU::getImageDimIntrinsicInfo(I.getIntrinsicID());
3145     assert(Intr && "not an image intrinsic with image pseudo");
3146     return selectImageIntrinsic(I, Intr);
3147   }
3148   case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY:
3149     return selectBVHIntrinsic(I);
3150   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
3151     return selectAMDGPU_BUFFER_ATOMIC_FADD(I);
3152   default:
3153     return selectImpl(I, *CoverageInfo);
3154   }
3155   return false;
3156 }
3157 
3158 InstructionSelector::ComplexRendererFns
3159 AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
3160   return {{
3161       [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
3162   }};
}
3165 
3166 std::pair<Register, unsigned>
3167 AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root,
3168                                               bool AllowAbs) const {
3169   Register Src = Root.getReg();
3170   Register OrigSrc = Src;
3171   unsigned Mods = 0;
3172   MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
3173 
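  // Peel source modifiers off the def chain (looking through copies): an
  // outer G_FNEG sets NEG, and a G_FABS underneath (or on its own, when
  // allowed) sets ABS, so fneg(fabs(x)) folds to both modifier bits on x.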
3174   if (MI && MI->getOpcode() == AMDGPU::G_FNEG) {
3175     Src = MI->getOperand(1).getReg();
3176     Mods |= SISrcMods::NEG;
3177     MI = getDefIgnoringCopies(Src, *MRI);
3178   }
3179 
3180   if (AllowAbs && MI && MI->getOpcode() == AMDGPU::G_FABS) {
3181     Src = MI->getOperand(1).getReg();
3182     Mods |= SISrcMods::ABS;
3183   }
3184 
3185   if (Mods != 0 &&
3186       RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
3187     MachineInstr *UseMI = Root.getParent();
3188 
3189     // If we looked through copies to find source modifiers on an SGPR operand,
3190     // we now have an SGPR register source. To avoid potentially violating the
3191     // constant bus restriction, we need to insert a copy to a VGPR.
3192     Register VGPRSrc = MRI->cloneVirtualRegister(OrigSrc);
3193     BuildMI(*UseMI->getParent(), UseMI, UseMI->getDebugLoc(),
3194             TII.get(AMDGPU::COPY), VGPRSrc)
3195       .addReg(Src);
3196     Src = VGPRSrc;
3197   }
3198 
3199   return std::make_pair(Src, Mods);
3200 }
3201 
/// This will select either an SGPR or VGPR operand and will save us from
/// having to write an extra tablegen pattern.
3205 InstructionSelector::ComplexRendererFns
3206 AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
3207   return {{
3208       [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
3209   }};
3210 }
3211 
3212 InstructionSelector::ComplexRendererFns
3213 AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
3214   Register Src;
3215   unsigned Mods;
3216   std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3217 
3218   return {{
3219       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3220       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
3221       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },    // clamp
3222       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }     // omod
3223   }};
3224 }
3225 
3226 InstructionSelector::ComplexRendererFns
3227 AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
3228   Register Src;
3229   unsigned Mods;
3230   std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /* AllowAbs */ false);
3231 
3232   return {{
3233       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3234       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
3235       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },    // clamp
3236       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }     // omod
3237   }};
3238 }
3239 
3240 InstructionSelector::ComplexRendererFns
3241 AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
3242   return {{
3243       [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
3244       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3245       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // omod
3246   }};
3247 }
3248 
3249 InstructionSelector::ComplexRendererFns
3250 AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
3251   Register Src;
3252   unsigned Mods;
3253   std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3254 
3255   return {{
3256       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3257       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }  // src_mods
3258   }};
3259 }
3260 
3261 InstructionSelector::ComplexRendererFns
3262 AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
3263   Register Src;
3264   unsigned Mods;
3265   std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /* AllowAbs */ false);
3266 
3267   return {{
3268       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3269       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3270   }};
3271 }
3272 
3273 InstructionSelector::ComplexRendererFns
3274 AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
3275   Register Reg = Root.getReg();
3276   const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3277   if (Def && (Def->getOpcode() == AMDGPU::G_FNEG ||
3278               Def->getOpcode() == AMDGPU::G_FABS))
3279     return {};
3280   return {{
3281       [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3282   }};
3283 }
3284 
3285 std::pair<Register, unsigned>
3286 AMDGPUInstructionSelector::selectVOP3PModsImpl(
3287   Register Src, const MachineRegisterInfo &MRI) const {
3288   unsigned Mods = 0;
3289   MachineInstr *MI = MRI.getVRegDef(Src);
3290 
3291   if (MI && MI->getOpcode() == AMDGPU::G_FNEG &&
3292       // It's possible to see an f32 fneg here, but unlikely.
3293       // TODO: Treat f32 fneg as only high bit.
3294       MRI.getType(Src) == LLT::vector(2, 16)) {
3295     Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
3296     Src = MI->getOperand(1).getReg();
3297     MI = MRI.getVRegDef(Src);
3298   }
3299 
3300   // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
3301 
3302   // Packed instructions do not have abs modifiers.
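  // OP_SEL_1 is the default op_sel_hi setting: each packed source feeds its
  // high 16 bits to the high half of the operation (this describes the usual
  // VOP3P convention; op_sel matching itself is still TODO above).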
3303   Mods |= SISrcMods::OP_SEL_1;
3304 
3305   return std::make_pair(Src, Mods);
3306 }
3307 
3308 InstructionSelector::ComplexRendererFns
3309 AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
3310   MachineRegisterInfo &MRI
3311     = Root.getParent()->getParent()->getParent()->getRegInfo();
3312 
3313   Register Src;
3314   unsigned Mods;
3315   std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI);
3316 
3317   return {{
3318       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3319       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }  // src_mods
3320   }};
3321 }
3322 
3323 InstructionSelector::ComplexRendererFns
3324 AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const {
3325   Register Src;
3326   unsigned Mods;
3327   std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3328   if (!isKnownNeverNaN(Src, *MRI))
3329     return None;
3330 
3331   return {{
3332       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3333       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }  // src_mods
3334   }};
3335 }
3336 
3337 InstructionSelector::ComplexRendererFns
3338 AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
3339   // FIXME: Handle op_sel
3340   return {{
3341       [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
3342       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // src_mods
3343   }};
3344 }
3345 
3346 InstructionSelector::ComplexRendererFns
3347 AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
3348   SmallVector<GEPInfo, 4> AddrInfo;
3349   getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
3350 
3351   if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3352     return None;
3353 
3354   const GEPInfo &GEPInfo = AddrInfo[0];
3355   Optional<int64_t> EncodedImm =
3356       AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm, false);
3357   if (!EncodedImm)
3358     return None;
3359 
3360   unsigned PtrReg = GEPInfo.SgprParts[0];
3361   return {{
3362     [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3363     [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
3364   }};
3365 }
3366 
3367 InstructionSelector::ComplexRendererFns
3368 AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
3369   SmallVector<GEPInfo, 4> AddrInfo;
3370   getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
3371 
3372   if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3373     return None;
3374 
3375   const GEPInfo &GEPInfo = AddrInfo[0];
3376   Register PtrReg = GEPInfo.SgprParts[0];
3377   Optional<int64_t> EncodedImm =
3378       AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
3379   if (!EncodedImm)
3380     return None;
3381 
3382   return {{
3383     [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3384     [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
3385   }};
3386 }
3387 
3388 InstructionSelector::ComplexRendererFns
3389 AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
3390   MachineInstr *MI = Root.getParent();
3391   MachineBasicBlock *MBB = MI->getParent();
3392 
3393   SmallVector<GEPInfo, 4> AddrInfo;
3394   getAddrModeInfo(*MI, *MRI, AddrInfo);
3395 
  // FIXME: We should shrink the GEP if the offset is known to be <= 32 bits;
  // then we could select all ptr + 32-bit offsets, not just immediate offsets.
3398   if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3399     return None;
3400 
3401   const GEPInfo &GEPInfo = AddrInfo[0];
3402   // SGPR offset is unsigned.
3403   if (!GEPInfo.Imm || GEPInfo.Imm < 0 || !isUInt<32>(GEPInfo.Imm))
3404     return None;
3405 
  // If we make it this far, we have a load with a 32-bit immediate offset.
  // It is OK to select this using an SGPR offset, because we have already
  // failed to select this load into one of the _IMM variants since the
  // _IMM patterns are considered before the _SGPR patterns.
3410   Register PtrReg = GEPInfo.SgprParts[0];
3411   Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3412   BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg)
3413           .addImm(GEPInfo.Imm);
3414   return {{
3415     [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3416     [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); }
3417   }};
3418 }
3419 
3420 template <bool Signed>
3421 std::pair<Register, int>
3422 AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const {
3423   MachineInstr *MI = Root.getParent();
3424 
3425   auto Default = std::make_pair(Root.getReg(), 0);
3426 
3427   if (!STI.hasFlatInstOffsets())
3428     return Default;
3429 
3430   Register PtrBase;
3431   int64_t ConstOffset;
3432   std::tie(PtrBase, ConstOffset) =
3433       getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
3434   if (ConstOffset == 0)
3435     return Default;
3436 
3437   unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
3438   if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, Signed))
3439     return Default;
3440 
3441   return std::make_pair(PtrBase, ConstOffset);
3442 }
3443 
3444 InstructionSelector::ComplexRendererFns
3445 AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
3446   auto PtrWithOffset = selectFlatOffsetImpl<false>(Root);
3447 
3448   return {{
3449       [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
3450       [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
3451     }};
3452 }
3453 
3454 InstructionSelector::ComplexRendererFns
3455 AMDGPUInstructionSelector::selectFlatOffsetSigned(MachineOperand &Root) const {
3456   auto PtrWithOffset = selectFlatOffsetImpl<true>(Root);
3457 
3458   return {{
3459       [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
3460       [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
3461     }};
3462 }
3463 
3464 /// Match a zero extend from a 32-bit value to 64-bits.
3465 static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
3466   Register ZExtSrc;
3467   if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc))))
3468     return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
3469 
3470   // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
3471   const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
  if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
    return Register();
3474 
3475   if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
3476     return Def->getOperand(1).getReg();
3477   }
3478 
3479   return Register();
3480 }
3481 
3482 // Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
3483 InstructionSelector::ComplexRendererFns
3484 AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
3485   Register Addr = Root.getReg();
3486   Register PtrBase;
3487   int64_t ConstOffset;
3488   int64_t ImmOffset = 0;
3489 
3490   // Match the immediate offset first, which canonically is moved as low as
3491   // possible.
3492   std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
3493 
3494   if (ConstOffset != 0) {
3495     if (TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, true)) {
3496       Addr = PtrBase;
3497       ImmOffset = ConstOffset;
3498     } else if (ConstOffset > 0) {
3499       auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
3500       if (!PtrBaseDef)
3501         return None;
3502 
3503       if (isSGPR(PtrBaseDef->Reg)) {
3504         // Offset is too large.
3505         //
3506         // saddr + large_offset -> saddr + (voffset = large_offset & ~MaxOffset)
3507         //                         + (large_offset & MaxOffset);
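        //
        // For example, assuming (purely for illustration) a 12-bit unsigned
        // immediate field, an offset of 0x12345 would split into
        // SplitImmOffset = 0x345 (kept in the instruction encoding) and
        // RemainderOffset = 0x12000 (materialized into a VGPR below).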
3508         int64_t SplitImmOffset, RemainderOffset;
3509         std::tie(SplitImmOffset, RemainderOffset)
3510           = TII.splitFlatOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, true);
3511 
3512         if (isUInt<32>(RemainderOffset)) {
3513           MachineInstr *MI = Root.getParent();
3514           MachineBasicBlock *MBB = MI->getParent();
3515           Register HighBits
3516             = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3517 
3518           BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
3519                   HighBits)
3520             .addImm(RemainderOffset);
3521 
3522           return {{
3523             [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); },  // saddr
3524             [=](MachineInstrBuilder &MIB) { MIB.addReg(HighBits); }, // voffset
3525             [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
3526           }};
3527         }
3528       }
3529     }
3530   }
3531 
3532   auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
3533   if (!AddrDef)
3534     return None;
3535 
3536   // Match the variable offset.
3537   if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD) {
3538     // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
3539     // drop this.
3540     if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
3541         AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT)
3542       return None;
3543 
3544     // It's cheaper to materialize a single 32-bit zero for vaddr than the two
3545     // moves required to copy a 64-bit SGPR to VGPR.
3546     const Register SAddr = AddrDef->Reg;
3547     if (!isSGPR(SAddr))
3548       return None;
3549 
3550     MachineInstr *MI = Root.getParent();
3551     MachineBasicBlock *MBB = MI->getParent();
3552     Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3553 
3554     BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
3555             VOffset)
3556       .addImm(0);
3557 
3558     return {{
3559         [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); },    // saddr
3560         [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); },  // voffset
3561         [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
3562     }};
3563   }
3564 
3565   // Look through the SGPR->VGPR copy.
3566   Register SAddr =
3567     getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
3568   if (!SAddr || !isSGPR(SAddr))
3569     return None;
3570 
3571   Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
3572 
3573   // It's possible voffset is an SGPR here, but the copy to VGPR will be
3574   // inserted later.
3575   Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset);
3576   if (!VOffset)
3577     return None;
3578 
3579   return {{[=](MachineInstrBuilder &MIB) { // saddr
3580              MIB.addReg(SAddr);
3581            },
3582            [=](MachineInstrBuilder &MIB) { // voffset
3583              MIB.addReg(VOffset);
3584            },
3585            [=](MachineInstrBuilder &MIB) { // offset
3586              MIB.addImm(ImmOffset);
3587            }}};
3588 }
3589 
3590 InstructionSelector::ComplexRendererFns
3591 AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
3592   Register Addr = Root.getReg();
3593   Register PtrBase;
3594   int64_t ConstOffset;
3595   int64_t ImmOffset = 0;
3596 
3597   // Match the immediate offset first, which canonically is moved as low as
3598   // possible.
3599   std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
3600 
3601   if (ConstOffset != 0 &&
3602       TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, true)) {
3603     Addr = PtrBase;
3604     ImmOffset = ConstOffset;
3605   }
3606 
3607   auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
3608   if (!AddrDef)
3609     return None;
3610 
3611   if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
3612     int FI = AddrDef->MI->getOperand(1).getIndex();
3613     return {{
3614         [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
3615         [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
3616     }};
3617   }
3618 
3619   Register SAddr = AddrDef->Reg;
3620 
3621   if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
3622     Register LHS = AddrDef->MI->getOperand(1).getReg();
3623     Register RHS = AddrDef->MI->getOperand(2).getReg();
3624     auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
3625     auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI);
3626 
3627     if (LHSDef && RHSDef &&
3628         LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
3629         isSGPR(RHSDef->Reg)) {
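      // Fold the frame index and the uniform offset into one scalar add so
      // the scratch saddr stays in an SGPR.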
3630       int FI = LHSDef->MI->getOperand(1).getIndex();
3631       MachineInstr &I = *Root.getParent();
3632       MachineBasicBlock *BB = I.getParent();
3633       const DebugLoc &DL = I.getDebugLoc();
3634       SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3635 
3636       BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), SAddr)
3637         .addFrameIndex(FI)
3638         .addReg(RHSDef->Reg);
3639     }
3640   }
3641 
3642   if (!isSGPR(SAddr))
3643     return None;
3644 
3645   return {{
3646       [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr
3647       [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
3648   }};
3649 }
3650 
3651 static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) {
3652   auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>();
3653   return PSV && PSV->isStack();
3654 }
3655 
3656 InstructionSelector::ComplexRendererFns
3657 AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
3658   MachineInstr *MI = Root.getParent();
3659   MachineBasicBlock *MBB = MI->getParent();
3660   MachineFunction *MF = MBB->getParent();
3661   const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3662 
3663   int64_t Offset = 0;
3664   if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
3665       Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) {
3666     Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3667 
3668     // TODO: Should this be inside the render function? The iterator seems to
3669     // move.
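    // Split the constant address into a 4096-byte aligned base materialized
    // into a VGPR (Offset & ~4095) and a 12-bit immediate part (Offset & 4095)
    // that goes in the MUBUF offset field.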
3670     BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
3671             HighBits)
3672       .addImm(Offset & ~4095);
3673 
3674     return {{[=](MachineInstrBuilder &MIB) { // rsrc
3675                MIB.addReg(Info->getScratchRSrcReg());
3676              },
3677              [=](MachineInstrBuilder &MIB) { // vaddr
3678                MIB.addReg(HighBits);
3679              },
3680              [=](MachineInstrBuilder &MIB) { // soffset
3681                // Use constant zero for soffset and rely on eliminateFrameIndex
3682                // to choose the appropriate frame register if need be.
3683                MIB.addImm(0);
3684              },
3685              [=](MachineInstrBuilder &MIB) { // offset
3686                MIB.addImm(Offset & 4095);
3687              }}};
3688   }
3689 
3690   assert(Offset == 0 || Offset == -1);
3691 
3692   // Try to fold a frame index directly into the MUBUF vaddr field, and any
3693   // offsets.
3694   Optional<int> FI;
3695   Register VAddr = Root.getReg();
3696   if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) {
3697     if (isBaseWithConstantOffset(Root, *MRI)) {
3698       const MachineOperand &LHS = RootDef->getOperand(1);
3699       const MachineOperand &RHS = RootDef->getOperand(2);
3700       const MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg());
3701       const MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg());
3702       if (LHSDef && RHSDef) {
3703         int64_t PossibleOffset =
3704             RHSDef->getOperand(1).getCImm()->getSExtValue();
3705         if (SIInstrInfo::isLegalMUBUFImmOffset(PossibleOffset) &&
3706             (!STI.privateMemoryResourceIsRangeChecked() ||
3707              KnownBits->signBitIsZero(LHS.getReg()))) {
3708           if (LHSDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
3709             FI = LHSDef->getOperand(1).getIndex();
3710           else
3711             VAddr = LHS.getReg();
3712           Offset = PossibleOffset;
3713         }
3714       }
3715     } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
3716       FI = RootDef->getOperand(1).getIndex();
3717     }
3718   }
3719 
3720   return {{[=](MachineInstrBuilder &MIB) { // rsrc
3721              MIB.addReg(Info->getScratchRSrcReg());
3722            },
3723            [=](MachineInstrBuilder &MIB) { // vaddr
3724              if (FI.hasValue())
3725                MIB.addFrameIndex(FI.getValue());
3726              else
3727                MIB.addReg(VAddr);
3728            },
3729            [=](MachineInstrBuilder &MIB) { // soffset
3730              // Use constant zero for soffset and rely on eliminateFrameIndex
3731              // to choose the appropriate frame register if need be.
3732              MIB.addImm(0);
3733            },
3734            [=](MachineInstrBuilder &MIB) { // offset
3735              MIB.addImm(Offset);
3736            }}};
3737 }
3738 
3739 bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
3740                                                 int64_t Offset) const {
3741   if (!isUInt<16>(Offset))
3742     return false;
3743 
3744   if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
3745     return true;
3746 
  // On Southern Islands, instructions with a negative base value and an offset
  // don't seem to work.
3749   return KnownBits->signBitIsZero(Base);
3750 }
3751 
3752 bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
3753                                                  int64_t Offset1,
3754                                                  unsigned Size) const {
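  // The two-address DS forms (read2/write2 style) encode two 8-bit offsets in
  // units of the access size, so each offset must be a multiple of Size and
  // its scaled value must fit in 8 bits.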
3755   if (Offset0 % Size != 0 || Offset1 % Size != 0)
3756     return false;
3757   if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
3758     return false;
3759 
3760   if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
3761     return true;
3762 
  // On Southern Islands, instructions with a negative base value and an offset
  // don't seem to work.
3765   return KnownBits->signBitIsZero(Base);
3766 }
3767 
3768 InstructionSelector::ComplexRendererFns
3769 AMDGPUInstructionSelector::selectMUBUFScratchOffset(
3770     MachineOperand &Root) const {
3771   MachineInstr *MI = Root.getParent();
3772   MachineBasicBlock *MBB = MI->getParent();
3773 
3774   int64_t Offset = 0;
3775   if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
3776       !SIInstrInfo::isLegalMUBUFImmOffset(Offset))
3777     return {};
3778 
3779   const MachineFunction *MF = MBB->getParent();
3780   const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3781   const MachineMemOperand *MMO = *MI->memoperands_begin();
3782   const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();
3783 
3784   return {{
3785       [=](MachineInstrBuilder &MIB) { // rsrc
3786         MIB.addReg(Info->getScratchRSrcReg());
3787       },
3788       [=](MachineInstrBuilder &MIB) { // soffset
3789         if (isStackPtrRelative(PtrInfo))
3790           MIB.addReg(Info->getStackPtrOffsetReg());
3791         else
3792           MIB.addImm(0);
3793       },
3794       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
3795   }};
3796 }
3797 
3798 std::pair<Register, unsigned>
3799 AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
3800   const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
3801   if (!RootDef)
3802     return std::make_pair(Root.getReg(), 0);
3803 
3804   int64_t ConstAddr = 0;
3805 
3806   Register PtrBase;
3807   int64_t Offset;
3808   std::tie(PtrBase, Offset) =
3809     getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
3810 
3811   if (Offset) {
3812     if (isDSOffsetLegal(PtrBase, Offset)) {
3813       // (add n0, c0)
3814       return std::make_pair(PtrBase, Offset);
3815     }
3816   } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
3817     // TODO
3818 
3819 
3820   } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
3821     // TODO
3822 
3823   }
3824 
3825   return std::make_pair(Root.getReg(), 0);
3826 }
3827 
3828 InstructionSelector::ComplexRendererFns
3829 AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
3830   Register Reg;
3831   unsigned Offset;
3832   std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
3833   return {{
3834       [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3835       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
3836     }};
3837 }
3838 
3839 InstructionSelector::ComplexRendererFns
3840 AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
3841   return selectDSReadWrite2(Root, 4);
3842 }
3843 
3844 InstructionSelector::ComplexRendererFns
3845 AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
3846   return selectDSReadWrite2(Root, 8);
3847 }
3848 
3849 InstructionSelector::ComplexRendererFns
3850 AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
3851                                               unsigned Size) const {
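  // The impl returns the base register plus the first offset already scaled
  // to element units; the second slot is simply the next element.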
3852   Register Reg;
3853   unsigned Offset;
3854   std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size);
3855   return {{
3856       [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3857       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset + 1); }
3859     }};
3860 }
3861 
3862 std::pair<Register, unsigned>
3863 AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
3864                                                   unsigned Size) const {
3865   const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
3866   if (!RootDef)
3867     return std::make_pair(Root.getReg(), 0);
3868 
3869   int64_t ConstAddr = 0;
3870 
3871   Register PtrBase;
3872   int64_t Offset;
3873   std::tie(PtrBase, Offset) =
3874     getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
3875 
3876   if (Offset) {
3877     int64_t OffsetValue0 = Offset;
3878     int64_t OffsetValue1 = Offset + Size;
3879     if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) {
3880       // (add n0, c0)
3881       return std::make_pair(PtrBase, OffsetValue0 / Size);
3882     }
3883   } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
3884     // TODO
3885 
3886   } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
3887     // TODO
3888 
3889   }
3890 
3891   return std::make_pair(Root.getReg(), 0);
3892 }
3893 
3894 /// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
3895 /// the base value with the constant offset. There may be intervening copies
3896 /// between \p Root and the identified constant. Returns \p Root, 0 if this does
3897 /// not match the pattern.
3898 std::pair<Register, int64_t>
3899 AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
3900   Register Root, const MachineRegisterInfo &MRI) const {
3901   MachineInstr *RootI = getDefIgnoringCopies(Root, MRI);
3902   if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
3903     return {Root, 0};
3904 
3905   MachineOperand &RHS = RootI->getOperand(2);
3906   Optional<ValueAndVReg> MaybeOffset
3907     = getConstantVRegValWithLookThrough(RHS.getReg(), MRI, true);
3908   if (!MaybeOffset)
3909     return {Root, 0};
3910   return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue()};
3911 }
3912 
3913 static void addZeroImm(MachineInstrBuilder &MIB) {
3914   MIB.addImm(0);
3915 }
3916 
3917 /// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
3918 /// BasePtr is not valid, a null base pointer will be used.
3919 static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
3920                           uint32_t FormatLo, uint32_t FormatHi,
3921                           Register BasePtr) {
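  // The 128-bit resource descriptor is assembled from four 32-bit words:
  // words 0-1 hold the base pointer, while words 2-3 take FormatLo/FormatHi
  // (roughly the num_records and data-format fields of the descriptor).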
3922   Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
3923   Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
3924   Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3925   Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
3926 
3927   B.buildInstr(AMDGPU::S_MOV_B32)
3928     .addDef(RSrc2)
3929     .addImm(FormatLo);
3930   B.buildInstr(AMDGPU::S_MOV_B32)
3931     .addDef(RSrc3)
3932     .addImm(FormatHi);
3933 
  // Build the half of the register that holds the constants before building
  // the full 128-bit register. If we are building multiple resource
  // descriptors, this allows CSE of the 2-component subregister.
3937   B.buildInstr(AMDGPU::REG_SEQUENCE)
3938     .addDef(RSrcHi)
3939     .addReg(RSrc2)
3940     .addImm(AMDGPU::sub0)
3941     .addReg(RSrc3)
3942     .addImm(AMDGPU::sub1);
3943 
3944   Register RSrcLo = BasePtr;
3945   if (!BasePtr) {
3946     RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3947     B.buildInstr(AMDGPU::S_MOV_B64)
3948       .addDef(RSrcLo)
3949       .addImm(0);
3950   }
3951 
3952   B.buildInstr(AMDGPU::REG_SEQUENCE)
3953     .addDef(RSrc)
3954     .addReg(RSrcLo)
3955     .addImm(AMDGPU::sub0_sub1)
3956     .addReg(RSrcHi)
3957     .addImm(AMDGPU::sub2_sub3);
3958 
3959   return RSrc;
3960 }
3961 
3962 static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
3963                                 const SIInstrInfo &TII, Register BasePtr) {
3964   uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
3965 
3966   // FIXME: Why are half the "default" bits ignored based on the addressing
3967   // mode?
3968   return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
3969 }
3970 
3971 static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
3972                                const SIInstrInfo &TII, Register BasePtr) {
3973   uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
3974 
3975   // FIXME: Why are half the "default" bits ignored based on the addressing
3976   // mode?
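  // The -1 for the low format word presumably fills num_records with all-ones
  // so that offset-only addressing is effectively unbounded; this is an
  // interpretation of the constant, not something documented in this file.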
3977   return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
3978 }
3979 
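// Decompose a MUBUF address into its components: N0 becomes the base (with a
// 32-bit constant offset, if any, split out into Offset), and if N0 is itself
// a pointer add, its two addends are recorded as N2 and N3.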
3980 AMDGPUInstructionSelector::MUBUFAddressData
3981 AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
3982   MUBUFAddressData Data;
3983   Data.N0 = Src;
3984 
3985   Register PtrBase;
3986   int64_t Offset;
3987 
3988   std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI);
3989   if (isUInt<32>(Offset)) {
3990     Data.N0 = PtrBase;
3991     Data.Offset = Offset;
3992   }
3993 
3994   if (MachineInstr *InputAdd
3995       = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
3996     Data.N2 = InputAdd->getOperand(1).getReg();
3997     Data.N3 = InputAdd->getOperand(2).getReg();
3998 
    // FIXME: Need to fix extra SGPR->VGPR copies inserted
    // FIXME: This assumes the value is defined by operand 0 of its def
4001     //
4002     // TODO: Remove this when we have copy folding optimizations after
4003     // RegBankSelect.
4004     Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
4005     Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
4006   }
4007 
4008   return Data;
4009 }
4010 
/// Return whether the addr64 MUBUF mode should be used for the given address.
4012 bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
4013   // (ptr_add N2, N3) -> addr64, or
4014   // (ptr_add (ptr_add N2, N3), C1) -> addr64
4015   if (Addr.N2)
4016     return true;
4017 
4018   const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
4019   return N0Bank->getID() == AMDGPU::VGPRRegBankID;
4020 }
4021 
4022 /// Split an immediate offset \p ImmOffset depending on whether it fits in the
4023 /// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
4024 /// component.
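/// For example, an offset of 5000 does not fit in the 12-bit MUBUF immediate
/// field, so it is moved entirely into \p SOffset (via S_MOV_B32) and
/// \p ImmOffset is reset to 0.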
4025 void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
4026   MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
4027   if (SIInstrInfo::isLegalMUBUFImmOffset(ImmOffset))
4028     return;
4029 
4030   // Illegal offset, store it in soffset.
4031   SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4032   B.buildInstr(AMDGPU::S_MOV_B32)
4033     .addDef(SOffset)
4034     .addImm(ImmOffset);
4035   ImmOffset = 0;
4036 }
4037 
4038 bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
4039   MachineOperand &Root, Register &VAddr, Register &RSrcReg,
4040   Register &SOffset, int64_t &Offset) const {
4041   // FIXME: Predicates should stop this from reaching here.
  // The addr64 bit was removed for Volcanic Islands.
4043   if (!STI.hasAddr64() || STI.useFlatForGlobal())
4044     return false;
4045 
4046   MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
4047   if (!shouldUseAddr64(AddrData))
4048     return false;
4049 
4050   Register N0 = AddrData.N0;
4051   Register N2 = AddrData.N2;
4052   Register N3 = AddrData.N3;
4053   Offset = AddrData.Offset;
4054 
4055   // Base pointer for the SRD.
4056   Register SRDPtr;
4057 
4058   if (N2) {
4059     if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
4060       assert(N3);
4061       if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
4062         // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
4063         // addr64, and construct the default resource from a 0 address.
4064         VAddr = N0;
4065       } else {
4066         SRDPtr = N3;
4067         VAddr = N2;
4068       }
4069     } else {
4070       // N2 is not divergent.
4071       SRDPtr = N2;
4072       VAddr = N3;
4073     }
4074   } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
4075     // Use the default null pointer in the resource
4076     VAddr = N0;
4077   } else {
4078     // N0 -> offset, or
4079     // (N0 + C1) -> offset
4080     SRDPtr = N0;
4081   }
4082 
4083   MachineIRBuilder B(*Root.getParent());
4084   RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
4085   splitIllegalMUBUFOffset(B, SOffset, Offset);
4086   return true;
4087 }
4088 
4089 bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
4090   MachineOperand &Root, Register &RSrcReg, Register &SOffset,
4091   int64_t &Offset) const {
4092 
4093   // FIXME: Pattern should not reach here.
4094   if (STI.useFlatForGlobal())
4095     return false;
4096 
4097   MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
4098   if (shouldUseAddr64(AddrData))
4099     return false;
4100 
4101   // N0 -> offset, or
4102   // (N0 + C1) -> offset
4103   Register SRDPtr = AddrData.N0;
4104   Offset = AddrData.Offset;
4105 
4106   // TODO: Look through extensions for 32-bit soffset.
4107   MachineIRBuilder B(*Root.getParent());
4108 
4109   RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
4110   splitIllegalMUBUFOffset(B, SOffset, Offset);
4111   return true;
4112 }
4113 
4114 InstructionSelector::ComplexRendererFns
4115 AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
4116   Register VAddr;
4117   Register RSrcReg;
4118   Register SOffset;
4119   int64_t Offset = 0;
4120 
4121   if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
4122     return {};
4123 
4124   // FIXME: Use defaulted operands for trailing 0s and remove from the complex
4125   // pattern.
4126   return {{
4127       [=](MachineInstrBuilder &MIB) {  // rsrc
4128         MIB.addReg(RSrcReg);
4129       },
4130       [=](MachineInstrBuilder &MIB) { // vaddr
4131         MIB.addReg(VAddr);
4132       },
4133       [=](MachineInstrBuilder &MIB) { // soffset
4134         if (SOffset)
4135           MIB.addReg(SOffset);
4136         else
4137           MIB.addImm(0);
4138       },
4139       [=](MachineInstrBuilder &MIB) { // offset
4140         MIB.addImm(Offset);
4141       },
4142       addZeroImm, //  glc
4143       addZeroImm, //  slc
4144       addZeroImm, //  tfe
4145       addZeroImm, //  dlc
4146       addZeroImm  //  swz
4147     }};
4148 }
4149 
4150 InstructionSelector::ComplexRendererFns
4151 AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
4152   Register RSrcReg;
4153   Register SOffset;
4154   int64_t Offset = 0;
4155 
4156   if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
4157     return {};
4158 
4159   return {{
4160       [=](MachineInstrBuilder &MIB) {  // rsrc
4161         MIB.addReg(RSrcReg);
4162       },
4163       [=](MachineInstrBuilder &MIB) { // soffset
4164         if (SOffset)
4165           MIB.addReg(SOffset);
4166         else
4167           MIB.addImm(0);
4168       },
4169       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
4170       addZeroImm, //  glc
4171       addZeroImm, //  slc
4172       addZeroImm, //  tfe
4173       addZeroImm, //  dlc
4174       addZeroImm  //  swz
4175     }};
4176 }
4177 
4178 InstructionSelector::ComplexRendererFns
4179 AMDGPUInstructionSelector::selectMUBUFAddr64Atomic(MachineOperand &Root) const {
4180   Register VAddr;
4181   Register RSrcReg;
4182   Register SOffset;
4183   int64_t Offset = 0;
4184 
4185   if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
4186     return {};
4187 
4188   // FIXME: Use defaulted operands for trailing 0s and remove from the complex
4189   // pattern.
4190   return {{
4191       [=](MachineInstrBuilder &MIB) {  // rsrc
4192         MIB.addReg(RSrcReg);
4193       },
4194       [=](MachineInstrBuilder &MIB) { // vaddr
4195         MIB.addReg(VAddr);
4196       },
4197       [=](MachineInstrBuilder &MIB) { // soffset
4198         if (SOffset)
4199           MIB.addReg(SOffset);
4200         else
4201           MIB.addImm(0);
4202       },
4203       [=](MachineInstrBuilder &MIB) { // offset
4204         MIB.addImm(Offset);
4205       },
4206       addZeroImm //  slc
4207     }};
4208 }
4209 
4210 InstructionSelector::ComplexRendererFns
4211 AMDGPUInstructionSelector::selectMUBUFOffsetAtomic(MachineOperand &Root) const {
4212   Register RSrcReg;
4213   Register SOffset;
4214   int64_t Offset = 0;
4215 
4216   if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
4217     return {};
4218 
4219   return {{
4220       [=](MachineInstrBuilder &MIB) {  // rsrc
4221         MIB.addReg(RSrcReg);
4222       },
4223       [=](MachineInstrBuilder &MIB) { // soffset
4224         if (SOffset)
4225           MIB.addReg(SOffset);
4226         else
4227           MIB.addImm(0);
4228       },
4229       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
4230       addZeroImm //  slc
4231     }};
4232 }
4233 
4234 /// Get an immediate that must be 32-bits, and treated as zero extended.
4235 static Optional<uint64_t> getConstantZext32Val(Register Reg,
4236                                                const MachineRegisterInfo &MRI) {
  // getConstantVRegSExtVal sign-extends any value, so check whether that
  // matters.
4238   Optional<int64_t> OffsetVal = getConstantVRegSExtVal(Reg, MRI);
4239   if (!OffsetVal || !isInt<32>(*OffsetVal))
4240     return None;
4241   return Lo_32(*OffsetVal);
4242 }
4243 
4244 InstructionSelector::ComplexRendererFns
4245 AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
4246   Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
4247   if (!OffsetVal)
4248     return {};
4249 
4250   Optional<int64_t> EncodedImm =
4251       AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
4252   if (!EncodedImm)
4253     return {};
4254 
4255   return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }  }};
4256 }
4257 
4258 InstructionSelector::ComplexRendererFns
4259 AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
4260   assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
4261 
4262   Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
4263   if (!OffsetVal)
4264     return {};
4265 
4266   Optional<int64_t> EncodedImm
4267     = AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
4268   if (!EncodedImm)
4269     return {};
4270 
4271   return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }  }};
4272 }
4273 
4274 void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
4275                                                  const MachineInstr &MI,
4276                                                  int OpIdx) const {
4277   assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
4278          "Expected G_CONSTANT");
4279   MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
4280 }
4281 
4282 void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
4283                                                 const MachineInstr &MI,
4284                                                 int OpIdx) const {
4285   assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
4286          "Expected G_CONSTANT");
4287   MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
4288 }
4289 
4290 void AMDGPUInstructionSelector::renderBitcastImm(MachineInstrBuilder &MIB,
4291                                                  const MachineInstr &MI,
4292                                                  int OpIdx) const {
4293   assert(OpIdx == -1);
4294 
4295   const MachineOperand &Op = MI.getOperand(1);
4296   if (MI.getOpcode() == TargetOpcode::G_FCONSTANT)
4297     MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
4298   else {
4299     assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
4300     MIB.addImm(Op.getCImm()->getSExtValue());
4301   }
4302 }
4303 
4304 void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
4305                                                 const MachineInstr &MI,
4306                                                 int OpIdx) const {
4307   assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
4308          "Expected G_CONSTANT");
4309   MIB.addImm(MI.getOperand(1).getCImm()->getValue().countPopulation());
4310 }
4311 
4312 /// This only really exists to satisfy DAG type checking machinery, so is a
4313 /// no-op here.
4314 void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
4315                                                 const MachineInstr &MI,
4316                                                 int OpIdx) const {
4317   MIB.addImm(MI.getOperand(OpIdx).getImm());
4318 }
4319 
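// The buffer intrinsics carry their auxiliary cache-policy bits packed into a
// single immediate operand; judging by the shift amounts below, bit 0 is glc,
// bit 1 is slc, bit 2 is dlc and bit 3 is swz, and each renderer extracts one
// of them.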
4320 void AMDGPUInstructionSelector::renderExtractGLC(MachineInstrBuilder &MIB,
4321                                                  const MachineInstr &MI,
4322                                                  int OpIdx) const {
4323   assert(OpIdx >= 0 && "expected to match an immediate operand");
4324   MIB.addImm(MI.getOperand(OpIdx).getImm() & 1);
4325 }
4326 
4327 void AMDGPUInstructionSelector::renderExtractSLC(MachineInstrBuilder &MIB,
4328                                                  const MachineInstr &MI,
4329                                                  int OpIdx) const {
4330   assert(OpIdx >= 0 && "expected to match an immediate operand");
4331   MIB.addImm((MI.getOperand(OpIdx).getImm() >> 1) & 1);
4332 }
4333 
4334 void AMDGPUInstructionSelector::renderExtractDLC(MachineInstrBuilder &MIB,
4335                                                  const MachineInstr &MI,
4336                                                  int OpIdx) const {
4337   assert(OpIdx >= 0 && "expected to match an immediate operand");
4338   MIB.addImm((MI.getOperand(OpIdx).getImm() >> 2) & 1);
4339 }
4340 
4341 void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
4342                                                  const MachineInstr &MI,
4343                                                  int OpIdx) const {
4344   assert(OpIdx >= 0 && "expected to match an immediate operand");
4345   MIB.addImm((MI.getOperand(OpIdx).getImm() >> 3) & 1);
4346 }
4347 
4348 void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
4349                                                  const MachineInstr &MI,
4350                                                  int OpIdx) const {
  MIB.addFrameIndex(MI.getOperand(1).getIndex());
4352 }
4353 
4354 bool AMDGPUInstructionSelector::isInlineImmediate16(int64_t Imm) const {
4355   return AMDGPU::isInlinableLiteral16(Imm, STI.hasInv2PiInlineImm());
4356 }
4357 
4358 bool AMDGPUInstructionSelector::isInlineImmediate32(int64_t Imm) const {
4359   return AMDGPU::isInlinableLiteral32(Imm, STI.hasInv2PiInlineImm());
4360 }
4361 
4362 bool AMDGPUInstructionSelector::isInlineImmediate64(int64_t Imm) const {
4363   return AMDGPU::isInlinableLiteral64(Imm, STI.hasInv2PiInlineImm());
4364 }
4365 
4366 bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
4367   return TII.isInlineConstant(Imm);
4368 }
4369