1 //===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the InstructionSelector class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPUInstructionSelector.h"
15 #include "AMDGPU.h"
16 #include "AMDGPUGlobalISelUtils.h"
17 #include "AMDGPUInstrInfo.h"
18 #include "AMDGPURegisterBankInfo.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
22 #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
24 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
25 #include "llvm/IR/DiagnosticInfo.h"
26 
27 #define DEBUG_TYPE "amdgpu-isel"
28 
29 using namespace llvm;
30 using namespace MIPatternMatch;
31 
32 static cl::opt<bool> AllowRiskySelect(
33   "amdgpu-global-isel-risky-select",
34   cl::desc("Allow GlobalISel to select cases that are likely to not work yet"),
35   cl::init(false),
36   cl::ReallyHidden);
37 
38 #define GET_GLOBALISEL_IMPL
39 #define AMDGPUSubtarget GCNSubtarget
40 #include "AMDGPUGenGlobalISel.inc"
41 #undef GET_GLOBALISEL_IMPL
42 #undef AMDGPUSubtarget
43 
44 AMDGPUInstructionSelector::AMDGPUInstructionSelector(
45     const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
46     const AMDGPUTargetMachine &TM)
47     : InstructionSelector(), TII(*STI.getInstrInfo()),
48       TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
49       STI(STI),
50       EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG),
51 #define GET_GLOBALISEL_PREDICATES_INIT
52 #include "AMDGPUGenGlobalISel.inc"
53 #undef GET_GLOBALISEL_PREDICATES_INIT
54 #define GET_GLOBALISEL_TEMPORARIES_INIT
55 #include "AMDGPUGenGlobalISel.inc"
56 #undef GET_GLOBALISEL_TEMPORARIES_INIT
57 {
58 }
59 
60 const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
61 
62 void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits &KB,
63                                         CodeGenCoverage &CoverageInfo) {
64   MRI = &MF.getRegInfo();
65   Subtarget = &MF.getSubtarget<GCNSubtarget>();
66   InstructionSelector::setupMF(MF, KB, CoverageInfo);
67 }
68 
69 bool AMDGPUInstructionSelector::isVCC(Register Reg,
70                                       const MachineRegisterInfo &MRI) const {
71   // The verifier is oblivious to s1 being a valid value for wavesize registers.
72   if (Reg.isPhysical())
73     return false;
74 
75   auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
76   const TargetRegisterClass *RC =
77       RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
78   if (RC) {
79     const LLT Ty = MRI.getType(Reg);
80     return RC->hasSuperClassEq(TRI.getBoolRC()) &&
81            Ty.isValid() && Ty.getSizeInBits() == 1;
82   }
83 
84   const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
85   return RB->getID() == AMDGPU::VCCRegBankID;
86 }
87 
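// Rewrite a copy-like intrinsic (wqm, softwqm, wwm) into its target pseudo:
// drop the intrinsic ID operand, add an implicit use of EXEC, and constrain
// the source and destination to the same register class.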
88 bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
89                                                         unsigned NewOpc) const {
90   MI.setDesc(TII.get(NewOpc));
91   MI.RemoveOperand(1); // Remove intrinsic ID.
92   MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
93 
94   MachineOperand &Dst = MI.getOperand(0);
95   MachineOperand &Src = MI.getOperand(1);
96 
97   // TODO: This should be legalized to s32 if needed
98   if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
99     return false;
100 
101   const TargetRegisterClass *DstRC
102     = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
103   const TargetRegisterClass *SrcRC
104     = TRI.getConstrainedRegClassForOperand(Src, *MRI);
105   if (!DstRC || DstRC != SrcRC)
106     return false;
107 
108   return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
109          RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
110 }
111 
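// Select a generic COPY. Copies producing a wave-mask (VCC bank) value need
// extra care: an SCC source only needs register class constraints, a constant
// source becomes an s_mov of all-ones or zero, and any other scalar source is
// masked to bit 0 and compared against zero to form the wave mask.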
112 bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
113   const DebugLoc &DL = I.getDebugLoc();
114   MachineBasicBlock *BB = I.getParent();
115   I.setDesc(TII.get(TargetOpcode::COPY));
116 
117   const MachineOperand &Src = I.getOperand(1);
118   MachineOperand &Dst = I.getOperand(0);
119   Register DstReg = Dst.getReg();
120   Register SrcReg = Src.getReg();
121 
122   if (isVCC(DstReg, *MRI)) {
123     if (SrcReg == AMDGPU::SCC) {
124       const TargetRegisterClass *RC
125         = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
126       if (!RC)
127         return true;
128       return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
129     }
130 
131     if (!isVCC(SrcReg, *MRI)) {
132       // TODO: Should probably leave the copy and let copyPhysReg expand it.
133       if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
134         return false;
135 
136       const TargetRegisterClass *SrcRC
137         = TRI.getConstrainedRegClassForOperand(Src, *MRI);
138 
139       Optional<ValueAndVReg> ConstVal =
140           getConstantVRegValWithLookThrough(SrcReg, *MRI, true, true);
141       if (ConstVal) {
142         unsigned MovOpc =
143             STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
144         BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
145             .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
146       } else {
147         Register MaskedReg = MRI->createVirtualRegister(SrcRC);
148 
149         // We can't trust the high bits at this point, so clear them.
150 
151         // TODO: Skip masking high bits if def is known boolean.
152 
153         unsigned AndOpc =
154             TRI.isSGPRClass(SrcRC) ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
155         BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
156             .addImm(1)
157             .addReg(SrcReg);
158         BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
159             .addImm(0)
160             .addReg(MaskedReg);
161       }
162 
163       if (!MRI->getRegClassOrNull(SrcReg))
164         MRI->setRegClass(SrcReg, SrcRC);
165       I.eraseFromParent();
166       return true;
167     }
168 
169     const TargetRegisterClass *RC =
170       TRI.getConstrainedRegClassForOperand(Dst, *MRI);
171     if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
172       return false;
173 
174     return true;
175   }
176 
177   for (const MachineOperand &MO : I.operands()) {
178     if (MO.getReg().isPhysical())
179       continue;
180 
181     const TargetRegisterClass *RC =
182             TRI.getConstrainedRegClassForOperand(MO, *MRI);
183     if (!RC)
184       continue;
185     RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
186   }
187   return true;
188 }
189 
190 bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
191   const Register DefReg = I.getOperand(0).getReg();
192   const LLT DefTy = MRI->getType(DefReg);
193   if (DefTy == LLT::scalar(1)) {
194     if (!AllowRiskySelect) {
195       LLVM_DEBUG(dbgs() << "Skipping risky boolean phi\n");
196       return false;
197     }
198 
199     LLVM_DEBUG(dbgs() << "Selecting risky boolean phi\n");
200   }
201 
202   // TODO: Verify this doesn't have insane operands (e.g. a VGPR to SGPR copy)
203 
204   const RegClassOrRegBank &RegClassOrBank =
205     MRI->getRegClassOrRegBank(DefReg);
206 
207   const TargetRegisterClass *DefRC
208     = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
209   if (!DefRC) {
210     if (!DefTy.isValid()) {
211       LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
212       return false;
213     }
214 
215     const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
216     DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, *MRI);
217     if (!DefRC) {
218       LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
219       return false;
220     }
221   }
222 
223   // TODO: Verify that all registers have the same bank
224   I.setDesc(TII.get(TargetOpcode::PHI));
225   return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
226 }
227 
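// Return a 32-bit half (sub0 or sub1) of a 64-bit operand. Register operands
// get a COPY from the corresponding subregister; immediates are split into
// their low or high 32 bits.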
228 MachineOperand
229 AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
230                                            const TargetRegisterClass &SubRC,
231                                            unsigned SubIdx) const {
232 
233   MachineInstr *MI = MO.getParent();
234   MachineBasicBlock *BB = MO.getParent()->getParent();
235   Register DstReg = MRI->createVirtualRegister(&SubRC);
236 
237   if (MO.isReg()) {
238     unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
239     Register Reg = MO.getReg();
240     BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
241             .addReg(Reg, 0, ComposedSubIdx);
242 
243     return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
244                                      MO.isKill(), MO.isDead(), MO.isUndef(),
245                                      MO.isEarlyClobber(), 0, MO.isDebug(),
246                                      MO.isInternalRead());
247   }
248 
249   assert(MO.isImm());
250 
251   APInt Imm(64, MO.getImm());
252 
253   switch (SubIdx) {
254   default:
255     llvm_unreachable("do not know how to split immediate with this sub index.");
256   case AMDGPU::sub0:
257     return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
258   case AMDGPU::sub1:
259     return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
260   }
261 }
262 
263 static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
264   switch (Opc) {
265   case AMDGPU::G_AND:
266     return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
267   case AMDGPU::G_OR:
268     return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
269   case AMDGPU::G_XOR:
270     return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
271   default:
272     llvm_unreachable("not a bit op");
273   }
274 }
275 
276 bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
277   Register DstReg = I.getOperand(0).getReg();
278   unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
279 
280   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
281   if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
282       DstRB->getID() != AMDGPU::VCCRegBankID)
283     return false;
284 
285   bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
286                             STI.isWave64());
287   I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));
288 
289   // Dead implicit-def of scc
290   I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
291                                          true, // isImp
292                                          false, // isKill
293                                          true)); // isDead
294   return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
295 }
296 
297 bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
298   MachineBasicBlock *BB = I.getParent();
299   MachineFunction *MF = BB->getParent();
300   Register DstReg = I.getOperand(0).getReg();
301   const DebugLoc &DL = I.getDebugLoc();
302   LLT Ty = MRI->getType(DstReg);
303   if (Ty.isVector())
304     return false;
305 
306   unsigned Size = Ty.getSizeInBits();
307   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
308   const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
309   const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;
310 
311   if (Size == 32) {
312     if (IsSALU) {
313       const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
314       MachineInstr *Add =
315         BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
316         .add(I.getOperand(1))
317         .add(I.getOperand(2));
318       I.eraseFromParent();
319       return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
320     }
321 
322     if (STI.hasAddNoCarry()) {
323       const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
324       I.setDesc(TII.get(Opc));
325       I.addOperand(*MF, MachineOperand::CreateImm(0));
326       I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
327       return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
328     }
329 
330     const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;
331 
332     Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
333     MachineInstr *Add
334       = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
335       .addDef(UnusedCarry, RegState::Dead)
336       .add(I.getOperand(1))
337       .add(I.getOperand(2))
338       .addImm(0);
339     I.eraseFromParent();
340     return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
341   }
342 
343   assert(!Sub && "illegal sub should not reach here");
344 
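  // 64-bit add: split both sources into 32-bit halves, add the low halves
  // producing a carry, add the high halves consuming that carry, and
  // recombine the two results with a REG_SEQUENCE.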
345   const TargetRegisterClass &RC
346     = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
347   const TargetRegisterClass &HalfRC
348     = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;
349 
350   MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
351   MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
352   MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
353   MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));
354 
355   Register DstLo = MRI->createVirtualRegister(&HalfRC);
356   Register DstHi = MRI->createVirtualRegister(&HalfRC);
357 
358   if (IsSALU) {
359     BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
360       .add(Lo1)
361       .add(Lo2);
362     BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
363       .add(Hi1)
364       .add(Hi2);
365   } else {
366     const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
367     Register CarryReg = MRI->createVirtualRegister(CarryRC);
368     BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
369       .addDef(CarryReg)
370       .add(Lo1)
371       .add(Lo2)
372       .addImm(0);
373     MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
374       .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
375       .add(Hi1)
376       .add(Hi2)
377       .addReg(CarryReg, RegState::Kill)
378       .addImm(0);
379 
380     if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
381       return false;
382   }
383 
384   BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
385     .addReg(DstLo)
386     .addImm(AMDGPU::sub0)
387     .addReg(DstHi)
388     .addImm(AMDGPU::sub1);
389 
390 
391   if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
392     return false;
393 
394   I.eraseFromParent();
395   return true;
396 }
397 
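// Select overflowing add/sub. If the carry-out is a wave mask (VCC bank),
// this maps directly onto the VALU carry-out/carry-in instructions; otherwise
// the SALU forms are used and the carry is copied through SCC.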
398 bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
399   MachineInstr &I) const {
400   MachineBasicBlock *BB = I.getParent();
401   MachineFunction *MF = BB->getParent();
402   const DebugLoc &DL = I.getDebugLoc();
403   Register Dst0Reg = I.getOperand(0).getReg();
404   Register Dst1Reg = I.getOperand(1).getReg();
405   const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
406                      I.getOpcode() == AMDGPU::G_UADDE;
407   const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
408                           I.getOpcode() == AMDGPU::G_USUBE;
409 
410   if (isVCC(Dst1Reg, *MRI)) {
411     unsigned NoCarryOpc =
412         IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
413     unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
414     I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
415     I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
416     I.addOperand(*MF, MachineOperand::CreateImm(0));
417     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
418   }
419 
420   Register Src0Reg = I.getOperand(2).getReg();
421   Register Src1Reg = I.getOperand(3).getReg();
422 
423   if (HasCarryIn) {
424     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
425       .addReg(I.getOperand(4).getReg());
426   }
427 
428   unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
429   unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
430 
431   BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
432     .add(I.getOperand(2))
433     .add(I.getOperand(3));
434   BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
435     .addReg(AMDGPU::SCC);
436 
437   if (!MRI->getRegClassOrNull(Dst1Reg))
438     MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
439 
440   if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
441       !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
442       !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
443     return false;
444 
445   if (HasCarryIn &&
446       !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
447                                     AMDGPU::SReg_32RegClass, *MRI))
448     return false;
449 
450   I.eraseFromParent();
451   return true;
452 }
453 
454 // TODO: We should probably legalize these to only using 32-bit results.
455 bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
456   MachineBasicBlock *BB = I.getParent();
457   Register DstReg = I.getOperand(0).getReg();
458   Register SrcReg = I.getOperand(1).getReg();
459   LLT DstTy = MRI->getType(DstReg);
460   LLT SrcTy = MRI->getType(SrcReg);
461   const unsigned SrcSize = SrcTy.getSizeInBits();
462   unsigned DstSize = DstTy.getSizeInBits();
463 
464   // TODO: Should handle any multiple of 32 offset.
465   unsigned Offset = I.getOperand(2).getImm();
466   if (Offset % 32 != 0 || DstSize > 128)
467     return false;
468 
469   // 16-bit operations really use 32-bit registers.
470   // FIXME: Probably should not allow 16-bit G_EXTRACT results.
471   if (DstSize == 16)
472     DstSize = 32;
473 
474   const TargetRegisterClass *DstRC =
475     TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
476   if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
477     return false;
478 
479   const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
480   const TargetRegisterClass *SrcRC =
481     TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
482   if (!SrcRC)
483     return false;
484   unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
485                                                          DstSize / 32);
486   SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
487   if (!SrcRC)
488     return false;
489 
490   SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
491                                     *SrcRC, I.getOperand(1));
492   const DebugLoc &DL = I.getDebugLoc();
493   BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
494     .addReg(SrcReg, 0, SubReg);
495 
496   I.eraseFromParent();
497   return true;
498 }
499 
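// Select G_MERGE_VALUES as a REG_SEQUENCE over the source registers, using
// subregister indices sized to the source pieces. Sub-32-bit sources fall
// back to the imported patterns.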
500 bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
501   MachineBasicBlock *BB = MI.getParent();
502   Register DstReg = MI.getOperand(0).getReg();
503   LLT DstTy = MRI->getType(DstReg);
504   LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());
505 
506   const unsigned SrcSize = SrcTy.getSizeInBits();
507   if (SrcSize < 32)
508     return selectImpl(MI, *CoverageInfo);
509 
510   const DebugLoc &DL = MI.getDebugLoc();
511   const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
512   const unsigned DstSize = DstTy.getSizeInBits();
513   const TargetRegisterClass *DstRC =
514     TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
515   if (!DstRC)
516     return false;
517 
518   ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
519   MachineInstrBuilder MIB =
520     BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
521   for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
522     MachineOperand &Src = MI.getOperand(I + 1);
523     MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
524     MIB.addImm(SubRegs[I]);
525 
526     const TargetRegisterClass *SrcRC
527       = TRI.getConstrainedRegClassForOperand(Src, *MRI);
528     if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
529       return false;
530   }
531 
532   if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
533     return false;
534 
535   MI.eraseFromParent();
536   return true;
537 }
538 
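// Select G_UNMERGE_VALUES by copying each destination out of the matching
// subregister of the source.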
539 bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
540   MachineBasicBlock *BB = MI.getParent();
541   const int NumDst = MI.getNumOperands() - 1;
542 
543   MachineOperand &Src = MI.getOperand(NumDst);
544 
545   Register SrcReg = Src.getReg();
546   Register DstReg0 = MI.getOperand(0).getReg();
547   LLT DstTy = MRI->getType(DstReg0);
548   LLT SrcTy = MRI->getType(SrcReg);
549 
550   const unsigned DstSize = DstTy.getSizeInBits();
551   const unsigned SrcSize = SrcTy.getSizeInBits();
552   const DebugLoc &DL = MI.getDebugLoc();
553   const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
554 
555   const TargetRegisterClass *SrcRC =
556     TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
557   if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
558     return false;
559 
560   // Note we could have mixed SGPR and VGPR destination banks for an SGPR
561   // source, and this relies on the fact that the same subregister indices are
562   // used for both.
563   ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
564   for (int I = 0, E = NumDst; I != E; ++I) {
565     MachineOperand &Dst = MI.getOperand(I);
566     BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
567       .addReg(SrcReg, 0, SubRegs[I]);
568 
569     // Make sure the subregister index is valid for the source register.
570     SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
571     if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
572       return false;
573 
574     const TargetRegisterClass *DstRC =
575       TRI.getConstrainedRegClassForOperand(Dst, *MRI);
576     if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
577       return false;
578   }
579 
580   MI.eraseFromParent();
581   return true;
582 }
583 
584 bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC(
585   MachineInstr &MI) const {
586   if (selectImpl(MI, *CoverageInfo))
587     return true;
588 
589   const LLT S32 = LLT::scalar(32);
590   const LLT V2S16 = LLT::vector(2, 16);
591 
592   Register Dst = MI.getOperand(0).getReg();
593   if (MRI->getType(Dst) != V2S16)
594     return false;
595 
596   const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
597   if (DstBank->getID() != AMDGPU::SGPRRegBankID)
598     return false;
599 
600   Register Src0 = MI.getOperand(1).getReg();
601   Register Src1 = MI.getOperand(2).getReg();
602   if (MRI->getType(Src0) != S32)
603     return false;
604 
605   const DebugLoc &DL = MI.getDebugLoc();
606   MachineBasicBlock *BB = MI.getParent();
607 
608   auto ConstSrc1 =
609       getConstantVRegValWithLookThrough(Src1, *MRI, true, true, true);
610   if (ConstSrc1) {
611     auto ConstSrc0 =
612         getConstantVRegValWithLookThrough(Src0, *MRI, true, true, true);
613     if (ConstSrc0) {
614       const int64_t K0 = ConstSrc0->Value.getSExtValue();
615       const int64_t K1 = ConstSrc1->Value.getSExtValue();
616       uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
617       uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
618 
619       BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst)
620         .addImm(Lo16 | (Hi16 << 16));
621       MI.eraseFromParent();
622       return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
623     }
624   }
625 
626   // TODO: This should probably be a combine somewhere
627   // (build_vector_trunc $src0, undef) -> copy $src0
628   MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
629   if (Src1Def && Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
630     MI.setDesc(TII.get(AMDGPU::COPY));
631     MI.RemoveOperand(2);
632     return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI) &&
633            RBI.constrainGenericRegister(Src0, AMDGPU::SReg_32RegClass, *MRI);
634   }
635 
636   Register ShiftSrc0;
637   Register ShiftSrc1;
638 
639   // With multiple uses of the shift, this will duplicate the shift and
640   // increase register pressure.
641   //
642   // (build_vector_trunc (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16))
643   //  => (S_PACK_HH_B32_B16 $src0, $src1)
644   // (build_vector_trunc $src0, (lshr_oneuse SReg_32:$src1, 16))
645   //  => (S_PACK_LH_B32_B16 $src0, $src1)
646   // (build_vector_trunc $src0, $src1)
647   //  => (S_PACK_LL_B32_B16 $src0, $src1)
648 
649   bool Shift0 = mi_match(
650       Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16))));
651 
652   bool Shift1 = mi_match(
653       Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16))));
654 
655   unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
656   if (Shift0 && Shift1) {
657     Opc = AMDGPU::S_PACK_HH_B32_B16;
658     MI.getOperand(1).setReg(ShiftSrc0);
659     MI.getOperand(2).setReg(ShiftSrc1);
660   } else if (Shift1) {
661     Opc = AMDGPU::S_PACK_LH_B32_B16;
662     MI.getOperand(2).setReg(ShiftSrc1);
663   } else if (Shift0 && ConstSrc1 && ConstSrc1->Value == 0) {
664     // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
665     auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
666       .addReg(ShiftSrc0)
667       .addImm(16);
668 
669     MI.eraseFromParent();
670     return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
671   }
672 
673   MI.setDesc(TII.get(Opc));
674   return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
675 }
676 
677 bool AMDGPUInstructionSelector::selectG_PTR_ADD(MachineInstr &I) const {
678   return selectG_ADD_SUB(I);
679 }
680 
681 bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
682   const MachineOperand &MO = I.getOperand(0);
683 
684   // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
685   // regbank check here is to know why getConstrainedRegClassForOperand failed.
686   const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
687   if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
688       (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
689     I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
690     return true;
691   }
692 
693   return false;
694 }
695 
696 bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
697   MachineBasicBlock *BB = I.getParent();
698 
699   Register DstReg = I.getOperand(0).getReg();
700   Register Src0Reg = I.getOperand(1).getReg();
701   Register Src1Reg = I.getOperand(2).getReg();
702   LLT Src1Ty = MRI->getType(Src1Reg);
703 
704   unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
705   unsigned InsSize = Src1Ty.getSizeInBits();
706 
707   int64_t Offset = I.getOperand(3).getImm();
708 
709   // FIXME: These cases should have been illegal and unnecessary to check here.
710   if (Offset % 32 != 0 || InsSize % 32 != 0)
711     return false;
712 
713   // Currently not handled by getSubRegFromChannel.
714   if (InsSize > 128)
715     return false;
716 
717   unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
718   if (SubReg == AMDGPU::NoSubRegister)
719     return false;
720 
721   const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
722   const TargetRegisterClass *DstRC =
723     TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
724   if (!DstRC)
725     return false;
726 
727   const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
728   const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
729   const TargetRegisterClass *Src0RC =
730     TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank, *MRI);
731   const TargetRegisterClass *Src1RC =
732     TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank, *MRI);
733 
734   // Deal with weird cases where the class only partially supports the subreg
735   // index.
736   Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
737   if (!Src0RC || !Src1RC)
738     return false;
739 
740   if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
741       !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
742       !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
743     return false;
744 
745   const DebugLoc &DL = I.getDebugLoc();
746   BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
747     .addReg(Src0Reg)
748     .addReg(Src1Reg)
749     .addImm(SubReg);
750 
751   I.eraseFromParent();
752   return true;
753 }
754 
755 bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
756   if (STI.getLDSBankCount() != 16)
757     return selectImpl(MI, *CoverageInfo);
758 
759   Register Dst = MI.getOperand(0).getReg();
760   Register Src0 = MI.getOperand(2).getReg();
761   Register M0Val = MI.getOperand(6).getReg();
762   if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
763       !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
764       !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
765     return false;
766 
767   // This requires 2 instructions. It is possible to write a pattern to support
768   // this, but the generated isel emitter doesn't correctly deal with multiple
769   // output instructions using the same physical register input. The copy to m0
770   // is incorrectly placed before the second instruction.
771   //
772   // TODO: Match source modifiers.
773 
774   Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
775   const DebugLoc &DL = MI.getDebugLoc();
776   MachineBasicBlock *MBB = MI.getParent();
777 
778   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
779     .addReg(M0Val);
780   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
781     .addImm(2)
782     .addImm(MI.getOperand(4).getImm())  // $attr
783     .addImm(MI.getOperand(3).getImm()); // $attrchan
784 
785   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
786     .addImm(0)                          // $src0_modifiers
787     .addReg(Src0)                       // $src0
788     .addImm(MI.getOperand(4).getImm())  // $attr
789     .addImm(MI.getOperand(3).getImm())  // $attrchan
790     .addImm(0)                          // $src2_modifiers
791     .addReg(InterpMov)                  // $src2 - 2 f16 values selected by high
792     .addImm(MI.getOperand(5).getImm())  // $high
793     .addImm(0)                          // $clamp
794     .addImm(0);                         // $omod
795 
796   MI.eraseFromParent();
797   return true;
798 }
799 
800 // Writelane is special in that it can use SGPR and M0 (which would normally
801 // count as using the constant bus twice - but in this case it is allowed since
802 // the lane selector doesn't count as a use of the constant bus). However, it is
803 // still required to abide by the 1 SGPR rule. Fix this up if we might have
804 // multiple SGPRs.
805 bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
806   // With a constant bus limit of at least 2, there's no issue.
807   if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
808     return selectImpl(MI, *CoverageInfo);
809 
810   MachineBasicBlock *MBB = MI.getParent();
811   const DebugLoc &DL = MI.getDebugLoc();
812   Register VDst = MI.getOperand(0).getReg();
813   Register Val = MI.getOperand(2).getReg();
814   Register LaneSelect = MI.getOperand(3).getReg();
815   Register VDstIn = MI.getOperand(4).getReg();
816 
817   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);
818 
819   Optional<ValueAndVReg> ConstSelect =
820     getConstantVRegValWithLookThrough(LaneSelect, *MRI, true, true);
821   if (ConstSelect) {
822     // The selector has to be an inline immediate, so we can use whatever for
823     // the other operands.
824     MIB.addReg(Val);
825     MIB.addImm(ConstSelect->Value.getSExtValue() &
826                maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
827   } else {
828     Optional<ValueAndVReg> ConstVal =
829       getConstantVRegValWithLookThrough(Val, *MRI, true, true);
830 
831     // If the value written is an inline immediate, we can get away without a
832     // copy to m0.
833     if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
834                                                  STI.hasInv2PiInlineImm())) {
835       MIB.addImm(ConstVal->Value.getSExtValue());
836       MIB.addReg(LaneSelect);
837     } else {
838       MIB.addReg(Val);
839 
840       // If the lane selector was originally in a VGPR and copied with
841       // readfirstlane, there's a hazard to read the same SGPR from the
842       // VALU. Constrain to a different SGPR to help avoid needing a nop later.
843       RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);
844 
845       BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
846         .addReg(LaneSelect);
847       MIB.addReg(AMDGPU::M0);
848     }
849   }
850 
851   MIB.addReg(VDstIn);
852 
853   MI.eraseFromParent();
854   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
855 }
856 
857 // We need to handle this here because tablegen doesn't support matching
858 // instructions with multiple outputs.
859 bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
860   Register Dst0 = MI.getOperand(0).getReg();
861   Register Dst1 = MI.getOperand(1).getReg();
862 
863   LLT Ty = MRI->getType(Dst0);
864   unsigned Opc;
865   if (Ty == LLT::scalar(32))
866     Opc = AMDGPU::V_DIV_SCALE_F32_e64;
867   else if (Ty == LLT::scalar(64))
868     Opc = AMDGPU::V_DIV_SCALE_F64_e64;
869   else
870     return false;
871 
872   // TODO: Match source modifiers.
873 
874   const DebugLoc &DL = MI.getDebugLoc();
875   MachineBasicBlock *MBB = MI.getParent();
876 
877   Register Numer = MI.getOperand(3).getReg();
878   Register Denom = MI.getOperand(4).getReg();
879   unsigned ChooseDenom = MI.getOperand(5).getImm();
880 
881   Register Src0 = ChooseDenom != 0 ? Numer : Denom;
882 
883   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
884     .addDef(Dst1)
885     .addImm(0)     // $src0_modifiers
886     .addUse(Src0)  // $src0
887     .addImm(0)     // $src1_modifiers
888     .addUse(Denom) // $src1
889     .addImm(0)     // $src2_modifiers
890     .addUse(Numer) // $src2
891     .addImm(0)     // $clamp
892     .addImm(0);    // $omod
893 
894   MI.eraseFromParent();
895   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
896 }
897 
898 bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
899   unsigned IntrinsicID = I.getIntrinsicID();
900   switch (IntrinsicID) {
901   case Intrinsic::amdgcn_if_break: {
902     MachineBasicBlock *BB = I.getParent();
903 
904     // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
905     // SelectionDAG uses for wave32 vs wave64.
906     BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
907       .add(I.getOperand(0))
908       .add(I.getOperand(2))
909       .add(I.getOperand(3));
910 
911     Register DstReg = I.getOperand(0).getReg();
912     Register Src0Reg = I.getOperand(2).getReg();
913     Register Src1Reg = I.getOperand(3).getReg();
914 
915     I.eraseFromParent();
916 
917     for (Register Reg : { DstReg, Src0Reg, Src1Reg })
918       MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
919 
920     return true;
921   }
922   case Intrinsic::amdgcn_interp_p1_f16:
923     return selectInterpP1F16(I);
924   case Intrinsic::amdgcn_wqm:
925     return constrainCopyLikeIntrin(I, AMDGPU::WQM);
926   case Intrinsic::amdgcn_softwqm:
927     return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
928   case Intrinsic::amdgcn_wwm:
929     return constrainCopyLikeIntrin(I, AMDGPU::WWM);
930   case Intrinsic::amdgcn_writelane:
931     return selectWritelane(I);
932   case Intrinsic::amdgcn_div_scale:
933     return selectDivScale(I);
934   case Intrinsic::amdgcn_icmp:
935     return selectIntrinsicIcmp(I);
936   case Intrinsic::amdgcn_ballot:
937     return selectBallot(I);
938   case Intrinsic::amdgcn_reloc_constant:
939     return selectRelocConstant(I);
940   case Intrinsic::amdgcn_groupstaticsize:
941     return selectGroupStaticSize(I);
942   case Intrinsic::returnaddress:
943     return selectReturnAddress(I);
944   default:
945     return selectImpl(I, *CoverageInfo);
946   }
947 }
948 
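// Map an integer condition code to the VALU compare that writes a wave-size
// mask. Returns -1 for sizes other than 32 or 64.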
949 static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size) {
950   if (Size != 32 && Size != 64)
951     return -1;
952   switch (P) {
953   default:
954     llvm_unreachable("Unknown condition code!");
955   case CmpInst::ICMP_NE:
956     return Size == 32 ? AMDGPU::V_CMP_NE_U32_e64 : AMDGPU::V_CMP_NE_U64_e64;
957   case CmpInst::ICMP_EQ:
958     return Size == 32 ? AMDGPU::V_CMP_EQ_U32_e64 : AMDGPU::V_CMP_EQ_U64_e64;
959   case CmpInst::ICMP_SGT:
960     return Size == 32 ? AMDGPU::V_CMP_GT_I32_e64 : AMDGPU::V_CMP_GT_I64_e64;
961   case CmpInst::ICMP_SGE:
962     return Size == 32 ? AMDGPU::V_CMP_GE_I32_e64 : AMDGPU::V_CMP_GE_I64_e64;
963   case CmpInst::ICMP_SLT:
964     return Size == 32 ? AMDGPU::V_CMP_LT_I32_e64 : AMDGPU::V_CMP_LT_I64_e64;
965   case CmpInst::ICMP_SLE:
966     return Size == 32 ? AMDGPU::V_CMP_LE_I32_e64 : AMDGPU::V_CMP_LE_I64_e64;
967   case CmpInst::ICMP_UGT:
968     return Size == 32 ? AMDGPU::V_CMP_GT_U32_e64 : AMDGPU::V_CMP_GT_U64_e64;
969   case CmpInst::ICMP_UGE:
970     return Size == 32 ? AMDGPU::V_CMP_GE_U32_e64 : AMDGPU::V_CMP_GE_U64_e64;
971   case CmpInst::ICMP_ULT:
972     return Size == 32 ? AMDGPU::V_CMP_LT_U32_e64 : AMDGPU::V_CMP_LT_U64_e64;
973   case CmpInst::ICMP_ULE:
974     return Size == 32 ? AMDGPU::V_CMP_LE_U32_e64 : AMDGPU::V_CMP_LE_U64_e64;
975   }
976 }
977 
978 int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
979                                               unsigned Size) const {
980   if (Size == 64) {
981     if (!STI.hasScalarCompareEq64())
982       return -1;
983 
984     switch (P) {
985     case CmpInst::ICMP_NE:
986       return AMDGPU::S_CMP_LG_U64;
987     case CmpInst::ICMP_EQ:
988       return AMDGPU::S_CMP_EQ_U64;
989     default:
990       return -1;
991     }
992   }
993 
994   if (Size != 32)
995     return -1;
996 
997   switch (P) {
998   case CmpInst::ICMP_NE:
999     return AMDGPU::S_CMP_LG_U32;
1000   case CmpInst::ICMP_EQ:
1001     return AMDGPU::S_CMP_EQ_U32;
1002   case CmpInst::ICMP_SGT:
1003     return AMDGPU::S_CMP_GT_I32;
1004   case CmpInst::ICMP_SGE:
1005     return AMDGPU::S_CMP_GE_I32;
1006   case CmpInst::ICMP_SLT:
1007     return AMDGPU::S_CMP_LT_I32;
1008   case CmpInst::ICMP_SLE:
1009     return AMDGPU::S_CMP_LE_I32;
1010   case CmpInst::ICMP_UGT:
1011     return AMDGPU::S_CMP_GT_U32;
1012   case CmpInst::ICMP_UGE:
1013     return AMDGPU::S_CMP_GE_U32;
1014   case CmpInst::ICMP_ULT:
1015     return AMDGPU::S_CMP_LT_U32;
1016   case CmpInst::ICMP_ULE:
1017     return AMDGPU::S_CMP_LE_U32;
1018   default:
1019     llvm_unreachable("Unknown condition code!");
1020   }
1021 }
1022 
1023 bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const {
1024   MachineBasicBlock *BB = I.getParent();
1025   const DebugLoc &DL = I.getDebugLoc();
1026 
1027   Register SrcReg = I.getOperand(2).getReg();
1028   unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1029 
1030   auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
1031 
1032   Register CCReg = I.getOperand(0).getReg();
1033   if (!isVCC(CCReg, *MRI)) {
1034     int Opcode = getS_CMPOpcode(Pred, Size);
1035     if (Opcode == -1)
1036       return false;
1037     MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
1038             .add(I.getOperand(2))
1039             .add(I.getOperand(3));
1040     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
1041       .addReg(AMDGPU::SCC);
1042     bool Ret =
1043         constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
1044         RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
1045     I.eraseFromParent();
1046     return Ret;
1047   }
1048 
1049   int Opcode = getV_CMPOpcode(Pred, Size);
1050   if (Opcode == -1)
1051     return false;
1052 
1053   MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode),
1054             I.getOperand(0).getReg())
1055             .add(I.getOperand(2))
1056             .add(I.getOperand(3));
1057   RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
1058                                *TRI.getBoolRC(), *MRI);
1059   bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1060   I.eraseFromParent();
1061   return Ret;
1062 }
1063 
1064 bool AMDGPUInstructionSelector::selectIntrinsicIcmp(MachineInstr &I) const {
1065   Register Dst = I.getOperand(0).getReg();
1066   if (isVCC(Dst, *MRI))
1067     return false;
1068 
1069   if (MRI->getType(Dst).getSizeInBits() != STI.getWavefrontSize())
1070     return false;
1071 
1072   MachineBasicBlock *BB = I.getParent();
1073   const DebugLoc &DL = I.getDebugLoc();
1074   Register SrcReg = I.getOperand(2).getReg();
1075   unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1076   auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
1077 
1078   int Opcode = getV_CMPOpcode(Pred, Size);
1079   if (Opcode == -1)
1080     return false;
1081 
1082   MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst)
1083                            .add(I.getOperand(2))
1084                            .add(I.getOperand(3));
1085   RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(), *TRI.getBoolRC(),
1086                                *MRI);
1087   bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1088   I.eraseFromParent();
1089   return Ret;
1090 }
1091 
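// Select llvm.amdgcn.ballot. A constant-zero argument folds to 0, a
// constant-true argument folds to a copy of exec, and anything else simply
// copies the incoming mask register to the result.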
1092 bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1093   MachineBasicBlock *BB = I.getParent();
1094   const DebugLoc &DL = I.getDebugLoc();
1095   Register DstReg = I.getOperand(0).getReg();
1096   const unsigned Size = MRI->getType(DstReg).getSizeInBits();
1097   const bool Is64 = Size == 64;
1098 
1099   if (Size != STI.getWavefrontSize())
1100     return false;
1101 
1102   Optional<ValueAndVReg> Arg =
1103       getConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI, true);
1104 
1105   if (Arg.hasValue()) {
1106     const int64_t Value = Arg.getValue().Value.getSExtValue();
1107     if (Value == 0) {
1108       unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1109       BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
1110     } else if (Value == -1) { // all ones
1111       Register SrcReg = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
1112       BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
1113     } else
1114       return false;
1115   } else {
1116     Register SrcReg = I.getOperand(2).getReg();
1117     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
1118   }
1119 
1120   I.eraseFromParent();
1121   return true;
1122 }
1123 
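// Select llvm.amdgcn.reloc.constant: materialize the named symbol as an
// absolute low-32-bit relocation with a scalar or vector mov, depending on
// the destination bank.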
1124 bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1125   Register DstReg = I.getOperand(0).getReg();
1126   const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1127   const TargetRegisterClass *DstRC =
1128     TRI.getRegClassForSizeOnBank(32, *DstBank, *MRI);
1129   if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
1130     return false;
1131 
1132   const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1133 
1134   Module *M = MF->getFunction().getParent();
1135   const MDNode *Metadata = I.getOperand(2).getMetadata();
1136   auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
1137   auto RelocSymbol = cast<GlobalVariable>(
1138     M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
1139 
1140   MachineBasicBlock *BB = I.getParent();
1141   BuildMI(*BB, &I, I.getDebugLoc(),
1142           TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
1143     .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);
1144 
1145   I.eraseFromParent();
1146   return true;
1147 }
1148 
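// Select llvm.amdgcn.groupstaticsize. On AMDHSA and AMDPAL the LDS size is
// already known, so it is emitted as an immediate; otherwise it is emitted as
// an absolute 32-bit relocation against the intrinsic declaration, to be
// resolved once the LDS size is known.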
1149 bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
1150   Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();
1151 
1152   Register DstReg = I.getOperand(0).getReg();
1153   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1154   unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
1155     AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1156 
1157   MachineBasicBlock *MBB = I.getParent();
1158   const DebugLoc &DL = I.getDebugLoc();
1159 
1160   auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg);
1161 
1162   if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
1163     const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1164     MIB.addImm(MFI->getLDSSize());
1165   } else {
1166     Module *M = MF->getFunction().getParent();
1167     const GlobalValue *GV
1168       = Intrinsic::getDeclaration(M, Intrinsic::amdgcn_groupstaticsize);
1169     MIB.addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
1170   }
1171 
1172   I.eraseFromParent();
1173   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1174 }
1175 
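// Select llvm.returnaddress. Entry functions and nonzero depths have no
// return address, so produce 0; otherwise copy the live-in return address
// register.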
1176 bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
1177   MachineBasicBlock *MBB = I.getParent();
1178   MachineFunction &MF = *MBB->getParent();
1179   const DebugLoc &DL = I.getDebugLoc();
1180 
1181   MachineOperand &Dst = I.getOperand(0);
1182   Register DstReg = Dst.getReg();
1183   unsigned Depth = I.getOperand(2).getImm();
1184 
1185   const TargetRegisterClass *RC
1186     = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
1187   if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
1188       !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
1189     return false;
1190 
1191   // Check for kernel and shader functions
1192   if (Depth != 0 ||
1193       MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1194     BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
1195       .addImm(0);
1196     I.eraseFromParent();
1197     return true;
1198   }
1199 
1200   MachineFrameInfo &MFI = MF.getFrameInfo();
1201   // There is a call to @llvm.returnaddress in this function
1202   MFI.setReturnAddressIsTaken(true);
1203 
1204   // Get the return address reg and mark it as an implicit live-in
1205   Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
1206   Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
1207                                              AMDGPU::SReg_64RegClass);
1208   BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1209     .addReg(LiveIn);
1210   I.eraseFromParent();
1211   return true;
1212 }
1213 
1214 bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1215   // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1216   // SelectionDAG uses for wave32 vs wave64.
1217   MachineBasicBlock *BB = MI.getParent();
1218   BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1219       .add(MI.getOperand(1));
1220 
1221   Register Reg = MI.getOperand(1).getReg();
1222   MI.eraseFromParent();
1223 
1224   if (!MRI->getRegClassOrNull(Reg))
1225     MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1226   return true;
1227 }
1228 
1229 bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1230   MachineInstr &MI, Intrinsic::ID IntrID) const {
1231   MachineBasicBlock *MBB = MI.getParent();
1232   MachineFunction *MF = MBB->getParent();
1233   const DebugLoc &DL = MI.getDebugLoc();
1234 
1235   unsigned IndexOperand = MI.getOperand(7).getImm();
1236   bool WaveRelease = MI.getOperand(8).getImm() != 0;
1237   bool WaveDone = MI.getOperand(9).getImm() != 0;
1238 
1239   if (WaveDone && !WaveRelease)
1240     report_fatal_error("ds_ordered_count: wave_done requires wave_release");
1241 
1242   unsigned OrderedCountIndex = IndexOperand & 0x3f;
1243   IndexOperand &= ~0x3f;
1244   unsigned CountDw = 0;
1245 
1246   if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
1247     CountDw = (IndexOperand >> 24) & 0xf;
1248     IndexOperand &= ~(0xf << 24);
1249 
1250     if (CountDw < 1 || CountDw > 4) {
1251       report_fatal_error(
1252         "ds_ordered_count: dword count must be between 1 and 4");
1253     }
1254   }
1255 
1256   if (IndexOperand)
1257     report_fatal_error("ds_ordered_count: bad index operand");
1258 
1259   unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1260   unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);
1261 
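  // Pack the DS_ORDERED_COUNT offset field: offset0 is the ordered-count
  // index scaled to bytes; offset1 carries wave_release, wave_done, the
  // shader type, the instruction (add vs. swap) and, on GFX10+, the dword
  // count.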
1262   unsigned Offset0 = OrderedCountIndex << 2;
1263   unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) |
1264                      (Instruction << 4);
1265 
1266   if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
1267     Offset1 |= (CountDw - 1) << 6;
1268 
1269   unsigned Offset = Offset0 | (Offset1 << 8);
1270 
1271   Register M0Val = MI.getOperand(2).getReg();
1272   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1273     .addReg(M0Val);
1274 
1275   Register DstReg = MI.getOperand(0).getReg();
1276   Register ValReg = MI.getOperand(3).getReg();
1277   MachineInstrBuilder DS =
1278     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
1279       .addReg(ValReg)
1280       .addImm(Offset)
1281       .cloneMemRefs(MI);
1282 
1283   if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
1284     return false;
1285 
1286   bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
1287   MI.eraseFromParent();
1288   return Ret;
1289 }
1290 
1291 static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1292   switch (IntrID) {
1293   case Intrinsic::amdgcn_ds_gws_init:
1294     return AMDGPU::DS_GWS_INIT;
1295   case Intrinsic::amdgcn_ds_gws_barrier:
1296     return AMDGPU::DS_GWS_BARRIER;
1297   case Intrinsic::amdgcn_ds_gws_sema_v:
1298     return AMDGPU::DS_GWS_SEMA_V;
1299   case Intrinsic::amdgcn_ds_gws_sema_br:
1300     return AMDGPU::DS_GWS_SEMA_BR;
1301   case Intrinsic::amdgcn_ds_gws_sema_p:
1302     return AMDGPU::DS_GWS_SEMA_P;
1303   case Intrinsic::amdgcn_ds_gws_sema_release_all:
1304     return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1305   default:
1306     llvm_unreachable("not a gws intrinsic");
1307   }
1308 }
1309 
1310 bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
1311                                                      Intrinsic::ID IID) const {
1312   if (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1313       !STI.hasGWSSemaReleaseAll())
1314     return false;
1315 
1316   // intrinsic ID, vsrc, offset
1317   const bool HasVSrc = MI.getNumOperands() == 3;
1318   assert(HasVSrc || MI.getNumOperands() == 2);
1319 
1320   Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
1321   const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
1322   if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
1323     return false;
1324 
1325   MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1326   assert(OffsetDef);
1327 
1328   unsigned ImmOffset;
1329 
1330   MachineBasicBlock *MBB = MI.getParent();
1331   const DebugLoc &DL = MI.getDebugLoc();
1332 
1333   MachineInstr *Readfirstlane = nullptr;
1334 
1335   // If we legalized the VGPR input, strip out the readfirstlane to analyze the
1336   // incoming offset, in case there's an add of a constant. We'll have to put it
1337   // back later.
1338   if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
1339     Readfirstlane = OffsetDef;
1340     BaseOffset = OffsetDef->getOperand(1).getReg();
1341     OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1342   }
1343 
1344   if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
1345     // If we have a constant offset, try to use the 0 in m0 as the base.
1346     // TODO: Look into changing the default m0 initialization value. If the
1347     // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
1348     // the immediate offset.
1349 
1350     ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
1351     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1352       .addImm(0);
1353   } else {
1354     std::tie(BaseOffset, ImmOffset) =
1355         AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset);
1356 
1357     if (Readfirstlane) {
1358       // We have the constant offset now, so put the readfirstlane back on the
1359       // variable component.
1360       if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
1361         return false;
1362 
1363       Readfirstlane->getOperand(1).setReg(BaseOffset);
1364       BaseOffset = Readfirstlane->getOperand(0).getReg();
1365     } else {
1366       if (!RBI.constrainGenericRegister(BaseOffset,
1367                                         AMDGPU::SReg_32RegClass, *MRI))
1368         return false;
1369     }
1370 
1371     Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1372     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
1373       .addReg(BaseOffset)
1374       .addImm(16);
1375 
1376     BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1377       .addReg(M0Base);
1378   }
1379 
1380   // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
1381   // offset field) % 64. Some versions of the programming guide omit the m0
1382   // part, or claim it's from offset 0.
1383   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));
1384 
1385   if (HasVSrc) {
1386     Register VSrc = MI.getOperand(1).getReg();
1387     MIB.addReg(VSrc);
1388     if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
1389       return false;
1390   }
1391 
1392   MIB.addImm(ImmOffset)
1393      .cloneMemRefs(MI);
1394 
1395   MI.eraseFromParent();
1396   return true;
1397 }
1398 
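// Select ds_append/ds_consume. The pointer base is copied into m0, and a
// legal constant offset, if any, is folded into the instruction's offset
// field.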
1399 bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
1400                                                       bool IsAppend) const {
1401   Register PtrBase = MI.getOperand(2).getReg();
1402   LLT PtrTy = MRI->getType(PtrBase);
1403   bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
1404 
1405   unsigned Offset;
1406   std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
1407 
1408   // TODO: Should this try to look through readfirstlane like GWS?
1409   if (!isDSOffsetLegal(PtrBase, Offset)) {
1410     PtrBase = MI.getOperand(2).getReg();
1411     Offset = 0;
1412   }
1413 
1414   MachineBasicBlock *MBB = MI.getParent();
1415   const DebugLoc &DL = MI.getDebugLoc();
1416   const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
1417 
1418   BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1419     .addReg(PtrBase);
1420   if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
1421     return false;
1422 
1423   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
1424     .addImm(Offset)
1425     .addImm(IsGDS ? -1 : 0)
1426     .cloneMemRefs(MI);
1427   MI.eraseFromParent();
1428   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1429 }
1430 
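// Select s_barrier. When optimizing and the whole workgroup fits in a single
// wave, the barrier is unnecessary and is relaxed to a WAVE_BARRIER, which
// should only constrain scheduling.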
1431 bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
1432   if (TM.getOptLevel() > CodeGenOpt::None) {
1433     unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second;
1434     if (WGSize <= STI.getWavefrontSize()) {
1435       MachineBasicBlock *MBB = MI.getParent();
1436       const DebugLoc &DL = MI.getDebugLoc();
1437       BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::WAVE_BARRIER));
1438       MI.eraseFromParent();
1439       return true;
1440     }
1441   }
1442   return selectImpl(MI, *CoverageInfo);
1443 }
1444 
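// Decode the texfailctrl immediate into the TFE and LWE bits, noting whether
// any tex-fail handling was requested. Returns false if unknown bits are set.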
1445 static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
1446                          bool &IsTexFail) {
1447   if (TexFailCtrl)
1448     IsTexFail = true;
1449 
1450   TFE = (TexFailCtrl & 0x1) ? 1 : 0;
1451   TexFailCtrl &= ~(uint64_t)0x1;
1452   LWE = (TexFailCtrl & 0x2) ? 1 : 0;
1453   TexFailCtrl &= ~(uint64_t)0x2;
1454 
1455   return TexFailCtrl == 0;
1456 }
1457 
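// Decode the cachepolicy immediate into the requested GLC/SLC/DLC/SCC bits.
// Returns false if any unhandled bits remain set.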
1458 static bool parseCachePolicy(uint64_t Value,
1459                              bool *GLC, bool *SLC, bool *DLC, bool *SCC) {
1460   if (GLC) {
1461     *GLC = (Value & 0x1) ? 1 : 0;
1462     Value &= ~(uint64_t)0x1;
1463   }
1464   if (SLC) {
1465     *SLC = (Value & 0x2) ? 1 : 0;
1466     Value &= ~(uint64_t)0x2;
1467   }
1468   if (DLC) {
1469     *DLC = (Value & 0x4) ? 1 : 0;
1470     Value &= ~(uint64_t)0x4;
1471   }
1472   if (SCC) {
1473     *SCC = (Value & 0x10) ? 1 : 0;
1474     Value &= ~(uint64_t)0x10;
1475   }
1476 
1477   return Value == 0;
1478 }
1479 
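// Select an image (MIMG) intrinsic: derive the dmask, the data and address
// dword counts, and the D16/A16/G16 flags from the preprocessed intrinsic
// operands, then pick the MIMG opcode for the subtarget's encoding.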
1480 bool AMDGPUInstructionSelector::selectImageIntrinsic(
1481   MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
1482   MachineBasicBlock *MBB = MI.getParent();
1483   const DebugLoc &DL = MI.getDebugLoc();
1484 
1485   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
1486     AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1487 
1488   const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
1489   const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
1490       AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
1491   const AMDGPU::MIMGMIPMappingInfo *MIPMappingInfo =
1492       AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode);
1493   unsigned IntrOpcode = Intr->BaseOpcode;
1494   const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
1495 
1496   const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;
1497 
1498   Register VDataIn, VDataOut;
1499   LLT VDataTy;
1500   int NumVDataDwords = -1;
1501   bool IsD16 = false;
1502 
1503   bool Unorm;
1504   if (!BaseOpcode->Sampler)
1505     Unorm = true;
1506   else
1507     Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0;
1508 
1509   bool TFE;
1510   bool LWE;
1511   bool IsTexFail = false;
1512   if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(),
1513                     TFE, LWE, IsTexFail))
1514     return false;
1515 
1516   const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm();
1517   const bool IsA16 = (Flags & 1) != 0;
1518   const bool IsG16 = (Flags & 2) != 0;
1519 
  // A16 implies 16-bit gradients.
1521   if (IsA16 && !IsG16)
1522     return false;
1523 
1524   unsigned DMask = 0;
1525   unsigned DMaskLanes = 0;
1526 
1527   if (BaseOpcode->Atomic) {
1528     VDataOut = MI.getOperand(0).getReg();
1529     VDataIn = MI.getOperand(2).getReg();
1530     LLT Ty = MRI->getType(VDataIn);
1531 
1532     // Be careful to allow atomic swap on 16-bit element vectors.
1533     const bool Is64Bit = BaseOpcode->AtomicX2 ?
1534       Ty.getSizeInBits() == 128 :
1535       Ty.getSizeInBits() == 64;
1536 
1537     if (BaseOpcode->AtomicX2) {
1538       assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
1539 
1540       DMask = Is64Bit ? 0xf : 0x3;
1541       NumVDataDwords = Is64Bit ? 4 : 2;
1542     } else {
1543       DMask = Is64Bit ? 0x3 : 0x1;
1544       NumVDataDwords = Is64Bit ? 2 : 1;
1545     }
1546   } else {
1547     DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
1548     DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);
1549 
1550     // One memoperand is mandatory, except for getresinfo.
1551     // FIXME: Check this in verifier.
1552     if (!MI.memoperands_empty()) {
1553       const MachineMemOperand *MMO = *MI.memoperands_begin();
1554 
1555       // Infer d16 from the memory size, as the register type will be mangled by
1556       // unpacked subtargets, or by TFE.
1557       IsD16 = ((8 * MMO->getSize()) / DMaskLanes) < 32;
1558     }
1559 
1560     if (BaseOpcode->Store) {
1561       VDataIn = MI.getOperand(1).getReg();
1562       VDataTy = MRI->getType(VDataIn);
1563       NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
1564     } else {
1565       VDataOut = MI.getOperand(0).getReg();
1566       VDataTy = MRI->getType(VDataOut);
1567       NumVDataDwords = DMaskLanes;
1568 
1569       if (IsD16 && !STI.hasUnpackedD16VMem())
1570         NumVDataDwords = (DMaskLanes + 1) / 2;
1571     }
1572   }
1573 
1574   // Optimize _L to _LZ when _L is zero
1575   if (LZMappingInfo) {
1576     // The legalizer replaced the register with an immediate 0 if we need to
1577     // change the opcode.
1578     const MachineOperand &Lod = MI.getOperand(ArgOffset + Intr->LodIndex);
1579     if (Lod.isImm()) {
1580       assert(Lod.getImm() == 0);
1581       IntrOpcode = LZMappingInfo->LZ;  // set new opcode to _lz variant of _l
1582     }
1583   }
1584 
  // Optimize _mip away when 'lod' is zero.
1586   if (MIPMappingInfo) {
1587     const MachineOperand &Lod = MI.getOperand(ArgOffset + Intr->MipIndex);
1588     if (Lod.isImm()) {
1589       assert(Lod.getImm() == 0);
1590       IntrOpcode = MIPMappingInfo->NONMIP;  // set new opcode to variant without _mip
1591     }
1592   }
1593 
1594   // Set G16 opcode
1595   if (IsG16 && !IsA16) {
1596     const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
1597         AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
1598     assert(G16MappingInfo);
1599     IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
1600   }
1601 
1602   // TODO: Check this in verifier.
1603   assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
1604 
1605   bool GLC = false;
1606   bool SLC = false;
1607   bool DLC = false;
1608   bool SCC = false;
1609   if (BaseOpcode->Atomic) {
1610     GLC = true; // TODO no-return optimization
1611     if (!parseCachePolicy(
1612             MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm(), nullptr,
1613             &SLC, IsGFX10Plus ? &DLC : nullptr, &SCC))
1614       return false;
1615   } else {
1616     if (!parseCachePolicy(
1617             MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm(), &GLC,
1618             &SLC, IsGFX10Plus ? &DLC : nullptr, &SCC))
1619       return false;
1620   }
1621 
1622   int NumVAddrRegs = 0;
1623   int NumVAddrDwords = 0;
1624   for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
1625     // Skip the $noregs and 0s inserted during legalization.
1626     MachineOperand &AddrOp = MI.getOperand(ArgOffset + I);
1627     if (!AddrOp.isReg())
1628       continue; // XXX - Break?
1629 
1630     Register Addr = AddrOp.getReg();
1631     if (!Addr)
1632       break;
1633 
1634     ++NumVAddrRegs;
1635     NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
1636   }
1637 
  // The legalizer preprocessed the intrinsic arguments. If we aren't using
  // NSA, these should have been packed into a single value in the first
  // address register.
1641   const bool UseNSA = NumVAddrRegs != 1 && NumVAddrDwords == NumVAddrRegs;
1642   if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
1643     LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
1644     return false;
1645   }
1646 
1647   if (IsTexFail)
1648     ++NumVDataDwords;
1649 
1650   int Opcode = -1;
1651   if (IsGFX10Plus) {
1652     Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
1653                                    UseNSA ? AMDGPU::MIMGEncGfx10NSA
1654                                           : AMDGPU::MIMGEncGfx10Default,
1655                                    NumVDataDwords, NumVAddrDwords);
1656   } else {
1657     if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
1658       Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
1659                                      NumVDataDwords, NumVAddrDwords);
1660     if (Opcode == -1)
1661       Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
1662                                      NumVDataDwords, NumVAddrDwords);
1663   }
1664   assert(Opcode != -1);
1665 
1666   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
1667     .cloneMemRefs(MI);
1668 
1669   if (VDataOut) {
1670     if (BaseOpcode->AtomicX2) {
1671       const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
1672 
1673       Register TmpReg = MRI->createVirtualRegister(
1674         Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
1675       unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
1676 
1677       MIB.addDef(TmpReg);
1678       BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
1679         .addReg(TmpReg, RegState::Kill, SubReg);
1680 
1681     } else {
1682       MIB.addDef(VDataOut); // vdata output
1683     }
1684   }
1685 
1686   if (VDataIn)
1687     MIB.addReg(VDataIn); // vdata input
1688 
1689   for (int I = 0; I != NumVAddrRegs; ++I) {
1690     MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I);
1691     if (SrcOp.isReg()) {
1692       assert(SrcOp.getReg() != 0);
1693       MIB.addReg(SrcOp.getReg());
1694     }
1695   }
1696 
1697   MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg());
1698   if (BaseOpcode->Sampler)
1699     MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg());
1700 
1701   MIB.addImm(DMask); // dmask
1702 
1703   if (IsGFX10Plus)
1704     MIB.addImm(DimInfo->Encoding);
1705   MIB.addImm(Unorm);
1706   if (IsGFX10Plus)
1707     MIB.addImm(DLC);
1708   else
1709     MIB.addImm(SCC);
1710 
1711   MIB.addImm(GLC);
1712   MIB.addImm(SLC);
1713   MIB.addImm(IsA16 &&  // a16 or r128
1714              STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
1715   if (IsGFX10Plus)
1716     MIB.addImm(IsA16 ? -1 : 0);
1717 
1718   MIB.addImm(TFE); // tfe
1719   MIB.addImm(LWE); // lwe
1720   if (!IsGFX10Plus)
1721     MIB.addImm(DimInfo->DA ? -1 : 0);
1722   if (BaseOpcode->HasD16)
1723     MIB.addImm(IsD16 ? -1 : 0);
1724 
1725   MI.eraseFromParent();
1726   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1727 }
1728 
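// Dispatch side-effecting intrinsics that need manual selection; anything not
// handled here falls through to the generated selector.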
1729 bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
1730     MachineInstr &I) const {
1731   unsigned IntrinsicID = I.getIntrinsicID();
1732   switch (IntrinsicID) {
1733   case Intrinsic::amdgcn_end_cf:
1734     return selectEndCfIntrinsic(I);
1735   case Intrinsic::amdgcn_ds_ordered_add:
1736   case Intrinsic::amdgcn_ds_ordered_swap:
1737     return selectDSOrderedIntrinsic(I, IntrinsicID);
1738   case Intrinsic::amdgcn_ds_gws_init:
1739   case Intrinsic::amdgcn_ds_gws_barrier:
1740   case Intrinsic::amdgcn_ds_gws_sema_v:
1741   case Intrinsic::amdgcn_ds_gws_sema_br:
1742   case Intrinsic::amdgcn_ds_gws_sema_p:
1743   case Intrinsic::amdgcn_ds_gws_sema_release_all:
1744     return selectDSGWSIntrinsic(I, IntrinsicID);
1745   case Intrinsic::amdgcn_ds_append:
1746     return selectDSAppendConsume(I, true);
1747   case Intrinsic::amdgcn_ds_consume:
1748     return selectDSAppendConsume(I, false);
1749   case Intrinsic::amdgcn_s_barrier:
1750     return selectSBarrier(I);
1751   case Intrinsic::amdgcn_global_atomic_fadd:
1752     return selectGlobalAtomicFaddIntrinsic(I);
1753   default: {
1754     return selectImpl(I, *CoverageInfo);
1755   }
1756   }
1757 }
1758 
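// Select G_SELECT. Scalar conditions are copied into SCC and use
// S_CSELECT_B32/B64; VCC conditions use V_CNDMASK_B32 and must already have
// been split to 32 bits.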
1759 bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
1760   if (selectImpl(I, *CoverageInfo))
1761     return true;
1762 
1763   MachineBasicBlock *BB = I.getParent();
1764   const DebugLoc &DL = I.getDebugLoc();
1765 
1766   Register DstReg = I.getOperand(0).getReg();
1767   unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
1768   assert(Size <= 32 || Size == 64);
1769   const MachineOperand &CCOp = I.getOperand(1);
1770   Register CCReg = CCOp.getReg();
1771   if (!isVCC(CCReg, *MRI)) {
1772     unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
1773                                          AMDGPU::S_CSELECT_B32;
1774     MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
1775             .addReg(CCReg);
1776 
    // The generic constrainSelectedInstRegOperands doesn't work for the scc
    // register bank, because it does not cover the register class we use to
    // represent it. So we need to manually set the register class here.
1780     if (!MRI->getRegClassOrNull(CCReg))
1781         MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
1782     MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
1783             .add(I.getOperand(2))
1784             .add(I.getOperand(3));
1785 
1786     bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI) |
1787                constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
1788     I.eraseFromParent();
1789     return Ret;
1790   }
1791 
1792   // Wide VGPR select should have been split in RegBankSelect.
1793   if (Size > 32)
1794     return false;
1795 
1796   MachineInstr *Select =
1797       BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1798               .addImm(0)
1799               .add(I.getOperand(3))
1800               .addImm(0)
1801               .add(I.getOperand(2))
1802               .add(I.getOperand(1));
1803 
1804   bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
1805   I.eraseFromParent();
1806   return Ret;
1807 }
1808 
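// Return the subregister index covering the low Size bits of a wider
// register, or -1 if no suitable index exists.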
1809 static int sizeToSubRegIndex(unsigned Size) {
1810   switch (Size) {
1811   case 32:
1812     return AMDGPU::sub0;
1813   case 64:
1814     return AMDGPU::sub0_sub1;
1815   case 96:
1816     return AMDGPU::sub0_sub1_sub2;
1817   case 128:
1818     return AMDGPU::sub0_sub1_sub2_sub3;
1819   case 256:
1820     return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
1821   default:
1822     if (Size < 32)
1823       return AMDGPU::sub0;
1824     if (Size > 256)
1825       return -1;
1826     return sizeToSubRegIndex(PowerOf2Ceil(Size));
1827   }
1828 }
1829 
1830 bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
1831   Register DstReg = I.getOperand(0).getReg();
1832   Register SrcReg = I.getOperand(1).getReg();
1833   const LLT DstTy = MRI->getType(DstReg);
1834   const LLT SrcTy = MRI->getType(SrcReg);
1835   const LLT S1 = LLT::scalar(1);
1836 
1837   const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
1838   const RegisterBank *DstRB;
1839   if (DstTy == S1) {
1840     // This is a special case. We don't treat s1 for legalization artifacts as
1841     // vcc booleans.
1842     DstRB = SrcRB;
1843   } else {
1844     DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1845     if (SrcRB != DstRB)
1846       return false;
1847   }
1848 
1849   const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
1850 
1851   unsigned DstSize = DstTy.getSizeInBits();
1852   unsigned SrcSize = SrcTy.getSizeInBits();
1853 
1854   const TargetRegisterClass *SrcRC
1855     = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, *MRI);
1856   const TargetRegisterClass *DstRC
1857     = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, *MRI);
1858   if (!SrcRC || !DstRC)
1859     return false;
1860 
1861   if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
1862       !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
1863     LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
1864     return false;
1865   }
1866 
1867   if (DstTy == LLT::vector(2, 16) && SrcTy == LLT::vector(2, 32)) {
1868     MachineBasicBlock *MBB = I.getParent();
1869     const DebugLoc &DL = I.getDebugLoc();
1870 
1871     Register LoReg = MRI->createVirtualRegister(DstRC);
1872     Register HiReg = MRI->createVirtualRegister(DstRC);
1873     BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
1874       .addReg(SrcReg, 0, AMDGPU::sub0);
1875     BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
1876       .addReg(SrcReg, 0, AMDGPU::sub1);
1877 
1878     if (IsVALU && STI.hasSDWA()) {
      // Write the low 16 bits of the high element into the high 16 bits of
      // the low element.
1881       MachineInstr *MovSDWA =
1882         BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
1883         .addImm(0)                             // $src0_modifiers
1884         .addReg(HiReg)                         // $src0
1885         .addImm(0)                             // $clamp
1886         .addImm(AMDGPU::SDWA::WORD_1)          // $dst_sel
1887         .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
1888         .addImm(AMDGPU::SDWA::WORD_0)          // $src0_sel
1889         .addReg(LoReg, RegState::Implicit);
1890       MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
1891     } else {
1892       Register TmpReg0 = MRI->createVirtualRegister(DstRC);
1893       Register TmpReg1 = MRI->createVirtualRegister(DstRC);
1894       Register ImmReg = MRI->createVirtualRegister(DstRC);
1895       if (IsVALU) {
1896         BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
1897           .addImm(16)
1898           .addReg(HiReg);
1899       } else {
1900         BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
1901           .addReg(HiReg)
1902           .addImm(16);
1903       }
1904 
1905       unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
1906       unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
1907       unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
1908 
1909       BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
1910         .addImm(0xffff);
1911       BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
1912         .addReg(LoReg)
1913         .addReg(ImmReg);
1914       BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
1915         .addReg(TmpReg0)
1916         .addReg(TmpReg1);
1917     }
1918 
1919     I.eraseFromParent();
1920     return true;
1921   }
1922 
1923   if (!DstTy.isScalar())
1924     return false;
1925 
1926   if (SrcSize > 32) {
1927     int SubRegIdx = sizeToSubRegIndex(DstSize);
1928     if (SubRegIdx == -1)
1929       return false;
1930 
1931     // Deal with weird cases where the class only partially supports the subreg
1932     // index.
1933     const TargetRegisterClass *SrcWithSubRC
1934       = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
1935     if (!SrcWithSubRC)
1936       return false;
1937 
1938     if (SrcWithSubRC != SrcRC) {
1939       if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
1940         return false;
1941     }
1942 
1943     I.getOperand(1).setSubReg(SubRegIdx);
1944   }
1945 
1946   I.setDesc(TII.get(TargetOpcode::COPY));
1947   return true;
1948 }
1949 
1950 /// \returns true if a bitmask for \p Size bits will be an inline immediate.
1951 static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
1952   Mask = maskTrailingOnes<unsigned>(Size);
1953   int SignedMask = static_cast<int>(Mask);
1954   return SignedMask >= -16 && SignedMask <= 64;
1955 }
1956 
1957 // Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
1958 const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
1959   Register Reg, const MachineRegisterInfo &MRI,
1960   const TargetRegisterInfo &TRI) const {
1961   const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
1962   if (auto *RB = RegClassOrBank.dyn_cast<const RegisterBank *>())
1963     return RB;
1964 
1965   // Ignore the type, since we don't use vcc in artifacts.
1966   if (auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>())
1967     return &RBI.getRegBankFromRegClass(*RC, LLT());
1968   return nullptr;
1969 }
1970 
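// Select G_SEXT/G_ZEXT/G_ANYEXT/G_SEXT_INREG, using BFE, sext, or and-mask
// instructions on the destination's register bank.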
1971 bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
1972   bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
1973   bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
1974   const DebugLoc &DL = I.getDebugLoc();
1975   MachineBasicBlock &MBB = *I.getParent();
1976   const Register DstReg = I.getOperand(0).getReg();
1977   const Register SrcReg = I.getOperand(1).getReg();
1978 
1979   const LLT DstTy = MRI->getType(DstReg);
1980   const LLT SrcTy = MRI->getType(SrcReg);
1981   const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
1982     I.getOperand(2).getImm() : SrcTy.getSizeInBits();
1983   const unsigned DstSize = DstTy.getSizeInBits();
1984   if (!DstTy.isScalar())
1985     return false;
1986 
1987   // Artifact casts should never use vcc.
1988   const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
1989 
1990   // FIXME: This should probably be illegal and split earlier.
1991   if (I.getOpcode() == AMDGPU::G_ANYEXT) {
1992     if (DstSize <= 32)
1993       return selectCOPY(I);
1994 
1995     const TargetRegisterClass *SrcRC =
1996         TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank, *MRI);
1997     const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1998     const TargetRegisterClass *DstRC =
1999         TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
2000 
2001     Register UndefReg = MRI->createVirtualRegister(SrcRC);
2002     BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2003     BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2004       .addReg(SrcReg)
2005       .addImm(AMDGPU::sub0)
2006       .addReg(UndefReg)
2007       .addImm(AMDGPU::sub1);
2008     I.eraseFromParent();
2009 
2010     return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
2011            RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
2012   }
2013 
2014   if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
2015     // 64-bit should have been split up in RegBankSelect
2016 
2017     // Try to use an and with a mask if it will save code size.
2018     unsigned Mask;
2019     if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2020       MachineInstr *ExtI =
2021       BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
2022         .addImm(Mask)
2023         .addReg(SrcReg);
2024       I.eraseFromParent();
2025       return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2026     }
2027 
2028     const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2029     MachineInstr *ExtI =
2030       BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
2031       .addReg(SrcReg)
2032       .addImm(0) // Offset
2033       .addImm(SrcSize); // Width
2034     I.eraseFromParent();
2035     return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2036   }
2037 
2038   if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
2039     const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
2040       AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
2041     if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
2042       return false;
2043 
2044     if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
2045       const unsigned SextOpc = SrcSize == 8 ?
2046         AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
2047       BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
2048         .addReg(SrcReg);
2049       I.eraseFromParent();
2050       return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2051     }
2052 
2053     const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
2054     const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2055 
    // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16] = width.
2057     if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
2058       // We need a 64-bit register source, but the high bits don't matter.
2059       Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
2060       Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2061       unsigned SubReg = InReg ? AMDGPU::sub0 : 0;
2062 
2063       BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2064       BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
2065         .addReg(SrcReg, 0, SubReg)
2066         .addImm(AMDGPU::sub0)
2067         .addReg(UndefReg)
2068         .addImm(AMDGPU::sub1);
2069 
2070       BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
2071         .addReg(ExtReg)
2072         .addImm(SrcSize << 16);
2073 
2074       I.eraseFromParent();
2075       return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
2076     }
2077 
2078     unsigned Mask;
2079     if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2080       BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
2081         .addReg(SrcReg)
2082         .addImm(Mask);
2083     } else {
2084       BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
2085         .addReg(SrcReg)
2086         .addImm(SrcSize << 16);
2087     }
2088 
2089     I.eraseFromParent();
2090     return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2091   }
2092 
2093   return false;
2094 }
2095 
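// Materialize a constant. 32-bit (and smaller) values become a single
// S_MOV/V_MOV; 64-bit values that aren't SGPR inline constants are split into
// two 32-bit moves combined with a REG_SEQUENCE.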
2096 bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
2097   MachineBasicBlock *BB = I.getParent();
2098   MachineOperand &ImmOp = I.getOperand(1);
2099   Register DstReg = I.getOperand(0).getReg();
2100   unsigned Size = MRI->getType(DstReg).getSizeInBits();
2101 
2102   // The AMDGPU backend only supports Imm operands and not CImm or FPImm.
2103   if (ImmOp.isFPImm()) {
2104     const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt();
2105     ImmOp.ChangeToImmediate(Imm.getZExtValue());
2106   } else if (ImmOp.isCImm()) {
2107     ImmOp.ChangeToImmediate(ImmOp.getCImm()->getSExtValue());
2108   } else {
2109     llvm_unreachable("Not supported by g_constants");
2110   }
2111 
2112   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2113   const bool IsSgpr = DstRB->getID() == AMDGPU::SGPRRegBankID;
2114 
2115   unsigned Opcode;
2116   if (DstRB->getID() == AMDGPU::VCCRegBankID) {
2117     Opcode = STI.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
2118   } else {
2119     Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
2120 
2121     // We should never produce s1 values on banks other than VCC. If the user of
2122     // this already constrained the register, we may incorrectly think it's VCC
2123     // if it wasn't originally.
2124     if (Size == 1)
2125       return false;
2126   }
2127 
2128   if (Size != 64) {
2129     I.setDesc(TII.get(Opcode));
2130     I.addImplicitDefUseOperands(*MF);
2131     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2132   }
2133 
2134   const DebugLoc &DL = I.getDebugLoc();
2135 
2136   APInt Imm(Size, I.getOperand(1).getImm());
2137 
2138   MachineInstr *ResInst;
2139   if (IsSgpr && TII.isInlineConstant(Imm)) {
2140     ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
2141       .addImm(I.getOperand(1).getImm());
2142   } else {
2143     const TargetRegisterClass *RC = IsSgpr ?
2144       &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass;
2145     Register LoReg = MRI->createVirtualRegister(RC);
2146     Register HiReg = MRI->createVirtualRegister(RC);
2147 
2148     BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg)
2149       .addImm(Imm.trunc(32).getZExtValue());
2150 
2151     BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg)
2152       .addImm(Imm.ashr(32).getZExtValue());
2153 
2154     ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2155       .addReg(LoReg)
2156       .addImm(AMDGPU::sub0)
2157       .addReg(HiReg)
2158       .addImm(AMDGPU::sub1);
2159   }
2160 
  // We can't call constrainSelectedInstRegOperands here, because it doesn't
  // work for target-independent opcodes.
2163   I.eraseFromParent();
2164   const TargetRegisterClass *DstRC =
2165     TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI);
2166   if (!DstRC)
2167     return true;
2168   return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI);
2169 }
2170 
2171 bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2172   // Only manually handle the f64 SGPR case.
2173   //
2174   // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2175   // the bit ops theoretically have a second result due to the implicit def of
2176   // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2177   // that is easy by disabling the check. The result works, but uses a
2178   // nonsensical sreg32orlds_and_sreg_1 regclass.
2179   //
  // The DAG emitter is more problematic, and incorrectly adds both results of
  // the S_XOR_B32 to the variadic REG_SEQUENCE operands.
2182 
2183   Register Dst = MI.getOperand(0).getReg();
2184   const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2185   if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2186       MRI->getType(Dst) != LLT::scalar(64))
2187     return false;
2188 
2189   Register Src = MI.getOperand(1).getReg();
2190   MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
2191   if (Fabs)
2192     Src = Fabs->getOperand(1).getReg();
2193 
2194   if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2195       !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2196     return false;
2197 
2198   MachineBasicBlock *BB = MI.getParent();
2199   const DebugLoc &DL = MI.getDebugLoc();
2200   Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2201   Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2202   Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2203   Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2204 
2205   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2206     .addReg(Src, 0, AMDGPU::sub0);
2207   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2208     .addReg(Src, 0, AMDGPU::sub1);
2209   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2210     .addImm(0x80000000);
2211 
2212   // Set or toggle sign bit.
2213   unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2214   BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
2215     .addReg(HiReg)
2216     .addReg(ConstReg);
2217   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2218     .addReg(LoReg)
2219     .addImm(AMDGPU::sub0)
2220     .addReg(OpReg)
2221     .addImm(AMDGPU::sub1);
2222   MI.eraseFromParent();
2223   return true;
2224 }
2225 
2226 // FIXME: This is a workaround for the same tablegen problems as G_FNEG
2227 bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2228   Register Dst = MI.getOperand(0).getReg();
2229   const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2230   if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2231       MRI->getType(Dst) != LLT::scalar(64))
2232     return false;
2233 
2234   Register Src = MI.getOperand(1).getReg();
2235   MachineBasicBlock *BB = MI.getParent();
2236   const DebugLoc &DL = MI.getDebugLoc();
2237   Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2238   Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2239   Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2240   Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2241 
2242   if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2243       !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2244     return false;
2245 
2246   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2247     .addReg(Src, 0, AMDGPU::sub0);
2248   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2249     .addReg(Src, 0, AMDGPU::sub1);
2250   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2251     .addImm(0x7fffffff);
2252 
2253   // Clear sign bit.
  // TODO: Should this use S_BITSET0_*?
2255   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
2256     .addReg(HiReg)
2257     .addReg(ConstReg);
2258   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2259     .addReg(LoReg)
2260     .addImm(AMDGPU::sub0)
2261     .addReg(OpReg)
2262     .addImm(AMDGPU::sub1);
2263 
2264   MI.eraseFromParent();
2265   return true;
2266 }
2267 
2268 static bool isConstant(const MachineInstr &MI) {
2269   return MI.getOpcode() == TargetOpcode::G_CONSTANT;
2270 }
2271 
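// Walk the chain of G_PTR_ADDs feeding a load's address, recording the
// constant offset and which operands are SGPRs vs. VGPRs at each step.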
2272 void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
2273     const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
2274 
2275   const MachineInstr *PtrMI = MRI.getUniqueVRegDef(Load.getOperand(1).getReg());
2276 
2277   assert(PtrMI);
2278 
2279   if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
2280     return;
2281 
2282   GEPInfo GEPInfo(*PtrMI);
2283 
2284   for (unsigned i = 1; i != 3; ++i) {
2285     const MachineOperand &GEPOp = PtrMI->getOperand(i);
2286     const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
2287     assert(OpDef);
2288     if (i == 2 && isConstant(*OpDef)) {
2289       // TODO: Could handle constant base + variable offset, but a combine
2290       // probably should have commuted it.
2291       assert(GEPInfo.Imm == 0);
2292       GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
2293       continue;
2294     }
2295     const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
2296     if (OpBank->getID() == AMDGPU::SGPRRegBankID)
2297       GEPInfo.SgprParts.push_back(GEPOp.getReg());
2298     else
2299       GEPInfo.VgprParts.push_back(GEPOp.getReg());
2300   }
2301 
2302   AddrInfo.push_back(GEPInfo);
2303   getAddrModeInfo(*PtrMI, MRI, AddrInfo);
2304 }
2305 
2306 bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
2307   return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
2308 }
2309 
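// Return true if the (single) memory operand of MI has an address known to be
// uniform: kernel arguments, constants, globals, 32-bit constant address
// space, or pointers marked !amdgpu.uniform.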
2310 bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
2311   if (!MI.hasOneMemOperand())
2312     return false;
2313 
2314   const MachineMemOperand *MMO = *MI.memoperands_begin();
2315   const Value *Ptr = MMO->getValue();
2316 
2317   // UndefValue means this is a load of a kernel input.  These are uniform.
2318   // Sometimes LDS instructions have constant pointers.
2319   // If Ptr is null, then that means this mem operand contains a
2320   // PseudoSourceValue like GOT.
2321   if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
2322       isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
2323     return true;
2324 
2325   if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
2326     return true;
2327 
2328   const Instruction *I = dyn_cast<Instruction>(Ptr);
2329   return I && I->getMetadata("amdgpu.uniform");
2330 }
2331 
2332 bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
2333   for (const GEPInfo &GEPInfo : AddrInfo) {
2334     if (!GEPInfo.VgprParts.empty())
2335       return true;
2336   }
2337   return false;
2338 }
2339 
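// For LDS/GDS accesses on subtargets that require it, initialize M0 to -1
// before selecting the memory instruction.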
2340 void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
2341   const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
2342   unsigned AS = PtrTy.getAddressSpace();
2343   if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
2344       STI.ldsRequiresM0Init()) {
2345     MachineBasicBlock *BB = I.getParent();
2346 
    // If DS instructions require M0 initialization, insert it before selecting.
2348     BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2349       .addImm(-1);
2350   }
2351 }
2352 
2353 bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
2354   MachineInstr &I) const {
2355   initM0(I);
2356   return selectImpl(I, *CoverageInfo);
2357 }
2358 
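// Select a buffer atomic cmpxchg manually using the MUBUF offset or addr64
// addressing forms; flat/global cases go through the generated patterns.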
2359 // TODO: No rtn optimization.
2360 bool AMDGPUInstructionSelector::selectG_AMDGPU_ATOMIC_CMPXCHG(
2361   MachineInstr &MI) const {
2362   Register PtrReg = MI.getOperand(1).getReg();
2363   const LLT PtrTy = MRI->getType(PtrReg);
2364   if (PtrTy.getAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
2365       STI.useFlatForGlobal())
2366     return selectImpl(MI, *CoverageInfo);
2367 
2368   Register DstReg = MI.getOperand(0).getReg();
2369   const LLT Ty = MRI->getType(DstReg);
2370   const bool Is64 = Ty.getSizeInBits() == 64;
2371   const unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
2372   Register TmpReg = MRI->createVirtualRegister(
2373     Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
2374 
2375   const DebugLoc &DL = MI.getDebugLoc();
2376   MachineBasicBlock *BB = MI.getParent();
2377 
2378   Register VAddr, RSrcReg, SOffset;
2379   int64_t Offset = 0;
2380 
2381   unsigned Opcode;
2382   if (selectMUBUFOffsetImpl(MI.getOperand(1), RSrcReg, SOffset, Offset)) {
2383     Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN :
2384                              AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN;
2385   } else if (selectMUBUFAddr64Impl(MI.getOperand(1), VAddr,
2386                                    RSrcReg, SOffset, Offset)) {
2387     Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN :
2388                     AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN;
2389   } else
2390     return selectImpl(MI, *CoverageInfo);
2391 
2392   auto MIB = BuildMI(*BB, &MI, DL, TII.get(Opcode), TmpReg)
2393     .addReg(MI.getOperand(2).getReg());
2394 
2395   if (VAddr)
2396     MIB.addReg(VAddr);
2397 
2398   MIB.addReg(RSrcReg);
2399   if (SOffset)
2400     MIB.addReg(SOffset);
2401   else
2402     MIB.addImm(0);
2403 
2404   MIB.addImm(Offset);
2405   MIB.addImm(1); // glc
2406   MIB.addImm(0); // slc
2407   MIB.cloneMemRefs(MI);
2408 
2409   BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), DstReg)
2410     .addReg(TmpReg, RegState::Kill, SubReg);
2411 
2412   MI.eraseFromParent();
2413 
2414   MRI->setRegClass(
2415     DstReg, Is64 ? &AMDGPU::VReg_64RegClass : &AMDGPU::VGPR_32RegClass);
2416   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2417 }
2418 
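// Select G_BRCOND into S_CBRANCH_SCC1 for scalar conditions or
// S_CBRANCH_VCCNZ for VCC conditions.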
2419 bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
2420   MachineBasicBlock *BB = I.getParent();
2421   MachineOperand &CondOp = I.getOperand(0);
2422   Register CondReg = CondOp.getReg();
2423   const DebugLoc &DL = I.getDebugLoc();
2424 
2425   unsigned BrOpcode;
2426   Register CondPhysReg;
2427   const TargetRegisterClass *ConstrainRC;
2428 
2429   // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
2430   // whether the branch is uniform when selecting the instruction. In
2431   // GlobalISel, we should push that decision into RegBankSelect. Assume for now
2432   // RegBankSelect knows what it's doing if the branch condition is scc, even
2433   // though it currently does not.
2434   if (!isVCC(CondReg, *MRI)) {
2435     if (MRI->getType(CondReg) != LLT::scalar(32))
2436       return false;
2437 
2438     CondPhysReg = AMDGPU::SCC;
2439     BrOpcode = AMDGPU::S_CBRANCH_SCC1;
2440     ConstrainRC = &AMDGPU::SReg_32RegClass;
2441   } else {
2442     // FIXME: Do we have to insert an and with exec here, like in SelectionDAG?
    // We sort of know, based on the register bank, that a VCC producer ands
    // inactive lanes with 0. What if there was a logical operation with vcc
2445     // producers in different blocks/with different exec masks?
2446     // FIXME: Should scc->vcc copies and with exec?
2447     CondPhysReg = TRI.getVCC();
2448     BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
2449     ConstrainRC = TRI.getBoolRC();
2450   }
2451 
2452   if (!MRI->getRegClassOrNull(CondReg))
2453     MRI->setRegClass(CondReg, ConstrainRC);
2454 
2455   BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
2456     .addReg(CondReg);
2457   BuildMI(*BB, &I, DL, TII.get(BrOpcode))
2458     .addMBB(I.getOperand(1).getMBB());
2459 
2460   I.eraseFromParent();
2461   return true;
2462 }
2463 
2464 bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
2465   MachineInstr &I) const {
2466   Register DstReg = I.getOperand(0).getReg();
2467   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2468   const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2469   I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
2470   if (IsVGPR)
2471     I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
2472 
2473   return RBI.constrainGenericRegister(
2474     DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
2475 }
2476 
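// Select G_PTRMASK as 32-bit ANDs, using known-ones information about the
// mask to skip the halves of a 64-bit pointer that are left unchanged.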
2477 bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
2478   Register DstReg = I.getOperand(0).getReg();
2479   Register SrcReg = I.getOperand(1).getReg();
2480   Register MaskReg = I.getOperand(2).getReg();
2481   LLT Ty = MRI->getType(DstReg);
2482   LLT MaskTy = MRI->getType(MaskReg);
2483 
2484   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2485   const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2486   const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
2487   const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2488   if (DstRB != SrcRB) // Should only happen for hand written MIR.
2489     return false;
2490 
2491   unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2492   const TargetRegisterClass &RegRC
2493     = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
2494 
2495   const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB,
2496                                                                   *MRI);
2497   const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB,
2498                                                                   *MRI);
2499   const TargetRegisterClass *MaskRC =
2500       TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB, *MRI);
2501 
2502   if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
2503       !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2504       !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
2505     return false;
2506 
2507   MachineBasicBlock *BB = I.getParent();
2508   const DebugLoc &DL = I.getDebugLoc();
2509   if (Ty.getSizeInBits() == 32) {
2510     assert(MaskTy.getSizeInBits() == 32 &&
2511            "ptrmask should have been narrowed during legalize");
2512 
2513     BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
2514       .addReg(SrcReg)
2515       .addReg(MaskReg);
2516     I.eraseFromParent();
2517     return true;
2518   }
2519 
2520   Register HiReg = MRI->createVirtualRegister(&RegRC);
2521   Register LoReg = MRI->createVirtualRegister(&RegRC);
2522 
2523   // Extract the subregisters from the source pointer.
2524   BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
2525     .addReg(SrcReg, 0, AMDGPU::sub0);
2526   BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
2527     .addReg(SrcReg, 0, AMDGPU::sub1);
2528 
2529   Register MaskedLo, MaskedHi;
2530 
2531   // Try to avoid emitting a bit operation when we only need to touch half of
2532   // the 64-bit pointer.
2533   APInt MaskOnes = KnownBits->getKnownOnes(MaskReg).zextOrSelf(64);
2534 
2535   const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
2536   const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
2537   if ((MaskOnes & MaskLo32) == MaskLo32) {
2538     // If all the bits in the low half are 1, we only need a copy for it.
2539     MaskedLo = LoReg;
2540   } else {
2541     // Extract the mask subregister and apply the and.
2542     Register MaskLo = MRI->createVirtualRegister(&RegRC);
2543     MaskedLo = MRI->createVirtualRegister(&RegRC);
2544 
2545     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
2546       .addReg(MaskReg, 0, AMDGPU::sub0);
2547     BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
2548       .addReg(LoReg)
2549       .addReg(MaskLo);
2550   }
2551 
2552   if ((MaskOnes & MaskHi32) == MaskHi32) {
2553     // If all the bits in the high half are 1, we only need a copy for it.
2554     MaskedHi = HiReg;
2555   } else {
2556     Register MaskHi = MRI->createVirtualRegister(&RegRC);
2557     MaskedHi = MRI->createVirtualRegister(&RegRC);
2558 
2559     BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
2560       .addReg(MaskReg, 0, AMDGPU::sub1);
2561     BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
2562       .addReg(HiReg)
2563       .addReg(MaskHi);
2564   }
2565 
2566   BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2567     .addReg(MaskedLo)
2568     .addImm(AMDGPU::sub0)
2569     .addReg(MaskedHi)
2570     .addImm(AMDGPU::sub1);
2571   I.eraseFromParent();
2572   return true;
2573 }
2574 
2575 /// Return the register to use for the index value, and the subregister to use
2576 /// for the indirectly accessed register.
2577 static std::pair<Register, unsigned>
2578 computeIndirectRegIndex(MachineRegisterInfo &MRI,
2579                         const SIRegisterInfo &TRI,
2580                         const TargetRegisterClass *SuperRC,
2581                         Register IdxReg,
2582                         unsigned EltSize) {
2583   Register IdxBaseReg;
2584   int Offset;
2585 
2586   std::tie(IdxBaseReg, Offset) = AMDGPU::getBaseWithConstantOffset(MRI, IdxReg);
2587   if (IdxBaseReg == AMDGPU::NoRegister) {
2588     // This will happen if the index is a known constant. This should ordinarily
2589     // be legalized out, but handle it as a register just in case.
2590     assert(Offset == 0);
2591     IdxBaseReg = IdxReg;
2592   }
2593 
2594   ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
2595 
2596   // Skip out of bounds offsets, or else we would end up using an undefined
2597   // register.
2598   if (static_cast<unsigned>(Offset) >= SubRegs.size())
2599     return std::make_pair(IdxReg, SubRegs[0]);
2600   return std::make_pair(IdxBaseReg, SubRegs[Offset]);
2601 }
2602 
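// Select a dynamically indexed G_EXTRACT_VECTOR_ELT using S_MOVRELS,
// V_MOVRELS, or the VGPR index mode pseudos, depending on the source register
// bank and subtarget.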
2603 bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
2604   MachineInstr &MI) const {
2605   Register DstReg = MI.getOperand(0).getReg();
2606   Register SrcReg = MI.getOperand(1).getReg();
2607   Register IdxReg = MI.getOperand(2).getReg();
2608 
2609   LLT DstTy = MRI->getType(DstReg);
2610   LLT SrcTy = MRI->getType(SrcReg);
2611 
2612   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2613   const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2614   const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
2615 
  // The index must be scalar. If it wasn't, RegBankSelect should have moved
  // this into a waterfall loop.
2618   if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
2619     return false;
2620 
2621   const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB,
2622                                                                   *MRI);
2623   const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(DstTy, *DstRB,
2624                                                                   *MRI);
2625   if (!SrcRC || !DstRC)
2626     return false;
2627   if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2628       !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
2629       !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
2630     return false;
2631 
2632   MachineBasicBlock *BB = MI.getParent();
2633   const DebugLoc &DL = MI.getDebugLoc();
2634   const bool Is64 = DstTy.getSizeInBits() == 64;
2635 
2636   unsigned SubReg;
2637   std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, SrcRC, IdxReg,
2638                                                      DstTy.getSizeInBits() / 8);
2639 
2640   if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
2641     if (DstTy.getSizeInBits() != 32 && !Is64)
2642       return false;
2643 
2644     BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2645       .addReg(IdxReg);
2646 
2647     unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
2648     BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
2649       .addReg(SrcReg, 0, SubReg)
2650       .addReg(SrcReg, RegState::Implicit);
2651     MI.eraseFromParent();
2652     return true;
2653   }
2654 
2655   if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
2656     return false;
2657 
2658   if (!STI.useVGPRIndexMode()) {
2659     BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2660       .addReg(IdxReg);
2661     BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
2662       .addReg(SrcReg, 0, SubReg)
2663       .addReg(SrcReg, RegState::Implicit);
2664     MI.eraseFromParent();
2665     return true;
2666   }
2667 
2668   const MCInstrDesc &GPRIDXDesc =
2669       TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*SrcRC), true);
2670   BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
2671       .addReg(SrcReg)
2672       .addReg(IdxReg)
2673       .addImm(SubReg);
2674 
2675   MI.eraseFromParent();
2676   return true;
2677 }
2678 
2679 // TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
2680 bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
2681   MachineInstr &MI) const {
2682   Register DstReg = MI.getOperand(0).getReg();
2683   Register VecReg = MI.getOperand(1).getReg();
2684   Register ValReg = MI.getOperand(2).getReg();
2685   Register IdxReg = MI.getOperand(3).getReg();
2686 
2687   LLT VecTy = MRI->getType(DstReg);
2688   LLT ValTy = MRI->getType(ValReg);
2689   unsigned VecSize = VecTy.getSizeInBits();
2690   unsigned ValSize = ValTy.getSizeInBits();
2691 
2692   const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
2693   const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
2694   const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
2695 
2696   assert(VecTy.getElementType() == ValTy);
2697 
  // The index must be scalar. If it wasn't, RegBankSelect should have moved
  // this into a waterfall loop.
2700   if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
2701     return false;
2702 
2703   const TargetRegisterClass *VecRC = TRI.getRegClassForTypeOnBank(VecTy, *VecRB,
2704                                                                   *MRI);
2705   const TargetRegisterClass *ValRC = TRI.getRegClassForTypeOnBank(ValTy, *ValRB,
2706                                                                   *MRI);
2707 
2708   if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
2709       !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
2710       !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
2711       !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
2712     return false;
2713 
2714   if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
2715     return false;
2716 
2717   unsigned SubReg;
2718   std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg,
2719                                                      ValSize / 8);
2720 
2721   const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
2722                          STI.useVGPRIndexMode();
2723 
2724   MachineBasicBlock *BB = MI.getParent();
2725   const DebugLoc &DL = MI.getDebugLoc();
2726 
2727   if (!IndexMode) {
2728     BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
2729       .addReg(IdxReg);
2730 
2731     const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo(
2732         VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID);
2733     BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
2734         .addReg(VecReg)
2735         .addReg(ValReg)
2736         .addImm(SubReg);
2737     MI.eraseFromParent();
2738     return true;
2739   }
2740 
2741   const MCInstrDesc &GPRIDXDesc =
2742       TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
2743   BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
2744       .addReg(VecReg)
2745       .addReg(ValReg)
2746       .addReg(IdxReg)
2747       .addImm(SubReg);
2748 
2749   MI.eraseFromParent();
2750   return true;
2751 }
2752 
2753 static bool isZeroOrUndef(int X) {
2754   return X == 0 || X == -1;
2755 }
2756 
2757 static bool isOneOrUndef(int X) {
2758   return X == 1 || X == -1;
2759 }
2760 
2761 static bool isZeroOrOneOrUndef(int X) {
2762   return X == 0 || X == 1 || X == -1;
2763 }
2764 
2765 // Normalize a VOP3P shuffle mask to refer to the low/high half of a single
2766 // 32-bit register.
2767 static Register normalizeVOP3PMask(int NewMask[2], Register Src0, Register Src1,
2768                                    ArrayRef<int> Mask) {
2769   NewMask[0] = Mask[0];
2770   NewMask[1] = Mask[1];
2771   if (isZeroOrOneOrUndef(Mask[0]) && isZeroOrOneOrUndef(Mask[1]))
2772     return Src0;
2773 
2774   assert(NewMask[0] == 2 || NewMask[0] == 3 || NewMask[0] == -1);
2775   assert(NewMask[1] == 2 || NewMask[1] == 3 || NewMask[1] == -1);
2776 
  // Shift the mask inputs to be 0/1.
2778   NewMask[0] = NewMask[0] == -1 ? -1 : NewMask[0] - 2;
2779   NewMask[1] = NewMask[1] == -1 ? -1 : NewMask[1] - 2;
2780   return Src1;
2781 }
2782 
2783 // This is only legal with VOP3P instructions as an aid to op_sel matching.
2784 bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR(
2785   MachineInstr &MI) const {
2786   Register DstReg = MI.getOperand(0).getReg();
2787   Register Src0Reg = MI.getOperand(1).getReg();
2788   Register Src1Reg = MI.getOperand(2).getReg();
2789   ArrayRef<int> ShufMask = MI.getOperand(3).getShuffleMask();
2790 
2791   const LLT V2S16 = LLT::vector(2, 16);
2792   if (MRI->getType(DstReg) != V2S16 || MRI->getType(Src0Reg) != V2S16)
2793     return false;
2794 
2795   if (!AMDGPU::isLegalVOP3PShuffleMask(ShufMask))
2796     return false;
2797 
2798   assert(ShufMask.size() == 2);
2799   assert(STI.hasSDWA() && "no target has VOP3P but not SDWA");
2800 
2801   MachineBasicBlock *MBB = MI.getParent();
2802   const DebugLoc &DL = MI.getDebugLoc();
2803 
2804   const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2805   const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2806   const TargetRegisterClass &RC = IsVALU ?
2807     AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
2808 
2809   // Handle the degenerate case which should have folded out.
2810   if (ShufMask[0] == -1 && ShufMask[1] == -1) {
2811     BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), DstReg);
2812 
2813     MI.eraseFromParent();
2814     return RBI.constrainGenericRegister(DstReg, RC, *MRI);
2815   }
2816 
2817   // A legal VOP3P mask only reads one of the sources.
2818   int Mask[2];
2819   Register SrcVec = normalizeVOP3PMask(Mask, Src0Reg, Src1Reg, ShufMask);
2820 
2821   if (!RBI.constrainGenericRegister(DstReg, RC, *MRI) ||
2822       !RBI.constrainGenericRegister(SrcVec, RC, *MRI))
2823     return false;
2824 
2825   // TODO: This also should have been folded out
2826   if (isZeroOrUndef(Mask[0]) && isOneOrUndef(Mask[1])) {
2827     BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), DstReg)
2828       .addReg(SrcVec);
2829 
2830     MI.eraseFromParent();
2831     return true;
2832   }
2833 
2834   if (Mask[0] == 1 && Mask[1] == -1) {
2835     if (IsVALU) {
2836       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
2837         .addImm(16)
2838         .addReg(SrcVec);
2839     } else {
2840       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
2841         .addReg(SrcVec)
2842         .addImm(16);
2843     }
2844   } else if (Mask[0] == -1 && Mask[1] == 0) {
2845     if (IsVALU) {
2846       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), DstReg)
2847         .addImm(16)
2848         .addReg(SrcVec);
2849     } else {
2850       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHL_B32), DstReg)
2851         .addReg(SrcVec)
2852         .addImm(16);
2853     }
2854   } else if (Mask[0] == 0 && Mask[1] == 0) {
2855     if (IsVALU) {
2856       // Write low half of the register into the high half.
2857       MachineInstr *MovSDWA =
2858         BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2859         .addImm(0)                             // $src0_modifiers
2860         .addReg(SrcVec)                        // $src0
2861         .addImm(0)                             // $clamp
2862         .addImm(AMDGPU::SDWA::WORD_1)          // $dst_sel
2863         .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2864         .addImm(AMDGPU::SDWA::WORD_0)          // $src0_sel
2865         .addReg(SrcVec, RegState::Implicit);
2866       MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2867     } else {
2868       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg)
2869         .addReg(SrcVec)
2870         .addReg(SrcVec);
2871     }
2872   } else if (Mask[0] == 1 && Mask[1] == 1) {
2873     if (IsVALU) {
2874       // Write high half of the register into the low half.
2875       MachineInstr *MovSDWA =
2876         BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2877         .addImm(0)                             // $src0_modifiers
2878         .addReg(SrcVec)                        // $src0
2879         .addImm(0)                             // $clamp
2880         .addImm(AMDGPU::SDWA::WORD_0)          // $dst_sel
2881         .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2882         .addImm(AMDGPU::SDWA::WORD_1)          // $src0_sel
2883         .addReg(SrcVec, RegState::Implicit);
2884       MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2885     } else {
2886       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_HH_B32_B16), DstReg)
2887         .addReg(SrcVec)
2888         .addReg(SrcVec);
2889     }
2890   } else if (Mask[0] == 1 && Mask[1] == 0) {
2891     if (IsVALU) {
2892       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_ALIGNBIT_B32_e64), DstReg)
2893         .addReg(SrcVec)
2894         .addReg(SrcVec)
2895         .addImm(16);
2896     } else {
2897       Register TmpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2898       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg)
2899         .addReg(SrcVec)
2900         .addImm(16);
2901       BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg)
2902         .addReg(TmpReg)
2903         .addReg(SrcVec);
2904     }
2905   } else
2906     llvm_unreachable("all shuffle masks should be handled");
2907 
2908   MI.eraseFromParent();
2909   return true;
2910 }
2911 
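// Select the no-return form of the buffer FP atomic add. The returning
// variants are rejected below with a diagnostic; subtargets with GFX90A
// instructions defer to the generated matcher instead.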
2912 bool AMDGPUInstructionSelector::selectAMDGPU_BUFFER_ATOMIC_FADD(
2913   MachineInstr &MI) const {
2914   if (STI.hasGFX90AInsts())
2915     return selectImpl(MI, *CoverageInfo);
2916 
2917   MachineBasicBlock *MBB = MI.getParent();
2918   const DebugLoc &DL = MI.getDebugLoc();
2919 
2920   if (!MRI->use_nodbg_empty(MI.getOperand(0).getReg())) {
2921     Function &F = MBB->getParent()->getFunction();
2922     DiagnosticInfoUnsupported
2923       NoFpRet(F, "return versions of fp atomics not supported",
2924               MI.getDebugLoc(), DS_Error);
2925     F.getContext().diagnose(NoFpRet);
2926     return false;
2927   }
2928 
2929   // FIXME: This is only needed because tablegen requires the number of dst
2930   // operands in the match and replace patterns to be the same. Otherwise these
2931   // patterns could be exported from the SDag path.
2932   MachineOperand &VDataIn = MI.getOperand(1);
2933   MachineOperand &VIndex = MI.getOperand(3);
2934   MachineOperand &VOffset = MI.getOperand(4);
2935   MachineOperand &SOffset = MI.getOperand(5);
2936   int16_t Offset = MI.getOperand(6).getImm();
2937 
2938   bool HasVOffset = !isOperandImmEqual(VOffset, 0, *MRI);
2939   bool HasVIndex = !isOperandImmEqual(VIndex, 0, *MRI);
2940 
2941   unsigned Opcode;
2942   if (HasVOffset) {
2943     Opcode = HasVIndex ? AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN
2944                        : AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFEN;
2945   } else {
2946     Opcode = HasVIndex ? AMDGPU::BUFFER_ATOMIC_ADD_F32_IDXEN
2947                        : AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFSET;
2948   }
2949 
2950   if (MRI->getType(VDataIn.getReg()).isVector()) {
2951     switch (Opcode) {
2952     case AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN:
2953       Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_BOTHEN;
2954       break;
2955     case AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFEN:
2956       Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_OFFEN;
2957       break;
2958     case AMDGPU::BUFFER_ATOMIC_ADD_F32_IDXEN:
2959       Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_IDXEN;
2960       break;
2961     case AMDGPU::BUFFER_ATOMIC_ADD_F32_OFFSET:
2962       Opcode = AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_OFFSET;
2963       break;
2964     }
2965   }
2966 
2967   auto I = BuildMI(*MBB, MI, DL, TII.get(Opcode));
2968   I.add(VDataIn);
2969 
2970   if (Opcode == AMDGPU::BUFFER_ATOMIC_ADD_F32_BOTHEN ||
2971       Opcode == AMDGPU::BUFFER_ATOMIC_PK_ADD_F16_BOTHEN) {
2972     Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
2973     BuildMI(*MBB, &*I, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
2974       .addReg(VIndex.getReg())
2975       .addImm(AMDGPU::sub0)
2976       .addReg(VOffset.getReg())
2977       .addImm(AMDGPU::sub1);
2978 
2979     I.addReg(IdxReg);
2980   } else if (HasVIndex) {
2981     I.add(VIndex);
2982   } else if (HasVOffset) {
2983     I.add(VOffset);
2984   }
2985 
2986   I.add(MI.getOperand(2)); // rsrc
2987   I.add(SOffset);
2988   I.addImm(Offset);
2989   renderExtractSLC(I, MI, 7);
2990   I.cloneMemRefs(MI);
2991 
2992   MI.eraseFromParent();
2993 
2994   return true;
2995 }
2996 
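// Manually select the no-return global FP atomic add. As with the buffer
// form above, the returning variants are diagnosed as unsupported, and
// GFX90A-capable subtargets use the generated patterns.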
2997 bool AMDGPUInstructionSelector::selectGlobalAtomicFaddIntrinsic(
2998   MachineInstr &MI) const {
2999 
3000   if (STI.hasGFX90AInsts())
3001     return selectImpl(MI, *CoverageInfo);
3002 
3003   MachineBasicBlock *MBB = MI.getParent();
3004   const DebugLoc &DL = MI.getDebugLoc();
3005 
3006   if (!MRI->use_nodbg_empty(MI.getOperand(0).getReg())) {
3007     Function &F = MBB->getParent()->getFunction();
3008     DiagnosticInfoUnsupported
3009       NoFpRet(F, "return versions of fp atomics not supported",
3010               MI.getDebugLoc(), DS_Error);
3011     F.getContext().diagnose(NoFpRet);
3012     return false;
3013   }
3014 
3015   // FIXME: This is only needed because tablegen requires the number of dst
3016   // operands in the match and replace patterns to be the same. Otherwise these
3017   // patterns could be exported from the SDag path.
3018   auto Addr = selectFlatOffsetImpl<true>(MI.getOperand(2));
3019 
3020   Register Data = MI.getOperand(3).getReg();
3021   const unsigned Opc = MRI->getType(Data).isVector() ?
3022     AMDGPU::GLOBAL_ATOMIC_PK_ADD_F16 : AMDGPU::GLOBAL_ATOMIC_ADD_F32;
3023   auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
3024     .addReg(Addr.first)
3025     .addReg(Data)
3026     .addImm(Addr.second)
3027     .addImm(0) // SLC
3028     .addImm(0) // SCCB
3029     .cloneMemRefs(MI);
3030 
3031   MI.eraseFromParent();
3032   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3033 }
3034 
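// The BVH intersect_ray pseudo carries the target opcode to use as an
// immediate operand (operand 1), so selection rewrites the descriptor in
// place and re-adds the implicit operands for the new opcode.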
3035 bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const {
3036   MI.setDesc(TII.get(MI.getOperand(1).getImm()));
3037   MI.RemoveOperand(1);
3038   MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3039   return true;
3040 }
3041 
3042 bool AMDGPUInstructionSelector::select(MachineInstr &I) {
3043   if (I.isPHI())
3044     return selectPHI(I);
3045 
3046   if (!I.isPreISelOpcode()) {
3047     if (I.isCopy())
3048       return selectCOPY(I);
3049     return true;
3050   }
3051 
3052   switch (I.getOpcode()) {
3053   case TargetOpcode::G_AND:
3054   case TargetOpcode::G_OR:
3055   case TargetOpcode::G_XOR:
3056     if (selectImpl(I, *CoverageInfo))
3057       return true;
3058     return selectG_AND_OR_XOR(I);
3059   case TargetOpcode::G_ADD:
3060   case TargetOpcode::G_SUB:
3061     if (selectImpl(I, *CoverageInfo))
3062       return true;
3063     return selectG_ADD_SUB(I);
3064   case TargetOpcode::G_UADDO:
3065   case TargetOpcode::G_USUBO:
3066   case TargetOpcode::G_UADDE:
3067   case TargetOpcode::G_USUBE:
3068     return selectG_UADDO_USUBO_UADDE_USUBE(I);
3069   case TargetOpcode::G_INTTOPTR:
3070   case TargetOpcode::G_BITCAST:
3071   case TargetOpcode::G_PTRTOINT:
3072     return selectCOPY(I);
3073   case TargetOpcode::G_CONSTANT:
3074   case TargetOpcode::G_FCONSTANT:
3075     return selectG_CONSTANT(I);
3076   case TargetOpcode::G_FNEG:
3077     if (selectImpl(I, *CoverageInfo))
3078       return true;
3079     return selectG_FNEG(I);
3080   case TargetOpcode::G_FABS:
3081     if (selectImpl(I, *CoverageInfo))
3082       return true;
3083     return selectG_FABS(I);
3084   case TargetOpcode::G_EXTRACT:
3085     return selectG_EXTRACT(I);
3086   case TargetOpcode::G_MERGE_VALUES:
3087   case TargetOpcode::G_BUILD_VECTOR:
3088   case TargetOpcode::G_CONCAT_VECTORS:
3089     return selectG_MERGE_VALUES(I);
3090   case TargetOpcode::G_UNMERGE_VALUES:
3091     return selectG_UNMERGE_VALUES(I);
3092   case TargetOpcode::G_BUILD_VECTOR_TRUNC:
3093     return selectG_BUILD_VECTOR_TRUNC(I);
3094   case TargetOpcode::G_PTR_ADD:
3095     return selectG_PTR_ADD(I);
3096   case TargetOpcode::G_IMPLICIT_DEF:
3097     return selectG_IMPLICIT_DEF(I);
3098   case TargetOpcode::G_FREEZE:
3099     return selectCOPY(I);
3100   case TargetOpcode::G_INSERT:
3101     return selectG_INSERT(I);
3102   case TargetOpcode::G_INTRINSIC:
3103     return selectG_INTRINSIC(I);
3104   case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
3105     return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
3106   case TargetOpcode::G_ICMP:
3107     if (selectG_ICMP(I))
3108       return true;
3109     return selectImpl(I, *CoverageInfo);
3110   case TargetOpcode::G_LOAD:
3111   case TargetOpcode::G_STORE:
3112   case TargetOpcode::G_ATOMIC_CMPXCHG:
3113   case TargetOpcode::G_ATOMICRMW_XCHG:
3114   case TargetOpcode::G_ATOMICRMW_ADD:
3115   case TargetOpcode::G_ATOMICRMW_SUB:
3116   case TargetOpcode::G_ATOMICRMW_AND:
3117   case TargetOpcode::G_ATOMICRMW_OR:
3118   case TargetOpcode::G_ATOMICRMW_XOR:
3119   case TargetOpcode::G_ATOMICRMW_MIN:
3120   case TargetOpcode::G_ATOMICRMW_MAX:
3121   case TargetOpcode::G_ATOMICRMW_UMIN:
3122   case TargetOpcode::G_ATOMICRMW_UMAX:
3123   case TargetOpcode::G_ATOMICRMW_FADD:
3124   case AMDGPU::G_AMDGPU_ATOMIC_INC:
3125   case AMDGPU::G_AMDGPU_ATOMIC_DEC:
3126   case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
3127   case AMDGPU::G_AMDGPU_ATOMIC_FMAX:
3128     return selectG_LOAD_STORE_ATOMICRMW(I);
3129   case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
3130     return selectG_AMDGPU_ATOMIC_CMPXCHG(I);
3131   case TargetOpcode::G_SELECT:
3132     return selectG_SELECT(I);
3133   case TargetOpcode::G_TRUNC:
3134     return selectG_TRUNC(I);
3135   case TargetOpcode::G_SEXT:
3136   case TargetOpcode::G_ZEXT:
3137   case TargetOpcode::G_ANYEXT:
3138   case TargetOpcode::G_SEXT_INREG:
3139     if (selectImpl(I, *CoverageInfo))
3140       return true;
3141     return selectG_SZA_EXT(I);
3142   case TargetOpcode::G_BRCOND:
3143     return selectG_BRCOND(I);
3144   case TargetOpcode::G_GLOBAL_VALUE:
3145     return selectG_GLOBAL_VALUE(I);
3146   case TargetOpcode::G_PTRMASK:
3147     return selectG_PTRMASK(I);
3148   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3149     return selectG_EXTRACT_VECTOR_ELT(I);
3150   case TargetOpcode::G_INSERT_VECTOR_ELT:
3151     return selectG_INSERT_VECTOR_ELT(I);
3152   case TargetOpcode::G_SHUFFLE_VECTOR:
3153     return selectG_SHUFFLE_VECTOR(I);
3154   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3155   case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
3156     const AMDGPU::ImageDimIntrinsicInfo *Intr
3157       = AMDGPU::getImageDimIntrinsicInfo(I.getIntrinsicID());
3158     assert(Intr && "not an image intrinsic with image pseudo");
3159     return selectImageIntrinsic(I, Intr);
3160   }
3161   case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY:
3162     return selectBVHIntrinsic(I);
3163   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
3164     return selectAMDGPU_BUFFER_ATOMIC_FADD(I);
3165   default:
3166     return selectImpl(I, *CoverageInfo);
3167   }
3168   return false;
3169 }
3170 
3171 InstructionSelector::ComplexRendererFns
3172 AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
3173   return {{
3174       [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
3175   }};
3177 }
3178 
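// Peel source modifiers (fneg, and optionally fabs) off of \p Root and return
// the underlying source register together with the accumulated SISrcMods bits.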
3179 std::pair<Register, unsigned>
3180 AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root,
3181                                               bool AllowAbs) const {
3182   Register Src = Root.getReg();
3183   Register OrigSrc = Src;
3184   unsigned Mods = 0;
3185   MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
3186 
3187   if (MI && MI->getOpcode() == AMDGPU::G_FNEG) {
3188     Src = MI->getOperand(1).getReg();
3189     Mods |= SISrcMods::NEG;
3190     MI = getDefIgnoringCopies(Src, *MRI);
3191   }
3192 
3193   if (AllowAbs && MI && MI->getOpcode() == AMDGPU::G_FABS) {
3194     Src = MI->getOperand(1).getReg();
3195     Mods |= SISrcMods::ABS;
3196   }
3197 
3198   if (Mods != 0 &&
3199       RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
3200     MachineInstr *UseMI = Root.getParent();
3201 
3202     // If we looked through copies to find source modifiers on an SGPR operand,
3203     // we now have an SGPR register source. To avoid potentially violating the
3204     // constant bus restriction, we need to insert a copy to a VGPR.
3205     Register VGPRSrc = MRI->cloneVirtualRegister(OrigSrc);
3206     BuildMI(*UseMI->getParent(), UseMI, UseMI->getDebugLoc(),
3207             TII.get(AMDGPU::COPY), VGPRSrc)
3208       .addReg(Src);
3209     Src = VGPRSrc;
3210   }
3211 
3212   return std::make_pair(Src, Mods);
3213 }
3214 
3215 ///
3216 /// This will select either an SGPR or VGPR operand and will save us from
3217 /// having to write an extra tablegen pattern.
3218 InstructionSelector::ComplexRendererFns
3219 AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
3220   return {{
3221       [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
3222   }};
3223 }
3224 
3225 InstructionSelector::ComplexRendererFns
3226 AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
3227   Register Src;
3228   unsigned Mods;
3229   std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3230 
3231   return {{
3232       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3233       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
3234       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },    // clamp
3235       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }     // omod
3236   }};
3237 }
3238 
3239 InstructionSelector::ComplexRendererFns
3240 AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
3241   Register Src;
3242   unsigned Mods;
3243   std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /* AllowAbs */ false);
3244 
3245   return {{
3246       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3247       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
3248       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },    // clamp
3249       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }     // omod
3250   }};
3251 }
3252 
3253 InstructionSelector::ComplexRendererFns
3254 AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
3255   return {{
3256       [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
3257       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3258       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // omod
3259   }};
3260 }
3261 
3262 InstructionSelector::ComplexRendererFns
3263 AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
3264   Register Src;
3265   unsigned Mods;
3266   std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3267 
3268   return {{
3269       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3270       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }  // src_mods
3271   }};
3272 }
3273 
3274 InstructionSelector::ComplexRendererFns
3275 AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
3276   Register Src;
3277   unsigned Mods;
3278   std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /* AllowAbs */ false);
3279 
3280   return {{
3281       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3282       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3283   }};
3284 }
3285 
3286 InstructionSelector::ComplexRendererFns
3287 AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
3288   Register Reg = Root.getReg();
3289   const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3290   if (Def && (Def->getOpcode() == AMDGPU::G_FNEG ||
3291               Def->getOpcode() == AMDGPU::G_FABS))
3292     return {};
3293   return {{
3294       [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3295   }};
3296 }
3297 
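// Compute VOP3P (packed) source modifiers for \p Src. Only a whole-vector
// G_FNEG of a <2 x s16> value is folded here; matching op_sel is still a TODO.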
3298 std::pair<Register, unsigned>
3299 AMDGPUInstructionSelector::selectVOP3PModsImpl(
3300   Register Src, const MachineRegisterInfo &MRI) const {
3301   unsigned Mods = 0;
3302   MachineInstr *MI = MRI.getVRegDef(Src);
3303 
3304   if (MI && MI->getOpcode() == AMDGPU::G_FNEG &&
3305       // It's possible to see an f32 fneg here, but unlikely.
3306       // TODO: Treat f32 fneg as only high bit.
3307       MRI.getType(Src) == LLT::vector(2, 16)) {
3308     Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
3309     Src = MI->getOperand(1).getReg();
3310     MI = MRI.getVRegDef(Src);
3311   }
3312 
3313   // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
3314 
3315   // Packed instructions do not have abs modifiers.
3316   Mods |= SISrcMods::OP_SEL_1;
3317 
3318   return std::make_pair(Src, Mods);
3319 }
3320 
3321 InstructionSelector::ComplexRendererFns
3322 AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
3323   MachineRegisterInfo &MRI
3324     = Root.getParent()->getParent()->getParent()->getRegInfo();
3325 
3326   Register Src;
3327   unsigned Mods;
3328   std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI);
3329 
3330   return {{
3331       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3332       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }  // src_mods
3333   }};
3334 }
3335 
3336 InstructionSelector::ComplexRendererFns
3337 AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const {
3338   Register Src;
3339   unsigned Mods;
3340   std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3341   if (!isKnownNeverNaN(Src, *MRI))
3342     return None;
3343 
3344   return {{
3345       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3346       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }  // src_mods
3347   }};
3348 }
3349 
3350 InstructionSelector::ComplexRendererFns
3351 AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
3352   // FIXME: Handle op_sel
3353   return {{
3354       [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
3355       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // src_mods
3356   }};
3357 }
3358 
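// Match an SMRD load whose address is a single SGPR base plus an immediate
// that fits in the encoded offset field.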
3359 InstructionSelector::ComplexRendererFns
3360 AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
3361   SmallVector<GEPInfo, 4> AddrInfo;
3362   getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
3363 
3364   if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3365     return None;
3366 
3367   const GEPInfo &GEPInfo = AddrInfo[0];
3368   Optional<int64_t> EncodedImm =
3369       AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm, false);
3370   if (!EncodedImm)
3371     return None;
3372 
3373   unsigned PtrReg = GEPInfo.SgprParts[0];
3374   return {{
3375     [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3376     [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
3377   }};
3378 }
3379 
3380 InstructionSelector::ComplexRendererFns
3381 AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
3382   SmallVector<GEPInfo, 4> AddrInfo;
3383   getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
3384 
3385   if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3386     return None;
3387 
3388   const GEPInfo &GEPInfo = AddrInfo[0];
3389   Register PtrReg = GEPInfo.SgprParts[0];
3390   Optional<int64_t> EncodedImm =
3391       AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
3392   if (!EncodedImm)
3393     return None;
3394 
3395   return {{
3396     [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3397     [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
3398   }};
3399 }
3400 
3401 InstructionSelector::ComplexRendererFns
3402 AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
3403   MachineInstr *MI = Root.getParent();
3404   MachineBasicBlock *MBB = MI->getParent();
3405 
3406   SmallVector<GEPInfo, 4> AddrInfo;
3407   getAddrModeInfo(*MI, *MRI, AddrInfo);
3408 
3409   // FIXME: We should shrink the GEP if the offset is known to fit in 32 bits;
3410   // then we could select all ptr + 32-bit offsets, not just immediate offsets.
3411   if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
3412     return None;
3413 
3414   const GEPInfo &GEPInfo = AddrInfo[0];
3415   // SGPR offset is unsigned.
3416   if (!GEPInfo.Imm || GEPInfo.Imm < 0 || !isUInt<32>(GEPInfo.Imm))
3417     return None;
3418 
3419   // If we make it this far we have a load with a 32-bit immediate offset.
3420   // It is OK to select this using an SGPR offset, because we have already
3421   // failed trying to select this load into one of the _IMM variants since
3422   // the _IMM patterns are considered before the _SGPR patterns.
3423   Register PtrReg = GEPInfo.SgprParts[0];
3424   Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3425   BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg)
3426           .addImm(GEPInfo.Imm);
3427   return {{
3428     [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
3429     [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); }
3430   }};
3431 }
3432 
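// Split a flat address into a base register and a legal immediate offset.
// For example (illustrative MIR, not taken from a particular test):
//   %c:_(s64) = G_CONSTANT i64 16
//   %addr:_(p1) = G_PTR_ADD %base:_(p1), %c:_(s64)
// selects to (%base, 16) when the subtarget has FLAT instruction offsets and
// the offset is legal for the memory operand's address space.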
3433 template <bool Signed>
3434 std::pair<Register, int>
3435 AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const {
3436   MachineInstr *MI = Root.getParent();
3437 
3438   auto Default = std::make_pair(Root.getReg(), 0);
3439 
3440   if (!STI.hasFlatInstOffsets())
3441     return Default;
3442 
3443   Register PtrBase;
3444   int64_t ConstOffset;
3445   std::tie(PtrBase, ConstOffset) =
3446       getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
3447   if (ConstOffset == 0)
3448     return Default;
3449 
3450   unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
3451   if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, Signed))
3452     return Default;
3453 
3454   return std::make_pair(PtrBase, ConstOffset);
3455 }
3456 
3457 InstructionSelector::ComplexRendererFns
3458 AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
3459   auto PtrWithOffset = selectFlatOffsetImpl<false>(Root);
3460 
3461   return {{
3462       [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
3463       [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
3464     }};
3465 }
3466 
3467 InstructionSelector::ComplexRendererFns
3468 AMDGPUInstructionSelector::selectFlatOffsetSigned(MachineOperand &Root) const {
3469   auto PtrWithOffset = selectFlatOffsetImpl<true>(Root);
3470 
3471   return {{
3472       [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
3473       [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
3474     }};
3475 }
3476 
3477 /// Match a zero extend from a 32-bit value to 64-bits.
3478 static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
3479   Register ZExtSrc;
3480   if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc))))
3481     return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
3482 
3483   // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
3484   const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
3485   if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3486     return Register();
3487 
3488   if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
3489     return Def->getOperand(1).getReg();
3490   }
3491 
3492   return Register();
3493 }
3494 
3495 // Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
3496 InstructionSelector::ComplexRendererFns
3497 AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
3498   Register Addr = Root.getReg();
3499   Register PtrBase;
3500   int64_t ConstOffset;
3501   int64_t ImmOffset = 0;
3502 
3503   // Match the immediate offset first, which canonically is moved as low as
3504   // possible.
3505   std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
3506 
3507   if (ConstOffset != 0) {
3508     if (TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, true)) {
3509       Addr = PtrBase;
3510       ImmOffset = ConstOffset;
3511     } else if (ConstOffset > 0) {
3512       auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
3513       if (!PtrBaseDef)
3514         return None;
3515 
3516       if (isSGPR(PtrBaseDef->Reg)) {
3517         // Offset is too large.
3518         //
3519         // saddr + large_offset -> saddr + (voffset = large_offset & ~MaxOffset)
3520         //                         + (large_offset & MaxOffset);
3521         int64_t SplitImmOffset, RemainderOffset;
3522         std::tie(SplitImmOffset, RemainderOffset)
3523           = TII.splitFlatOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, true);
3524 
3525         if (isUInt<32>(RemainderOffset)) {
3526           MachineInstr *MI = Root.getParent();
3527           MachineBasicBlock *MBB = MI->getParent();
3528           Register HighBits
3529             = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3530 
3531           BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
3532                   HighBits)
3533             .addImm(RemainderOffset);
3534 
3535           return {{
3536             [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); },  // saddr
3537             [=](MachineInstrBuilder &MIB) { MIB.addReg(HighBits); }, // voffset
3538             [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
3539           }};
3540         }
3541       }
3542     }
3543   }
3544 
3545   auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
3546   if (!AddrDef)
3547     return None;
3548 
3549   // Match the variable offset.
3550   if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD) {
3551     // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
3552     // drop this.
3553     if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
3554         AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT)
3555       return None;
3556 
3557     // It's cheaper to materialize a single 32-bit zero for vaddr than the two
3558     // moves required to copy a 64-bit SGPR to VGPR.
3559     const Register SAddr = AddrDef->Reg;
3560     if (!isSGPR(SAddr))
3561       return None;
3562 
3563     MachineInstr *MI = Root.getParent();
3564     MachineBasicBlock *MBB = MI->getParent();
3565     Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3566 
3567     BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
3568             VOffset)
3569       .addImm(0);
3570 
3571     return {{
3572         [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); },    // saddr
3573         [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); },  // voffset
3574         [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
3575     }};
3576   }
3577 
3578   // Look through the SGPR->VGPR copy.
3579   Register SAddr =
3580     getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
3581   if (!SAddr || !isSGPR(SAddr))
3582     return None;
3583 
3584   Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
3585 
3586   // It's possible voffset is an SGPR here, but the copy to VGPR will be
3587   // inserted later.
3588   Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset);
3589   if (!VOffset)
3590     return None;
3591 
3592   return {{[=](MachineInstrBuilder &MIB) { // saddr
3593              MIB.addReg(SAddr);
3594            },
3595            [=](MachineInstrBuilder &MIB) { // voffset
3596              MIB.addReg(VOffset);
3597            },
3598            [=](MachineInstrBuilder &MIB) { // offset
3599              MIB.addImm(ImmOffset);
3600            }}};
3601 }
3602 
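// Match an SGPR (or frame index) base plus a legal signed immediate offset for
// the flat scratch SADDR forms; a G_PTR_ADD of a G_FRAME_INDEX and an SGPR
// value is folded into a single S_ADD_U32 for the saddr operand.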
3603 InstructionSelector::ComplexRendererFns
3604 AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
3605   Register Addr = Root.getReg();
3606   Register PtrBase;
3607   int64_t ConstOffset;
3608   int64_t ImmOffset = 0;
3609 
3610   // Match the immediate offset first, which canonically is moved as low as
3611   // possible.
3612   std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
3613 
3614   if (ConstOffset != 0 &&
3615       TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, true)) {
3616     Addr = PtrBase;
3617     ImmOffset = ConstOffset;
3618   }
3619 
3620   auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
3621   if (!AddrDef)
3622     return None;
3623 
3624   if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
3625     int FI = AddrDef->MI->getOperand(1).getIndex();
3626     return {{
3627         [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
3628         [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
3629     }};
3630   }
3631 
3632   Register SAddr = AddrDef->Reg;
3633 
3634   if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
3635     Register LHS = AddrDef->MI->getOperand(1).getReg();
3636     Register RHS = AddrDef->MI->getOperand(2).getReg();
3637     auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
3638     auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI);
3639 
3640     if (LHSDef && RHSDef &&
3641         LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
3642         isSGPR(RHSDef->Reg)) {
3643       int FI = LHSDef->MI->getOperand(1).getIndex();
3644       MachineInstr &I = *Root.getParent();
3645       MachineBasicBlock *BB = I.getParent();
3646       const DebugLoc &DL = I.getDebugLoc();
3647       SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3648 
3649       BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), SAddr)
3650         .addFrameIndex(FI)
3651         .addReg(RHSDef->Reg);
3652     }
3653   }
3654 
3655   if (!isSGPR(SAddr))
3656     return None;
3657 
3658   return {{
3659       [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr
3660       [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
3661   }};
3662 }
3663 
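// Return true if the access is known to be relative to the stack, i.e. its
// pointer info refers to a stack pseudo source value.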
3664 static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) {
3665   auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>();
3666   return PSV && PSV->isStack();
3667 }
3668 
3669 InstructionSelector::ComplexRendererFns
3670 AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
3671   MachineInstr *MI = Root.getParent();
3672   MachineBasicBlock *MBB = MI->getParent();
3673   MachineFunction *MF = MBB->getParent();
3674   const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3675 
3676   int64_t Offset = 0;
3677   if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
3678       Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) {
3679     Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3680 
3681     // TODO: Should this be inside the render function? The iterator seems to
3682     // move.
3683     BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
3684             HighBits)
3685       .addImm(Offset & ~4095);
3686 
3687     return {{[=](MachineInstrBuilder &MIB) { // rsrc
3688                MIB.addReg(Info->getScratchRSrcReg());
3689              },
3690              [=](MachineInstrBuilder &MIB) { // vaddr
3691                MIB.addReg(HighBits);
3692              },
3693              [=](MachineInstrBuilder &MIB) { // soffset
3694                // Use constant zero for soffset and rely on eliminateFrameIndex
3695                // to choose the appropriate frame register if need be.
3696                MIB.addImm(0);
3697              },
3698              [=](MachineInstrBuilder &MIB) { // offset
3699                MIB.addImm(Offset & 4095);
3700              }}};
3701   }
3702 
3703   assert(Offset == 0 || Offset == -1);
3704 
3705   // Try to fold a frame index directly into the MUBUF vaddr field, and any
3706   // offsets.
3707   Optional<int> FI;
3708   Register VAddr = Root.getReg();
3709   if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) {
3710     if (isBaseWithConstantOffset(Root, *MRI)) {
3711       const MachineOperand &LHS = RootDef->getOperand(1);
3712       const MachineOperand &RHS = RootDef->getOperand(2);
3713       const MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg());
3714       const MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg());
3715       if (LHSDef && RHSDef) {
3716         int64_t PossibleOffset =
3717             RHSDef->getOperand(1).getCImm()->getSExtValue();
3718         if (SIInstrInfo::isLegalMUBUFImmOffset(PossibleOffset) &&
3719             (!STI.privateMemoryResourceIsRangeChecked() ||
3720              KnownBits->signBitIsZero(LHS.getReg()))) {
3721           if (LHSDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
3722             FI = LHSDef->getOperand(1).getIndex();
3723           else
3724             VAddr = LHS.getReg();
3725           Offset = PossibleOffset;
3726         }
3727       }
3728     } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
3729       FI = RootDef->getOperand(1).getIndex();
3730     }
3731   }
3732 
3733   return {{[=](MachineInstrBuilder &MIB) { // rsrc
3734              MIB.addReg(Info->getScratchRSrcReg());
3735            },
3736            [=](MachineInstrBuilder &MIB) { // vaddr
3737              if (FI.hasValue())
3738                MIB.addFrameIndex(FI.getValue());
3739              else
3740                MIB.addReg(VAddr);
3741            },
3742            [=](MachineInstrBuilder &MIB) { // soffset
3743              // Use constant zero for soffset and rely on eliminateFrameIndex
3744              // to choose the appropriate frame register if need be.
3745              MIB.addImm(0);
3746            },
3747            [=](MachineInstrBuilder &MIB) { // offset
3748              MIB.addImm(Offset);
3749            }}};
3750 }
3751 
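// DS instructions take a 16-bit unsigned byte offset. On subtargets without a
// usable DS offset, the fold is only safe when the base is known non-negative.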
3752 bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
3753                                                 int64_t Offset) const {
3754   if (!isUInt<16>(Offset))
3755     return false;
3756 
3757   if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
3758     return true;
3759 
3760   // On Southern Islands, instructions with a negative base value and an offset
3761   // don't seem to work.
3762   return KnownBits->signBitIsZero(Base);
3763 }
3764 
3765 bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
3766                                                  int64_t Offset1,
3767                                                  unsigned Size) const {
3768   if (Offset0 % Size != 0 || Offset1 % Size != 0)
3769     return false;
3770   if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
3771     return false;
3772 
3773   if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
3774     return true;
3775 
3776   // On Southern Islands, instructions with a negative base value and an offset
3777   // don't seem to work.
3778   return KnownBits->signBitIsZero(Base);
3779 }
3780 
3781 InstructionSelector::ComplexRendererFns
3782 AMDGPUInstructionSelector::selectMUBUFScratchOffset(
3783     MachineOperand &Root) const {
3784   MachineInstr *MI = Root.getParent();
3785   MachineBasicBlock *MBB = MI->getParent();
3786 
3787   int64_t Offset = 0;
3788   if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
3789       !SIInstrInfo::isLegalMUBUFImmOffset(Offset))
3790     return {};
3791 
3792   const MachineFunction *MF = MBB->getParent();
3793   const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3794   const MachineMemOperand *MMO = *MI->memoperands_begin();
3795   const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();
3796 
3797   return {{
3798       [=](MachineInstrBuilder &MIB) { // rsrc
3799         MIB.addReg(Info->getScratchRSrcReg());
3800       },
3801       [=](MachineInstrBuilder &MIB) { // soffset
3802         if (isStackPtrRelative(PtrInfo))
3803           MIB.addReg(Info->getStackPtrOffsetReg());
3804         else
3805           MIB.addImm(0);
3806       },
3807       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
3808   }};
3809 }
3810 
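// Match a DS address of the form (add n0, c0), returning the base and the byte
// offset when the offset is legal for a single-address DS instruction.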
3811 std::pair<Register, unsigned>
3812 AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
3813   const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
3814   if (!RootDef)
3815     return std::make_pair(Root.getReg(), 0);
3816 
3817   int64_t ConstAddr = 0;
3818 
3819   Register PtrBase;
3820   int64_t Offset;
3821   std::tie(PtrBase, Offset) =
3822     getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
3823 
3824   if (Offset) {
3825     if (isDSOffsetLegal(PtrBase, Offset)) {
3826       // (add n0, c0)
3827       return std::make_pair(PtrBase, Offset);
3828     }
3829   } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
3830     // TODO
3831 
3832 
3833   } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
3834     // TODO
3835 
3836   }
3837 
3838   return std::make_pair(Root.getReg(), 0);
3839 }
3840 
3841 InstructionSelector::ComplexRendererFns
3842 AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
3843   Register Reg;
3844   unsigned Offset;
3845   std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
3846   return {{
3847       [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3848       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
3849     }};
3850 }
3851 
3852 InstructionSelector::ComplexRendererFns
3853 AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
3854   return selectDSReadWrite2(Root, 4);
3855 }
3856 
3857 InstructionSelector::ComplexRendererFns
3858 AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
3859   return selectDSReadWrite2(Root, 8);
3860 }
3861 
3862 InstructionSelector::ComplexRendererFns
3863 AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
3864                                               unsigned Size) const {
3865   Register Reg;
3866   unsigned Offset;
3867   std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size);
3868   return {{
3869       [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3870       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
3871       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
3872     }};
3873 }
3874 
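// Match the base and first element offset for ds_read2/ds_write2 style
// addressing. The returned offset is in units of \p Size; the renderer above
// emits it as offset0 and offset0+1 as offset1.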
3875 std::pair<Register, unsigned>
3876 AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
3877                                                   unsigned Size) const {
3878   const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
3879   if (!RootDef)
3880     return std::make_pair(Root.getReg(), 0);
3881 
3882   int64_t ConstAddr = 0;
3883 
3884   Register PtrBase;
3885   int64_t Offset;
3886   std::tie(PtrBase, Offset) =
3887     getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
3888 
3889   if (Offset) {
3890     int64_t OffsetValue0 = Offset;
3891     int64_t OffsetValue1 = Offset + Size;
3892     if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) {
3893       // (add n0, c0)
3894       return std::make_pair(PtrBase, OffsetValue0 / Size);
3895     }
3896   } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
3897     // TODO
3898 
3899   } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
3900     // TODO
3901 
3902   }
3903 
3904   return std::make_pair(Root.getReg(), 0);
3905 }
3906 
3907 /// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
3908 /// the base value with the constant offset. There may be intervening copies
3909 /// between \p Root and the identified constant. Returns {\p Root, 0} if this
3910 /// does not match the pattern.
3911 std::pair<Register, int64_t>
3912 AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
3913   Register Root, const MachineRegisterInfo &MRI) const {
3914   MachineInstr *RootI = getDefIgnoringCopies(Root, MRI);
3915   if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
3916     return {Root, 0};
3917 
3918   MachineOperand &RHS = RootI->getOperand(2);
3919   Optional<ValueAndVReg> MaybeOffset
3920     = getConstantVRegValWithLookThrough(RHS.getReg(), MRI, true);
3921   if (!MaybeOffset)
3922     return {Root, 0};
3923   return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue()};
3924 }
3925 
3926 static void addZeroImm(MachineInstrBuilder &MIB) {
3927   MIB.addImm(0);
3928 }
3929 
3930 /// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
3931 /// BasePtr is not valid, a null base pointer will be used.
3932 static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
3933                           uint32_t FormatLo, uint32_t FormatHi,
3934                           Register BasePtr) {
3935   Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
3936   Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
3937   Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3938   Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
3939 
3940   B.buildInstr(AMDGPU::S_MOV_B32)
3941     .addDef(RSrc2)
3942     .addImm(FormatLo);
3943   B.buildInstr(AMDGPU::S_MOV_B32)
3944     .addDef(RSrc3)
3945     .addImm(FormatHi);
3946 
3947   // Build the half of the subregister with the constants before building the
3948   // full 128-bit register. If we are building multiple resource descriptors,
3949   // this will allow CSEing of the 2-component register.
3950   B.buildInstr(AMDGPU::REG_SEQUENCE)
3951     .addDef(RSrcHi)
3952     .addReg(RSrc2)
3953     .addImm(AMDGPU::sub0)
3954     .addReg(RSrc3)
3955     .addImm(AMDGPU::sub1);
3956 
3957   Register RSrcLo = BasePtr;
3958   if (!BasePtr) {
3959     RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
3960     B.buildInstr(AMDGPU::S_MOV_B64)
3961       .addDef(RSrcLo)
3962       .addImm(0);
3963   }
3964 
3965   B.buildInstr(AMDGPU::REG_SEQUENCE)
3966     .addDef(RSrc)
3967     .addReg(RSrcLo)
3968     .addImm(AMDGPU::sub0_sub1)
3969     .addReg(RSrcHi)
3970     .addImm(AMDGPU::sub2_sub3);
3971 
3972   return RSrc;
3973 }
3974 
3975 static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
3976                                 const SIInstrInfo &TII, Register BasePtr) {
3977   uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
3978 
3979   // FIXME: Why are half the "default" bits ignored based on the addressing
3980   // mode?
3981   return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
3982 }
3983 
3984 static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
3985                                const SIInstrInfo &TII, Register BasePtr) {
3986   uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
3987 
3988   // FIXME: Why are half the "default" bits ignored based on the addressing
3989   // mode?
3990   return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
3991 }
3992 
3993 AMDGPUInstructionSelector::MUBUFAddressData
3994 AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
3995   MUBUFAddressData Data;
3996   Data.N0 = Src;
3997 
3998   Register PtrBase;
3999   int64_t Offset;
4000 
4001   std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI);
4002   if (isUInt<32>(Offset)) {
4003     Data.N0 = PtrBase;
4004     Data.Offset = Offset;
4005   }
4006 
4007   if (MachineInstr *InputAdd
4008       = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
4009     Data.N2 = InputAdd->getOperand(1).getReg();
4010     Data.N3 = InputAdd->getOperand(2).getReg();
4011 
4012     // FIXME: Need to fix the extra SGPR->VGPR copies that get inserted
4013     // FIXME: We don't know that this was defined by operand 0
4014     //
4015     // TODO: Remove this when we have copy folding optimizations after
4016     // RegBankSelect.
4017     Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
4018     Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
4019   }
4020 
4021   return Data;
4022 }
4023 
4024 /// Return true if the addr64 MUBUF mode should be used for the given address.
4025 bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
4026   // (ptr_add N2, N3) -> addr64, or
4027   // (ptr_add (ptr_add N2, N3), C1) -> addr64
4028   if (Addr.N2)
4029     return true;
4030 
4031   const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
4032   return N0Bank->getID() == AMDGPU::VGPRRegBankID;
4033 }
4034 
4035 /// Split an immediate offset \p ImmOffset depending on whether it fits in the
4036 /// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
4037 /// component.
4038 void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
4039   MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
4040   if (SIInstrInfo::isLegalMUBUFImmOffset(ImmOffset))
4041     return;
4042 
4043   // Illegal offset, store it in soffset.
4044   SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4045   B.buildInstr(AMDGPU::S_MOV_B32)
4046     .addDef(SOffset)
4047     .addImm(ImmOffset);
4048   ImmOffset = 0;
4049 }
4050 
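// Build the operands for an addr64 MUBUF access: decide which components of
// the parsed address become the 64-bit vaddr and which seed the SRD base
// pointer, then materialize the resource descriptor and split any illegal
// immediate offset into soffset.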
4051 bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
4052   MachineOperand &Root, Register &VAddr, Register &RSrcReg,
4053   Register &SOffset, int64_t &Offset) const {
4054   // FIXME: Predicates should stop this from reaching here.
4055   // The addr64 bit was removed for Volcanic Islands.
4056   if (!STI.hasAddr64() || STI.useFlatForGlobal())
4057     return false;
4058 
4059   MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
4060   if (!shouldUseAddr64(AddrData))
4061     return false;
4062 
4063   Register N0 = AddrData.N0;
4064   Register N2 = AddrData.N2;
4065   Register N3 = AddrData.N3;
4066   Offset = AddrData.Offset;
4067 
4068   // Base pointer for the SRD.
4069   Register SRDPtr;
4070 
4071   if (N2) {
4072     if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
4073       assert(N3);
4074       if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
4075         // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
4076         // addr64, and construct the default resource from a 0 address.
4077         VAddr = N0;
4078       } else {
4079         SRDPtr = N3;
4080         VAddr = N2;
4081       }
4082     } else {
4083       // N2 is not divergent.
4084       SRDPtr = N2;
4085       VAddr = N3;
4086     }
4087   } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
4088     // Use the default null pointer in the resource
4089     VAddr = N0;
4090   } else {
4091     // N0 -> offset, or
4092     // (N0 + C1) -> offset
4093     SRDPtr = N0;
4094   }
4095 
4096   MachineIRBuilder B(*Root.getParent());
4097   RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
4098   splitIllegalMUBUFOffset(B, SOffset, Offset);
4099   return true;
4100 }
4101 
4102 bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
4103   MachineOperand &Root, Register &RSrcReg, Register &SOffset,
4104   int64_t &Offset) const {
4105 
4106   // FIXME: Pattern should not reach here.
4107   if (STI.useFlatForGlobal())
4108     return false;
4109 
4110   MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
4111   if (shouldUseAddr64(AddrData))
4112     return false;
4113 
4114   // N0 -> offset, or
4115   // (N0 + C1) -> offset
4116   Register SRDPtr = AddrData.N0;
4117   Offset = AddrData.Offset;
4118 
4119   // TODO: Look through extensions for 32-bit soffset.
4120   MachineIRBuilder B(*Root.getParent());
4121 
4122   RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
4123   splitIllegalMUBUFOffset(B, SOffset, Offset);
4124   return true;
4125 }
4126 
4127 InstructionSelector::ComplexRendererFns
4128 AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
4129   Register VAddr;
4130   Register RSrcReg;
4131   Register SOffset;
4132   int64_t Offset = 0;
4133 
4134   if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
4135     return {};
4136 
4137   // FIXME: Use defaulted operands for trailing 0s and remove from the complex
4138   // pattern.
4139   return {{
4140       [=](MachineInstrBuilder &MIB) {  // rsrc
4141         MIB.addReg(RSrcReg);
4142       },
4143       [=](MachineInstrBuilder &MIB) { // vaddr
4144         MIB.addReg(VAddr);
4145       },
4146       [=](MachineInstrBuilder &MIB) { // soffset
4147         if (SOffset)
4148           MIB.addReg(SOffset);
4149         else
4150           MIB.addImm(0);
4151       },
4152       [=](MachineInstrBuilder &MIB) { // offset
4153         MIB.addImm(Offset);
4154       },
4155       addZeroImm, //  glc
4156       addZeroImm, //  slc
4157       addZeroImm, //  tfe
4158       addZeroImm, //  dlc
4159       addZeroImm, //  swz
4160       addZeroImm  //  scc
4161     }};
4162 }
4163 
4164 InstructionSelector::ComplexRendererFns
4165 AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
4166   Register RSrcReg;
4167   Register SOffset;
4168   int64_t Offset = 0;
4169 
4170   if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
4171     return {};
4172 
4173   return {{
4174       [=](MachineInstrBuilder &MIB) {  // rsrc
4175         MIB.addReg(RSrcReg);
4176       },
4177       [=](MachineInstrBuilder &MIB) { // soffset
4178         if (SOffset)
4179           MIB.addReg(SOffset);
4180         else
4181           MIB.addImm(0);
4182       },
4183       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
4184       addZeroImm, //  glc
4185       addZeroImm, //  slc
4186       addZeroImm, //  tfe
4187       addZeroImm, //  dlc
4188       addZeroImm, //  swz
4189       addZeroImm  //  scc
4190     }};
4191 }
4192 
4193 InstructionSelector::ComplexRendererFns
4194 AMDGPUInstructionSelector::selectMUBUFAddr64Atomic(MachineOperand &Root) const {
4195   Register VAddr;
4196   Register RSrcReg;
4197   Register SOffset;
4198   int64_t Offset = 0;
4199 
4200   if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
4201     return {};
4202 
4203   // FIXME: Use defaulted operands for trailing 0s and remove from the complex
4204   // pattern.
4205   return {{
4206       [=](MachineInstrBuilder &MIB) {  // rsrc
4207         MIB.addReg(RSrcReg);
4208       },
4209       [=](MachineInstrBuilder &MIB) { // vaddr
4210         MIB.addReg(VAddr);
4211       },
4212       [=](MachineInstrBuilder &MIB) { // soffset
4213         if (SOffset)
4214           MIB.addReg(SOffset);
4215         else
4216           MIB.addImm(0);
4217       },
4218       [=](MachineInstrBuilder &MIB) { // offset
4219         MIB.addImm(Offset);
4220       },
4221       addZeroImm //  slc
4222     }};
4223 }
4224 
4225 InstructionSelector::ComplexRendererFns
4226 AMDGPUInstructionSelector::selectMUBUFOffsetAtomic(MachineOperand &Root) const {
4227   Register RSrcReg;
4228   Register SOffset;
4229   int64_t Offset = 0;
4230 
4231   if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
4232     return {};
4233 
4234   return {{
4235       [=](MachineInstrBuilder &MIB) {  // rsrc
4236         MIB.addReg(RSrcReg);
4237       },
4238       [=](MachineInstrBuilder &MIB) { // soffset
4239         if (SOffset)
4240           MIB.addReg(SOffset);
4241         else
4242           MIB.addImm(0);
4243       },
4244       [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
4245       addZeroImm //  slc
4246     }};
4247 }
4248 
4249 /// Get an immediate that must be 32-bits, and treated as zero extended.
4250 static Optional<uint64_t> getConstantZext32Val(Register Reg,
4251                                                const MachineRegisterInfo &MRI) {
4252   // getConstantVRegVal sexts any values, so see if that matters.
4253   Optional<int64_t> OffsetVal = getConstantVRegSExtVal(Reg, MRI);
4254   if (!OffsetVal || !isInt<32>(*OffsetVal))
4255     return None;
4256   return Lo_32(*OffsetVal);
4257 }
4258 
4259 InstructionSelector::ComplexRendererFns
4260 AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
4261   Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
4262   if (!OffsetVal)
4263     return {};
4264 
4265   Optional<int64_t> EncodedImm =
4266       AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
4267   if (!EncodedImm)
4268     return {};
4269 
4270   return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }  }};
4271 }
4272 
4273 InstructionSelector::ComplexRendererFns
4274 AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
4275   assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
4276 
4277   Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
4278   if (!OffsetVal)
4279     return {};
4280 
4281   Optional<int64_t> EncodedImm
4282     = AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
4283   if (!EncodedImm)
4284     return {};
4285 
4286   return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }  }};
4287 }
4288 
4289 void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
4290                                                  const MachineInstr &MI,
4291                                                  int OpIdx) const {
4292   assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
4293          "Expected G_CONSTANT");
4294   MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
4295 }
4296 
4297 void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
4298                                                 const MachineInstr &MI,
4299                                                 int OpIdx) const {
4300   assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
4301          "Expected G_CONSTANT");
4302   MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
4303 }
4304 
4305 void AMDGPUInstructionSelector::renderBitcastImm(MachineInstrBuilder &MIB,
4306                                                  const MachineInstr &MI,
4307                                                  int OpIdx) const {
4308   assert(OpIdx == -1);
4309 
4310   const MachineOperand &Op = MI.getOperand(1);
4311   if (MI.getOpcode() == TargetOpcode::G_FCONSTANT)
4312     MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
4313   else {
4314     assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
4315     MIB.addImm(Op.getCImm()->getSExtValue());
4316   }
4317 }
4318 
4319 void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
4320                                                 const MachineInstr &MI,
4321                                                 int OpIdx) const {
4322   assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
4323          "Expected G_CONSTANT");
4324   MIB.addImm(MI.getOperand(1).getCImm()->getValue().countPopulation());
4325 }
4326 
4327 /// This only really exists to satisfy DAG type checking machinery, so is a
4328 /// no-op here.
4329 void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
4330                                                 const MachineInstr &MI,
4331                                                 int OpIdx) const {
4332   MIB.addImm(MI.getOperand(OpIdx).getImm());
4333 }
4334 
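// The renderExtract* helpers below unpack the combined cache-policy immediate
// used by the buffer intrinsics; going by the shifts used here, bit 0 is glc,
// bit 1 is slc, bit 2 is dlc, bit 3 is swz, and bit 4 is sccb.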
4335 void AMDGPUInstructionSelector::renderExtractGLC(MachineInstrBuilder &MIB,
4336                                                  const MachineInstr &MI,
4337                                                  int OpIdx) const {
4338   assert(OpIdx >= 0 && "expected to match an immediate operand");
4339   MIB.addImm(MI.getOperand(OpIdx).getImm() & 1);
4340 }
4341 
4342 void AMDGPUInstructionSelector::renderExtractSLC(MachineInstrBuilder &MIB,
4343                                                  const MachineInstr &MI,
4344                                                  int OpIdx) const {
4345   assert(OpIdx >= 0 && "expected to match an immediate operand");
4346   MIB.addImm((MI.getOperand(OpIdx).getImm() >> 1) & 1);
4347 }
4348 
4349 void AMDGPUInstructionSelector::renderExtractDLC(MachineInstrBuilder &MIB,
4350                                                  const MachineInstr &MI,
4351                                                  int OpIdx) const {
4352   assert(OpIdx >= 0 && "expected to match an immediate operand");
4353   MIB.addImm((MI.getOperand(OpIdx).getImm() >> 2) & 1);
4354 }
4355 
4356 void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
4357                                                  const MachineInstr &MI,
4358                                                  int OpIdx) const {
4359   assert(OpIdx >= 0 && "expected to match an immediate operand");
4360   MIB.addImm((MI.getOperand(OpIdx).getImm() >> 3) & 1);
4361 }
4362 
4363 void AMDGPUInstructionSelector::renderExtractSCCB(MachineInstrBuilder &MIB,
4364                                                   const MachineInstr &MI,
4365                                                   int OpIdx) const {
4366   assert(OpIdx >= 0 && "expected to match an immediate operand");
4367   MIB.addImm((MI.getOperand(OpIdx).getImm() >> 4) & 1);
4368 }
4369 
4370 void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
4371                                                  const MachineInstr &MI,
4372                                                  int OpIdx) const {
4373   MIB.addFrameIndex((MI.getOperand(1).getIndex()));
4374 }
4375 
4376 bool AMDGPUInstructionSelector::isInlineImmediate16(int64_t Imm) const {
4377   return AMDGPU::isInlinableLiteral16(Imm, STI.hasInv2PiInlineImm());
4378 }
4379 
4380 bool AMDGPUInstructionSelector::isInlineImmediate32(int64_t Imm) const {
4381   return AMDGPU::isInlinableLiteral32(Imm, STI.hasInv2PiInlineImm());
4382 }
4383 
4384 bool AMDGPUInstructionSelector::isInlineImmediate64(int64_t Imm) const {
4385   return AMDGPU::isInlinableLiteral64(Imm, STI.hasInv2PiInlineImm());
4386 }
4387 
4388 bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
4389   return TII.isInlineConstant(Imm);
4390 }
4391