1 //===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the InstructionSelector class for
10 /// AMDGPU.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPUInstructionSelector.h"
15 #include "AMDGPUInstrInfo.h"
16 #include "AMDGPURegisterBankInfo.h"
17 #include "AMDGPURegisterInfo.h"
18 #include "AMDGPUSubtarget.h"
19 #include "AMDGPUTargetMachine.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
22 #include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
23 #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
24 #include "llvm/CodeGen/GlobalISel/Utils.h"
25 #include "llvm/CodeGen/MachineBasicBlock.h"
26 #include "llvm/CodeGen/MachineFunction.h"
27 #include "llvm/CodeGen/MachineInstr.h"
28 #include "llvm/CodeGen/MachineInstrBuilder.h"
29 #include "llvm/CodeGen/MachineRegisterInfo.h"
30 #include "llvm/IR/Type.h"
31 #include "llvm/Support/Debug.h"
32 #include "llvm/Support/raw_ostream.h"
33 
34 #define DEBUG_TYPE "amdgpu-isel"
35 
36 using namespace llvm;
37 
38 #define GET_GLOBALISEL_IMPL
39 #define AMDGPUSubtarget GCNSubtarget
40 #include "AMDGPUGenGlobalISel.inc"
41 #undef GET_GLOBALISEL_IMPL
42 #undef AMDGPUSubtarget
43 
// Constructor wires the selector to the subtarget's instruction/register info
// and pulls in TableGen-generated predicate and temporary initializers.
AMDGPUInstructionSelector::AMDGPUInstructionSelector(
    const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
    const AMDGPUTargetMachine &TM)
    : InstructionSelector(), TII(*STI.getInstrInfo()),
      TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
      STI(STI),
      EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG),
// NOTE: the two .inc includes below expand into additional member
// initializers generated by TableGen; they must stay inside the init list.
#define GET_GLOBALISEL_PREDICATES_INIT
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_INIT
#define GET_GLOBALISEL_TEMPORARIES_INIT
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_TEMPORARIES_INIT
{
}
59 
/// Pass name reported in debug output; reuses the DEBUG_TYPE string.
const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
61 
62 static bool isSCC(Register Reg, const MachineRegisterInfo &MRI) {
63   if (TargetRegisterInfo::isPhysicalRegister(Reg))
64     return Reg == AMDGPU::SCC;
65 
66   auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
67   const TargetRegisterClass *RC =
68       RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
69   if (RC) {
70     if (RC->getID() != AMDGPU::SReg_32_XM0RegClassID)
71       return false;
72     const LLT Ty = MRI.getType(Reg);
73     return Ty.isValid() && Ty.getSizeInBits() == 1;
74   }
75 
76   const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
77   return RB->getID() == AMDGPU::SCCRegBankID;
78 }
79 
80 bool AMDGPUInstructionSelector::isVCC(Register Reg,
81                                       const MachineRegisterInfo &MRI) const {
82   if (TargetRegisterInfo::isPhysicalRegister(Reg))
83     return Reg == TRI.getVCC();
84 
85   auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
86   const TargetRegisterClass *RC =
87       RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
88   if (RC) {
89     return RC->hasSuperClassEq(TRI.getBoolRC()) &&
90            MRI.getType(Reg).getSizeInBits() == 1;
91   }
92 
93   const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
94   return RB->getID() == AMDGPU::VCCRegBankID;
95 }
96 
// Select a generic COPY.  Handles the special case of copying out of the scc
// register bank (modeled as a 32-bit SGPR holding a 1-bit value) into vcc,
// which requires materializing a compare instead of a plain copy.
bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  I.setDesc(TII.get(TargetOpcode::COPY));

  // Special case for COPY from the scc register bank.  The scc register bank
  // is modeled using 32-bit sgprs.
  const MachineOperand &Src = I.getOperand(1);
  unsigned SrcReg = Src.getReg();
  if (!TargetRegisterInfo::isPhysicalRegister(SrcReg) && isSCC(SrcReg, MRI)) {
    unsigned DstReg = I.getOperand(0).getReg();

    // Specially handle scc->vcc copies.
    if (isVCC(DstReg, MRI)) {
      const DebugLoc &DL = I.getDebugLoc();
      // Turn the 0/1 SGPR value into a wave-wide vcc mask by comparing
      // against zero.
      BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
        .addImm(0)
        .addReg(SrcReg);
      // The source may still only have a bank assigned; give it a concrete
      // register class so the V_CMP operand is valid.
      if (!MRI.getRegClassOrNull(SrcReg))
        MRI.setRegClass(SrcReg, TRI.getConstrainedRegClassForOperand(Src, MRI));
      I.eraseFromParent();
      return true;
    }
  }

  // Ordinary copy: constrain every virtual register operand to a concrete
  // register class where one can be derived.
  for (const MachineOperand &MO : I.operands()) {
    if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
      continue;

    const TargetRegisterClass *RC =
            TRI.getConstrainedRegClassForOperand(MO, MRI);
    if (!RC)
      continue;
    RBI.constrainGenericRegister(MO.getReg(), *RC, MRI);
  }
  return true;
}
135 
// Select G_PHI into a target PHI by constraining the def to a register class
// derived from its assigned class or register bank.
bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  const Register DefReg = I.getOperand(0).getReg();
  const LLT DefTy = MRI.getType(DefReg);

  // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)

  const RegClassOrRegBank &RegClassOrBank =
    MRI.getRegClassOrRegBank(DefReg);

  const TargetRegisterClass *DefRC
    = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
  if (!DefRC) {
    // No class assigned yet; derive one from the register bank and LLT.
    if (!DefTy.isValid()) {
      LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
      return false;
    }

    const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
    // scc cannot be PHI'd directly; such PHIs should have been rewritten
    // before selection.
    if (RB.getID() == AMDGPU::SCCRegBankID) {
      LLVM_DEBUG(dbgs() << "illegal scc phi\n");
      return false;
    }

    DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, MRI);
    if (!DefRC) {
      LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
      return false;
    }
  }

  I.setDesc(TII.get(TargetOpcode::PHI));
  return RBI.constrainGenericRegister(DefReg, *DefRC, MRI);
}
173 
/// Produce an operand for one 32-bit half of a 64-bit operand \p MO.
/// For register operands a subregister copy into a fresh register of
/// \p SubRC is emitted and a register operand (with \p MO's flags) is
/// returned; for immediates the requested 32-bit half of the value is
/// returned as an immediate operand.  \p SubIdx must be sub0 or sub1.
MachineOperand
AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
                                           const TargetRegisterClass &SubRC,
                                           unsigned SubIdx) const {

  MachineInstr *MI = MO.getParent();
  MachineBasicBlock *BB = MO.getParent()->getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  Register DstReg = MRI.createVirtualRegister(&SubRC);

  if (MO.isReg()) {
    // Compose with any subregister index already on the operand so nested
    // extractions address the right lanes.
    unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
    unsigned Reg = MO.getReg();
    BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
            .addReg(Reg, 0, ComposedSubIdx);

    // Preserve all operand flags from the original operand on the result.
    return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
                                     MO.isKill(), MO.isDead(), MO.isUndef(),
                                     MO.isEarlyClobber(), 0, MO.isDebug(),
                                     MO.isInternalRead());
  }

  assert(MO.isImm());

  // Split the 64-bit immediate into the requested 32-bit half.
  APInt Imm(64, MO.getImm());

  switch (SubIdx) {
  default:
    llvm_unreachable("do not know to split immediate with this sub index.");
  case AMDGPU::sub0:
    return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
  case AMDGPU::sub1:
    return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
  }
}
210 
211 static int64_t getConstant(const MachineInstr *MI) {
212   return MI->getOperand(1).getCImm()->getSExtValue();
213 }
214 
// Select G_ADD/G_SUB.  32-bit values map to a single scalar or vector
// add/sub; 64-bit adds are expanded into a lo/hi pair chained through carry.
// 64-bit G_SUB is expected to have been legalized away before reaching here.
bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  Register DstReg = I.getOperand(0).getReg();
  const DebugLoc &DL = I.getDebugLoc();
  unsigned Size = RBI.getSizeInBits(DstReg, MRI, TRI);
  const RegisterBank *DstRB = RBI.getRegBank(DstReg, MRI, TRI);
  const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
  const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;

  if (Size == 32) {
    if (IsSALU) {
      const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
      MachineInstr *Add =
        BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
        .add(I.getOperand(1))
        .add(I.getOperand(2));
      I.eraseFromParent();
      return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
    }

    // VALU path: prefer the carry-less encodings when the subtarget has
    // them (gfx9+), mutating I in place.
    if (STI.hasAddNoCarry()) {
      const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
      I.setDesc(TII.get(Opc));
      I.addOperand(*MF, MachineOperand::CreateImm(0));
      I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
      return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
    }

    // Older subtargets: carry-writing form with a dead carry-out def.
    const unsigned Opc = Sub ? AMDGPU::V_SUB_I32_e64 : AMDGPU::V_ADD_I32_e64;

    Register UnusedCarry = MRI.createVirtualRegister(TRI.getWaveMaskRegClass());
    MachineInstr *Add
      = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
      .addDef(UnusedCarry, RegState::Dead)
      .add(I.getOperand(1))
      .add(I.getOperand(2))
      .addImm(0);
    I.eraseFromParent();
    return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
  }

  assert(!Sub && "illegal sub should not reach here");

  // 64-bit add: split both sources into 32-bit halves and chain the carry
  // from the low half into the high half.
  const TargetRegisterClass &RC
    = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
  const TargetRegisterClass &HalfRC
    = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;

  MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
  MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
  MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
  MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));

  Register DstLo = MRI.createVirtualRegister(&HalfRC);
  Register DstHi = MRI.createVirtualRegister(&HalfRC);

  if (IsSALU) {
    // Scalar path: S_ADD_U32 implicitly defines scc, S_ADDC_U32 reads it.
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
      .add(Lo1)
      .add(Lo2);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
      .add(Hi1)
      .add(Hi2);
  } else {
    // Vector path: carry is an explicit wave-mask register.
    const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
    Register CarryReg = MRI.createVirtualRegister(CarryRC);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_I32_e64), DstLo)
      .addDef(CarryReg)
      .add(Lo1)
      .add(Lo2)
      .addImm(0);
    MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
      .addDef(MRI.createVirtualRegister(CarryRC), RegState::Dead)
      .add(Hi1)
      .add(Hi2)
      .addReg(CarryReg, RegState::Kill)
      .addImm(0);

    if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
      return false;
  }

  // Recombine the halves into the 64-bit result.
  BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
    .addReg(DstLo)
    .addImm(AMDGPU::sub0)
    .addReg(DstHi)
    .addImm(AMDGPU::sub1);


  if (!RBI.constrainGenericRegister(DstReg, RC, MRI))
    return false;

  I.eraseFromParent();
  return true;
}
312 
// Select G_EXTRACT of a 32-bit-aligned chunk as a subregister copy.
bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  // Only dword-aligned offsets are supported; the offset picks the channel.
  assert(I.getOperand(2).getImm() % 32 == 0);
  unsigned SubReg = TRI.getSubRegFromChannel(I.getOperand(2).getImm() / 32);
  const DebugLoc &DL = I.getDebugLoc();
  MachineInstr *Copy = BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY),
                               I.getOperand(0).getReg())
                               .addReg(I.getOperand(1).getReg(), 0, SubReg);

  // Constrain both sides of the copy where a class can be derived.
  for (const MachineOperand &MO : Copy->operands()) {
    const TargetRegisterClass *RC =
            TRI.getConstrainedRegClassForOperand(MO, MRI);
    if (!RC)
      continue;
    RBI.constrainGenericRegister(MO.getReg(), *RC, MRI);
  }
  I.eraseFromParent();
  return true;
}
334 
// Select G_MERGE_VALUES as a REG_SEQUENCE assembling the destination from
// equally-sized source pieces.
bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
  MachineBasicBlock *BB = MI.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  Register DstReg = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());

  const unsigned SrcSize = SrcTy.getSizeInBits();
  const DebugLoc &DL = MI.getDebugLoc();
  const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, TRI);
  const unsigned DstSize = DstTy.getSizeInBits();
  const TargetRegisterClass *DstRC =
    TRI.getRegClassForSizeOnBank(DstSize, *DstBank, MRI);
  if (!DstRC)
    return false;

  // One subregister index per source piece, sized in bytes.
  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
  MachineInstrBuilder MIB =
    BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
  for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
    MachineOperand &Src = MI.getOperand(I + 1);
    MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
    MIB.addImm(SubRegs[I]);

    const TargetRegisterClass *SrcRC
      = TRI.getConstrainedRegClassForOperand(Src, MRI);
    if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, MRI))
      return false;
  }

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, MRI))
    return false;

  MI.eraseFromParent();
  return true;
}
372 
// Select G_UNMERGE_VALUES as one subregister copy per destination piece.
bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
  MachineBasicBlock *BB = MI.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  // All operands except the last are defs; the last is the wide source.
  const int NumDst = MI.getNumOperands() - 1;

  MachineOperand &Src = MI.getOperand(NumDst);

  Register SrcReg = Src.getReg();
  Register DstReg0 = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg0);
  LLT SrcTy = MRI.getType(SrcReg);

  const unsigned DstSize = DstTy.getSizeInBits();
  const unsigned SrcSize = SrcTy.getSizeInBits();
  const DebugLoc &DL = MI.getDebugLoc();
  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, TRI);

  const TargetRegisterClass *SrcRC =
    TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, MRI);
  if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI))
    return false;

  const unsigned SrcFlags = getUndefRegState(Src.isUndef());

  // Note we could have mixed SGPR and VGPR destination banks for an SGPR
  // source, and this relies on the fact that the same subregister indices are
  // used for both.
  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
  for (int I = 0, E = NumDst; I != E; ++I) {
    MachineOperand &Dst = MI.getOperand(I);
    BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
      .addReg(SrcReg, SrcFlags, SubRegs[I]);

    const TargetRegisterClass *DstRC =
      TRI.getConstrainedRegClassForOperand(Dst, MRI);
    if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, MRI))
      return false;
  }

  MI.eraseFromParent();
  return true;
}
416 
// G_GEP is pointer arithmetic, which lowers identically to an integer add.
bool AMDGPUInstructionSelector::selectG_GEP(MachineInstr &I) const {
  return selectG_ADD_SUB(I);
}
420 
421 bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
422   MachineBasicBlock *BB = I.getParent();
423   MachineFunction *MF = BB->getParent();
424   MachineRegisterInfo &MRI = MF->getRegInfo();
425   const MachineOperand &MO = I.getOperand(0);
426 
427   // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
428   // regbank check here is to know why getConstrainedRegClassForOperand failed.
429   const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, MRI);
430   if ((!RC && !MRI.getRegBankOrNull(MO.getReg())) ||
431       (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, MRI))) {
432     I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
433     return true;
434   }
435 
436   return false;
437 }
438 
// Select G_INSERT of a 32-bit-aligned chunk as an INSERT_SUBREG.
bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  // The bit offset (operand 3) selects the destination channel.
  unsigned SubReg = TRI.getSubRegFromChannel(I.getOperand(3).getImm() / 32);
  DebugLoc DL = I.getDebugLoc();
  MachineInstr *Ins = BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG))
                               .addDef(I.getOperand(0).getReg())
                               .addReg(I.getOperand(1).getReg())
                               .addReg(I.getOperand(2).getReg())
                               .addImm(SubReg);

  // Constrain all virtual register operands where a class can be derived.
  for (const MachineOperand &MO : Ins->operands()) {
    if (!MO.isReg())
      continue;
    if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
      continue;

    const TargetRegisterClass *RC =
            TRI.getConstrainedRegClassForOperand(MO, MRI);
    if (!RC)
      continue;
    RBI.constrainGenericRegister(MO.getReg(), *RC, MRI);
  }
  I.eraseFromParent();
  return true;
}
466 
467 bool AMDGPUInstructionSelector::selectG_INTRINSIC(
468   MachineInstr &I, CodeGenCoverage &CoverageInfo) const {
469   unsigned IntrinsicID =  I.getOperand(I.getNumExplicitDefs()).getIntrinsicID();
470   switch (IntrinsicID) {
471   case Intrinsic::maxnum:
472   case Intrinsic::minnum:
473   case Intrinsic::amdgcn_cvt_pkrtz:
474     return selectImpl(I, CoverageInfo);
475   default:
476     return selectImpl(I, CoverageInfo);
477   }
478 }
479 
480 static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size) {
481   if (Size != 32 && Size != 64)
482     return -1;
483   switch (P) {
484   default:
485     llvm_unreachable("Unknown condition code!");
486   case CmpInst::ICMP_NE:
487     return Size == 32 ? AMDGPU::V_CMP_NE_U32_e64 : AMDGPU::V_CMP_NE_U64_e64;
488   case CmpInst::ICMP_EQ:
489     return Size == 32 ? AMDGPU::V_CMP_EQ_U32_e64 : AMDGPU::V_CMP_EQ_U64_e64;
490   case CmpInst::ICMP_SGT:
491     return Size == 32 ? AMDGPU::V_CMP_GT_I32_e64 : AMDGPU::V_CMP_GT_I64_e64;
492   case CmpInst::ICMP_SGE:
493     return Size == 32 ? AMDGPU::V_CMP_GE_I32_e64 : AMDGPU::V_CMP_GE_I64_e64;
494   case CmpInst::ICMP_SLT:
495     return Size == 32 ? AMDGPU::V_CMP_LT_I32_e64 : AMDGPU::V_CMP_LT_I64_e64;
496   case CmpInst::ICMP_SLE:
497     return Size == 32 ? AMDGPU::V_CMP_LE_I32_e64 : AMDGPU::V_CMP_LE_I64_e64;
498   case CmpInst::ICMP_UGT:
499     return Size == 32 ? AMDGPU::V_CMP_GT_U32_e64 : AMDGPU::V_CMP_GT_U64_e64;
500   case CmpInst::ICMP_UGE:
501     return Size == 32 ? AMDGPU::V_CMP_GE_U32_e64 : AMDGPU::V_CMP_GE_U64_e64;
502   case CmpInst::ICMP_ULT:
503     return Size == 32 ? AMDGPU::V_CMP_LT_U32_e64 : AMDGPU::V_CMP_LT_U64_e64;
504   case CmpInst::ICMP_ULE:
505     return Size == 32 ? AMDGPU::V_CMP_LE_U32_e64 : AMDGPU::V_CMP_LE_U64_e64;
506   }
507 }
508 
509 int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
510                                               unsigned Size) const {
511   if (Size == 64) {
512     if (!STI.hasScalarCompareEq64())
513       return -1;
514 
515     switch (P) {
516     case CmpInst::ICMP_NE:
517       return AMDGPU::S_CMP_LG_U64;
518     case CmpInst::ICMP_EQ:
519       return AMDGPU::S_CMP_EQ_U64;
520     default:
521       return -1;
522     }
523   }
524 
525   if (Size != 32)
526     return -1;
527 
528   switch (P) {
529   case CmpInst::ICMP_NE:
530     return AMDGPU::S_CMP_LG_U32;
531   case CmpInst::ICMP_EQ:
532     return AMDGPU::S_CMP_EQ_U32;
533   case CmpInst::ICMP_SGT:
534     return AMDGPU::S_CMP_GT_I32;
535   case CmpInst::ICMP_SGE:
536     return AMDGPU::S_CMP_GE_I32;
537   case CmpInst::ICMP_SLT:
538     return AMDGPU::S_CMP_LT_I32;
539   case CmpInst::ICMP_SLE:
540     return AMDGPU::S_CMP_LE_I32;
541   case CmpInst::ICMP_UGT:
542     return AMDGPU::S_CMP_GT_U32;
543   case CmpInst::ICMP_UGE:
544     return AMDGPU::S_CMP_GE_U32;
545   case CmpInst::ICMP_ULT:
546     return AMDGPU::S_CMP_LT_U32;
547   case CmpInst::ICMP_ULE:
548     return AMDGPU::S_CMP_LE_U32;
549   default:
550     llvm_unreachable("Unknown condition code!");
551   }
552 }
553 
// Select G_ICMP.  Compares producing an scc value use scalar S_CMP_* (which
// implicitly defines SCC, then copied out); otherwise a VALU compare writing
// a wave mask is used.
bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  DebugLoc DL = I.getDebugLoc();

  unsigned SrcReg = I.getOperand(2).getReg();
  unsigned Size = RBI.getSizeInBits(SrcReg, MRI, TRI);

  auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();

  unsigned CCReg = I.getOperand(0).getReg();
  if (isSCC(CCReg, MRI)) {
    int Opcode = getS_CMPOpcode(Pred, Size);
    if (Opcode == -1)
      return false;
    // S_CMP_* writes SCC implicitly; copy it into the result register.
    MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
            .add(I.getOperand(2))
            .add(I.getOperand(3));
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
      .addReg(AMDGPU::SCC);
    bool Ret =
        constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
        RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, MRI);
    I.eraseFromParent();
    return Ret;
  }

  int Opcode = getV_CMPOpcode(Pred, Size);
  if (Opcode == -1)
    return false;

  // VALU compare: the destination holds the wave-wide comparison mask.
  MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode),
            I.getOperand(0).getReg())
            .add(I.getOperand(2))
            .add(I.getOperand(3));
  RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
                               AMDGPU::SReg_64RegClass, MRI);
  bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
  I.eraseFromParent();
  return Ret;
}
596 
/// Build an export (EXP/EXP_DONE) instruction before \p Insert.
/// \p Tgt is the export target, \p Reg0..\p Reg3 the source registers,
/// \p Enabled the channel-enable mask; \p Done selects the EXP_DONE opcode.
static MachineInstr *
buildEXP(const TargetInstrInfo &TII, MachineInstr *Insert, unsigned Tgt,
         unsigned Reg0, unsigned Reg1, unsigned Reg2, unsigned Reg3,
         unsigned VM, bool Compr, unsigned Enabled, bool Done) {
  const DebugLoc &DL = Insert->getDebugLoc();
  MachineBasicBlock &BB = *Insert->getParent();
  unsigned Opcode = Done ? AMDGPU::EXP_DONE : AMDGPU::EXP;
  return BuildMI(BB, Insert, DL, TII.get(Opcode))
          .addImm(Tgt)
          .addReg(Reg0)
          .addReg(Reg1)
          .addReg(Reg2)
          .addReg(Reg3)
          .addImm(VM)
          .addImm(Compr)
          .addImm(Enabled);
}
614 
// Select side-effecting intrinsics.  Handles the export intrinsics here
// (their immediate operands arrive as G_CONSTANT defs that must be folded);
// everything else goes to the TableGen-erated selector.
bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
  MachineInstr &I, CodeGenCoverage &CoverageInfo) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  unsigned IntrinsicID = I.getOperand(0).getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_exp: {
    // tgt/en/done/vm are immediates fed through G_CONSTANT defs.
    int64_t Tgt = getConstant(MRI.getVRegDef(I.getOperand(1).getReg()));
    int64_t Enabled = getConstant(MRI.getVRegDef(I.getOperand(2).getReg()));
    int64_t Done = getConstant(MRI.getVRegDef(I.getOperand(7).getReg()));
    int64_t VM = getConstant(MRI.getVRegDef(I.getOperand(8).getReg()));

    MachineInstr *Exp = buildEXP(TII, &I, Tgt, I.getOperand(3).getReg(),
                                 I.getOperand(4).getReg(),
                                 I.getOperand(5).getReg(),
                                 I.getOperand(6).getReg(),
                                 VM, false, Enabled, Done);

    I.eraseFromParent();
    return constrainSelectedInstRegOperands(*Exp, TII, TRI, RBI);
  }
  case Intrinsic::amdgcn_exp_compr: {
    const DebugLoc &DL = I.getDebugLoc();
    int64_t Tgt = getConstant(MRI.getVRegDef(I.getOperand(1).getReg()));
    int64_t Enabled = getConstant(MRI.getVRegDef(I.getOperand(2).getReg()));
    unsigned Reg0 = I.getOperand(3).getReg();
    unsigned Reg1 = I.getOperand(4).getReg();
    unsigned Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    int64_t Done = getConstant(MRI.getVRegDef(I.getOperand(5).getReg()));
    int64_t VM = getConstant(MRI.getVRegDef(I.getOperand(6).getReg()));

    // Compressed exports only use two source registers; the remaining
    // operands are filled with an undef VGPR.
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef);
    MachineInstr *Exp = buildEXP(TII, &I, Tgt, Reg0, Reg1, Undef, Undef, VM,
                                 true,  Enabled, Done);

    I.eraseFromParent();
    return constrainSelectedInstRegOperands(*Exp, TII, TRI, RBI);
  }
  default:
    return selectImpl(I, CoverageInfo);
  }
}
659 
660 bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
661   MachineBasicBlock *BB = I.getParent();
662   MachineFunction *MF = BB->getParent();
663   MachineRegisterInfo &MRI = MF->getRegInfo();
664   const DebugLoc &DL = I.getDebugLoc();
665 
666   unsigned DstReg = I.getOperand(0).getReg();
667   unsigned Size = RBI.getSizeInBits(DstReg, MRI, TRI);
668   assert(Size <= 32 || Size == 64);
669   const MachineOperand &CCOp = I.getOperand(1);
670   unsigned CCReg = CCOp.getReg();
671   if (isSCC(CCReg, MRI)) {
672     unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
673                                          AMDGPU::S_CSELECT_B32;
674     MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
675             .addReg(CCReg);
676 
677     // The generic constrainSelectedInstRegOperands doesn't work for the scc register
678     // bank, because it does not cover the register class that we used to represent
679     // for it.  So we need to manually set the register class here.
680     if (!MRI.getRegClassOrNull(CCReg))
681         MRI.setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, MRI));
682     MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
683             .add(I.getOperand(2))
684             .add(I.getOperand(3));
685 
686     bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI) |
687                constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
688     I.eraseFromParent();
689     return Ret;
690   }
691 
692   // Wide VGPR select should have been split in RegBankSelect.
693   if (Size > 32)
694     return false;
695 
696   MachineInstr *Select =
697       BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
698               .addImm(0)
699               .add(I.getOperand(3))
700               .addImm(0)
701               .add(I.getOperand(2))
702               .add(I.getOperand(1));
703 
704   bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
705   I.eraseFromParent();
706   return Ret;
707 }
708 
// Select G_STORE as a FLAT store sized by the stored value (32-128 bits).
// Only 64-bit (flat) pointers are handled.
bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  DebugLoc DL = I.getDebugLoc();
  unsigned PtrSize = RBI.getSizeInBits(I.getOperand(1).getReg(), MRI, TRI);
  if (PtrSize != 64) {
    LLVM_DEBUG(dbgs() << "Unhandled address space\n");
    return false;
  }

  unsigned StoreSize = RBI.getSizeInBits(I.getOperand(0).getReg(), MRI, TRI);
  unsigned Opcode;

  // FIXME: Select store instruction based on address space
  switch (StoreSize) {
  default:
    return false;
  case 32:
    Opcode = AMDGPU::FLAT_STORE_DWORD;
    break;
  case 64:
    Opcode = AMDGPU::FLAT_STORE_DWORDX2;
    break;
  case 96:
    Opcode = AMDGPU::FLAT_STORE_DWORDX3;
    break;
  case 128:
    Opcode = AMDGPU::FLAT_STORE_DWORDX4;
    break;
  }

  // FLAT stores take the address first, then the data.
  MachineInstr *Flat = BuildMI(*BB, &I, DL, TII.get(Opcode))
          .add(I.getOperand(1))
          .add(I.getOperand(0))
          .addImm(0)  // offset
          .addImm(0)  // glc
          .addImm(0)  // slc
          .addImm(0); // dlc


  // Now that we selected an opcode, we need to constrain the register
  // operands to use appropriate classes.
  bool Ret = constrainSelectedInstRegOperands(*Flat, TII, TRI, RBI);

  I.eraseFromParent();
  return Ret;
}
757 
758 static int sizeToSubRegIndex(unsigned Size) {
759   switch (Size) {
760   case 32:
761     return AMDGPU::sub0;
762   case 64:
763     return AMDGPU::sub0_sub1;
764   case 96:
765     return AMDGPU::sub0_sub1_sub2;
766   case 128:
767     return AMDGPU::sub0_sub1_sub2_sub3;
768   case 256:
769     return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
770   default:
771     if (Size < 32)
772       return AMDGPU::sub0;
773     if (Size > 256)
774       return -1;
775     return sizeToSubRegIndex(PowerOf2Ceil(Size));
776   }
777 }
778 
// Select G_TRUNC as a (possibly subregister) COPY.  Source and destination
// must be on the same register bank; wide sources read only the relevant
// low subregister.
bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  unsigned DstReg = I.getOperand(0).getReg();
  unsigned SrcReg = I.getOperand(1).getReg();
  const LLT DstTy = MRI.getType(DstReg);
  const LLT SrcTy = MRI.getType(SrcReg);
  if (!DstTy.isScalar())
    return false;

  const RegisterBank *DstRB = RBI.getRegBank(DstReg, MRI, TRI);
  const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, MRI, TRI);
  // Cross-bank truncations would need a real copy, not a subreg read.
  if (SrcRB != DstRB)
    return false;

  unsigned DstSize = DstTy.getSizeInBits();
  unsigned SrcSize = SrcTy.getSizeInBits();

  const TargetRegisterClass *SrcRC
    = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, MRI);
  const TargetRegisterClass *DstRC
    = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, MRI);

  if (SrcSize > 32) {
    // Read only the destination-sized low portion of the wide source.
    int SubRegIdx = sizeToSubRegIndex(DstSize);
    if (SubRegIdx == -1)
      return false;

    // Deal with weird cases where the class only partially supports the subreg
    // index.
    SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
    if (!SrcRC)
      return false;

    I.getOperand(1).setSubReg(SubRegIdx);
  }

  if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
      !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
    LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
    return false;
  }

  I.setDesc(TII.get(TargetOpcode::COPY));
  return true;
}
827 
828 /// \returns true if a bitmask for \p Size bits will be an inline immediate.
829 static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
830   Mask = maskTrailingOnes<unsigned>(Size);
831   int SignedMask = static_cast<int>(Mask);
832   return SignedMask >= -16 && SignedMask <= 64;
833 }
834 
/// Select G_SEXT / G_ZEXT / G_ANYEXT, dispatching on the source operand's
/// register bank (SCC, VCC, VGPR or SGPR). Returns false for unhandled
/// combinations so selection can fall back to the imported tablegen patterns.
/// On success the original instruction is left in place; the caller erases it
/// (see select()).
bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
  // Only G_SEXT needs the sign-aware opcodes below.
  bool Signed = I.getOpcode() == AMDGPU::G_SEXT;
  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock &MBB = *I.getParent();
  MachineFunction &MF = *MBB.getParent();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const unsigned DstReg = I.getOperand(0).getReg();
  const unsigned SrcReg = I.getOperand(1).getReg();

  const LLT DstTy = MRI.getType(DstReg);
  const LLT SrcTy = MRI.getType(SrcReg);
  const LLT S1 = LLT::scalar(1);
  const unsigned SrcSize = SrcTy.getSizeInBits();
  const unsigned DstSize = DstTy.getSizeInBits();
  // Vector extensions are not handled here.
  if (!DstTy.isScalar())
    return false;

  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, TRI);

  if (SrcBank->getID() == AMDGPU::SCCRegBankID) {
    // Extend an SCC bit by selecting 0 or +/-1 with S_CSELECT keyed off SCC.
    if (SrcTy != S1 || DstSize > 64) // Invalid
      return false;

    unsigned Opcode =
        DstSize > 32 ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
    const TargetRegisterClass *DstRC =
        DstSize > 32 ? &AMDGPU::SReg_64RegClass : &AMDGPU::SReg_32RegClass;

    // FIXME: Create an extra copy to avoid incorrectly constraining the result
    // of the scc producer.
    unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(MBB, I, DL, TII.get(AMDGPU::COPY), TmpReg)
      .addReg(SrcReg);
    BuildMI(MBB, I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
      .addReg(TmpReg);

    // The instruction operands are backwards from what you would expect.
    BuildMI(MBB, I, DL, TII.get(Opcode), DstReg)
      .addImm(0)
      .addImm(Signed ? -1 : 1);
    return RBI.constrainGenericRegister(DstReg, *DstRC, MRI);
  }

  if (SrcBank->getID() == AMDGPU::VCCRegBankID && DstSize <= 32) {
    // Extend a VCC bit by selecting 0 or +/-1 per lane with V_CNDMASK.
    if (SrcTy != S1) // Invalid
      return false;

    MachineInstr *ExtI =
      BuildMI(MBB, I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
      .addImm(0)               // src0_modifiers
      .addImm(0)               // src0
      .addImm(0)               // src1_modifiers
      .addImm(Signed ? -1 : 1) // src1
      .addUse(SrcReg);
    return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
  }

  // For anyext the high bits are undefined, so a plain copy suffices.
  if (I.getOpcode() == AMDGPU::G_ANYEXT)
    return selectCOPY(I);

  if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
    // 64-bit should have been split up in RegBankSelect

    // Try to use an and with a mask if it will save code size.
    unsigned Mask;
    if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
      MachineInstr *ExtI =
      BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
        .addImm(Mask)
        .addReg(SrcReg);
      return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
    }

    // Otherwise extract SrcSize bits with a signed/unsigned bitfield extract.
    const unsigned BFE = Signed ? AMDGPU::V_BFE_I32 : AMDGPU::V_BFE_U32;
    MachineInstr *ExtI =
      BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
      .addReg(SrcReg)
      .addImm(0) // Offset
      .addImm(SrcSize); // Width
    return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
  }

  if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
    if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI))
      return false;

    // 8/16 -> 32-bit sign extension has dedicated scalar instructions.
    if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
      const unsigned SextOpc = SrcSize == 8 ?
        AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
      BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
        .addReg(SrcReg);
      return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, MRI);
    }

    const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
    const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;

    // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width.
    if (DstSize > 32 && SrcSize <= 32) {
      // We need a 64-bit register source, but the high bits don't matter.
      unsigned ExtReg
        = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
      unsigned UndefReg
        = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
      BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
        .addReg(SrcReg)
        .addImm(AMDGPU::sub0)
        .addReg(UndefReg)
        .addImm(AMDGPU::sub1);

      // Offset 0, width SrcSize (in bits [22:16] of the immediate).
      BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
        .addReg(ExtReg)
        .addImm(SrcSize << 16);

      return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, MRI);
    }

    // 32-bit destination: prefer an AND with an inline-immediate mask,
    // otherwise a 32-bit scalar bitfield extract.
    unsigned Mask;
    if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
      BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
        .addReg(SrcReg)
        .addImm(Mask);
    } else {
      BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
        .addReg(SrcReg)
        .addImm(SrcSize << 16);
    }

    return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, MRI);
  }

  return false;
}
969 
/// Select G_CONSTANT / G_FCONSTANT into S_MOV_B32 / V_MOV_B32_e32, splitting
/// 64-bit values into two 32-bit moves joined by a REG_SEQUENCE. Returns
/// false for sizes other than 32 or 64 bits.
bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineOperand &ImmOp = I.getOperand(1);

  // The AMDGPU backend only supports Imm operands and not CImm or FPImm.
  if (ImmOp.isFPImm()) {
    // FP constants are moved as their raw bit pattern.
    const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt();
    ImmOp.ChangeToImmediate(Imm.getZExtValue());
  } else if (ImmOp.isCImm()) {
    ImmOp.ChangeToImmediate(ImmOp.getCImm()->getZExtValue());
  }

  unsigned DstReg = I.getOperand(0).getReg();
  unsigned Size;
  bool IsSgpr;
  const RegisterBank *RB = MRI.getRegBankOrNull(I.getOperand(0).getReg());
  if (RB) {
    IsSgpr = RB->getID() == AMDGPU::SGPRRegBankID;
    Size = MRI.getType(DstReg).getSizeInBits();
  } else {
    // No bank assigned; fall back to the destination's register class.
    const TargetRegisterClass *RC = TRI.getRegClassForReg(MRI, DstReg);
    IsSgpr = TRI.isSGPRClass(RC);
    Size = TRI.getRegSizeInBits(*RC);
  }

  if (Size != 32 && Size != 64)
    return false;

  unsigned Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
  if (Size == 32) {
    // 32-bit: mutate the instruction in place into the move.
    I.setDesc(TII.get(Opcode));
    I.addImplicitDefUseOperands(*MF);
    return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
  }

  // 64-bit: move the low and high halves separately, then recombine.
  DebugLoc DL = I.getDebugLoc();
  const TargetRegisterClass *RC = IsSgpr ? &AMDGPU::SReg_32_XM0RegClass :
                                           &AMDGPU::VGPR_32RegClass;
  unsigned LoReg = MRI.createVirtualRegister(RC);
  unsigned HiReg = MRI.createVirtualRegister(RC);
  const APInt &Imm = APInt(Size, I.getOperand(1).getImm());

  BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg)
          .addImm(Imm.trunc(32).getZExtValue());

  BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg)
          .addImm(Imm.ashr(32).getZExtValue());

  const MachineInstr *RS =
      BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
              .addReg(LoReg)
              .addImm(AMDGPU::sub0)
              .addReg(HiReg)
              .addImm(AMDGPU::sub1);

  // We can't call constrainSelectedInstRegOperands here, because it doesn't
  // work for target independent opcodes
  I.eraseFromParent();
  const TargetRegisterClass *DstRC =
      TRI.getConstrainedRegClassForOperand(RS->getOperand(0), MRI);
  if (!DstRC)
    return true;
  return RBI.constrainGenericRegister(DstReg, *DstRC, MRI);
}
1036 
1037 static bool isConstant(const MachineInstr &MI) {
1038   return MI.getOpcode() == TargetOpcode::G_CONSTANT;
1039 }
1040 
1041 void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
1042     const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
1043 
1044   const MachineInstr *PtrMI = MRI.getUniqueVRegDef(Load.getOperand(1).getReg());
1045 
1046   assert(PtrMI);
1047 
1048   if (PtrMI->getOpcode() != TargetOpcode::G_GEP)
1049     return;
1050 
1051   GEPInfo GEPInfo(*PtrMI);
1052 
1053   for (unsigned i = 1, e = 3; i < e; ++i) {
1054     const MachineOperand &GEPOp = PtrMI->getOperand(i);
1055     const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
1056     assert(OpDef);
1057     if (isConstant(*OpDef)) {
1058       // FIXME: Is it possible to have multiple Imm parts?  Maybe if we
1059       // are lacking other optimizations.
1060       assert(GEPInfo.Imm == 0);
1061       GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
1062       continue;
1063     }
1064     const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
1065     if (OpBank->getID() == AMDGPU::SGPRRegBankID)
1066       GEPInfo.SgprParts.push_back(GEPOp.getReg());
1067     else
1068       GEPInfo.VgprParts.push_back(GEPOp.getReg());
1069   }
1070 
1071   AddrInfo.push_back(GEPInfo);
1072   getAddrModeInfo(*PtrMI, MRI, AddrInfo);
1073 }
1074 
1075 bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
1076   if (!MI.hasOneMemOperand())
1077     return false;
1078 
1079   const MachineMemOperand *MMO = *MI.memoperands_begin();
1080   const Value *Ptr = MMO->getValue();
1081 
1082   // UndefValue means this is a load of a kernel input.  These are uniform.
1083   // Sometimes LDS instructions have constant pointers.
1084   // If Ptr is null, then that means this mem operand contains a
1085   // PseudoSourceValue like GOT.
1086   if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
1087       isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
1088     return true;
1089 
1090   if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
1091     return true;
1092 
1093   const Instruction *I = dyn_cast<Instruction>(Ptr);
1094   return I && I->getMetadata("amdgpu.uniform");
1095 }
1096 
1097 bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
1098   for (const GEPInfo &GEPInfo : AddrInfo) {
1099     if (!GEPInfo.VgprParts.empty())
1100       return true;
1101   }
1102   return false;
1103 }
1104 
/// Manual fallback selection for G_LOAD: emits a FLAT load for 32/64-bit
/// results with a 64-bit pointer. Reached only after the imported tablegen
/// patterns have failed (see select()).
bool AMDGPUInstructionSelector::selectG_LOAD(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  const DebugLoc &DL = I.getDebugLoc();
  Register DstReg = I.getOperand(0).getReg();
  Register PtrReg = I.getOperand(1).getReg();
  unsigned LoadSize = RBI.getSizeInBits(DstReg, MRI, TRI);
  unsigned Opcode;

  // Reject 32-bit pointers; only 64-bit addresses are handled below.
  if (MRI.getType(I.getOperand(1).getReg()).getSizeInBits() == 32) {
    LLVM_DEBUG(dbgs() << "Unhandled address space\n");
    return false;
  }

  SmallVector<GEPInfo, 4> AddrInfo;

  // NOTE(review): AddrInfo is computed but not consulted below - presumably
  // kept for future addressing-mode selection; confirm before removing.
  getAddrModeInfo(I, MRI, AddrInfo);

  switch (LoadSize) {
  case 32:
    Opcode = AMDGPU::FLAT_LOAD_DWORD;
    break;
  case 64:
    Opcode = AMDGPU::FLAT_LOAD_DWORDX2;
    break;
  default:
    LLVM_DEBUG(dbgs() << "Unhandled load size\n");
    return false;
  }

  MachineInstr *Flat = BuildMI(*BB, &I, DL, TII.get(Opcode))
                               .add(I.getOperand(0))
                               .addReg(PtrReg)
                               .addImm(0)  // offset
                               .addImm(0)  // glc
                               .addImm(0)  // slc
                               .addImm(0); // dlc

  bool Ret = constrainSelectedInstRegOperands(*Flat, TII, TRI, RBI);
  I.eraseFromParent();
  return Ret;
}
1148 
/// Select G_BRCOND into a conditional scalar branch. The condition value is
/// copied into the matching physical register (SCC for scalar conditions,
/// VCC for vector conditions) feeding S_CBRANCH_SCC1 / S_CBRANCH_VCCNZ.
bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineOperand &CondOp = I.getOperand(0);
  Register CondReg = CondOp.getReg();
  const DebugLoc &DL = I.getDebugLoc();

  unsigned BrOpcode;
  Register CondPhysReg;
  const TargetRegisterClass *ConstrainRC;

  // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
  // whether the branch is uniform when selecting the instruction. In
  // GlobalISel, we should push that decision into RegBankSelect. Assume for now
  // RegBankSelect knows what it's doing if the branch condition is scc, even
  // though it currently does not.
  if (isSCC(CondReg, MRI)) {
    CondPhysReg = AMDGPU::SCC;
    BrOpcode = AMDGPU::S_CBRANCH_SCC1;
    ConstrainRC = &AMDGPU::SReg_32_XM0RegClass;
  } else if (isVCC(CondReg, MRI)) {
    // FIXME: Do we have to insert an and with exec here, like in SelectionDAG?
    // We sort of know that a VCC producer based on the register bank, that ands
    // inactive lanes with 0. What if there was a logical operation with vcc
    // producers in different blocks/with different exec masks?
    // FIXME: Should scc->vcc copies and with exec?
    CondPhysReg = TRI.getVCC();
    BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
    ConstrainRC = TRI.getBoolRC();
  } else
    return false;

  // Give the condition vreg a concrete class if it is still bank-only.
  if (!MRI.getRegClassOrNull(CondReg))
    MRI.setRegClass(CondReg, ConstrainRC);

  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
    .addReg(CondReg);
  BuildMI(*BB, &I, DL, TII.get(BrOpcode))
    .addMBB(I.getOperand(1).getMBB());

  I.eraseFromParent();
  return true;
}
1193 
1194 bool AMDGPUInstructionSelector::selectG_FRAME_INDEX(MachineInstr &I) const {
1195   MachineBasicBlock *BB = I.getParent();
1196   MachineFunction *MF = BB->getParent();
1197   MachineRegisterInfo &MRI = MF->getRegInfo();
1198 
1199   Register DstReg = I.getOperand(0).getReg();
1200   const RegisterBank *DstRB = RBI.getRegBank(DstReg, MRI, TRI);
1201   const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
1202   I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
1203   if (IsVGPR)
1204     I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
1205 
1206   return RBI.constrainGenericRegister(
1207     DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, MRI);
1208 }
1209 
/// Top-level GlobalISel selection entry point: dispatch \p I to the matching
/// manual selector or to the tablegen-imported selectImpl().
bool AMDGPUInstructionSelector::select(MachineInstr &I,
                                       CodeGenCoverage &CoverageInfo) const {
  if (I.isPHI())
    return selectPHI(I);

  // Already-selected target instructions need no work beyond COPY handling.
  if (!isPreISelGenericOpcode(I.getOpcode())) {
    if (I.isCopy())
      return selectCOPY(I);
    return true;
  }

  switch (I.getOpcode()) {
  case TargetOpcode::G_ADD:
  case TargetOpcode::G_SUB:
    if (selectG_ADD_SUB(I))
      return true;
    // Manual selection failed; fall into default to try the imported patterns.
    LLVM_FALLTHROUGH;
  default:
    return selectImpl(I, CoverageInfo);
  case TargetOpcode::G_INTTOPTR:
  case TargetOpcode::G_BITCAST:
    return selectCOPY(I);
  case TargetOpcode::G_CONSTANT:
  case TargetOpcode::G_FCONSTANT:
    return selectG_CONSTANT(I);
  case TargetOpcode::G_EXTRACT:
    return selectG_EXTRACT(I);
  case TargetOpcode::G_MERGE_VALUES:
  case TargetOpcode::G_CONCAT_VECTORS:
    return selectG_MERGE_VALUES(I);
  case TargetOpcode::G_UNMERGE_VALUES:
    return selectG_UNMERGE_VALUES(I);
  case TargetOpcode::G_GEP:
    return selectG_GEP(I);
  case TargetOpcode::G_IMPLICIT_DEF:
    return selectG_IMPLICIT_DEF(I);
  case TargetOpcode::G_INSERT:
    return selectG_INSERT(I);
  case TargetOpcode::G_INTRINSIC:
    return selectG_INTRINSIC(I, CoverageInfo);
  case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
    return selectG_INTRINSIC_W_SIDE_EFFECTS(I, CoverageInfo);
  case TargetOpcode::G_ICMP:
    // Manual first; fall back to imported patterns on failure.
    if (selectG_ICMP(I))
      return true;
    return selectImpl(I, CoverageInfo);
  case TargetOpcode::G_LOAD:
    // Imported patterns first; manual FLAT selection as the fallback.
    if (selectImpl(I, CoverageInfo))
      return true;
    return selectG_LOAD(I);
  case TargetOpcode::G_SELECT:
    return selectG_SELECT(I);
  case TargetOpcode::G_STORE:
    return selectG_STORE(I);
  case TargetOpcode::G_TRUNC:
    return selectG_TRUNC(I);
  case TargetOpcode::G_SEXT:
  case TargetOpcode::G_ZEXT:
  case TargetOpcode::G_ANYEXT:
    // selectG_SZA_EXT leaves the original instruction in place on success,
    // so it is erased here.
    if (selectG_SZA_EXT(I)) {
      I.eraseFromParent();
      return true;
    }

    return false;
  case TargetOpcode::G_BRCOND:
    return selectG_BRCOND(I);
  case TargetOpcode::G_FRAME_INDEX:
    return selectG_FRAME_INDEX(I);
  case TargetOpcode::G_FENCE:
    // FIXME: Tablegen importer doesn't handle the imm operands correctly, and
    // is checking for G_CONSTANT
    I.setDesc(TII.get(AMDGPU::ATOMIC_FENCE));
    return true;
  }
  return false;
}
1287 
1288 InstructionSelector::ComplexRendererFns
1289 AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
1290   return {{
1291       [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
1292   }};
1293 
1294 }
1295 
1296 std::pair<Register, unsigned>
1297 AMDGPUInstructionSelector::selectVOP3ModsImpl(
1298   Register Src, const MachineRegisterInfo &MRI) const {
1299   unsigned Mods = 0;
1300   MachineInstr *MI = MRI.getVRegDef(Src);
1301 
1302   if (MI && MI->getOpcode() == AMDGPU::G_FNEG) {
1303     Src = MI->getOperand(1).getReg();
1304     Mods |= SISrcMods::NEG;
1305     MI = MRI.getVRegDef(Src);
1306   }
1307 
1308   if (MI && MI->getOpcode() == AMDGPU::G_FABS) {
1309     Src = MI->getOperand(1).getReg();
1310     Mods |= SISrcMods::ABS;
1311   }
1312 
1313   return std::make_pair(Src, Mods);
1314 }
1315 
1316 ///
1317 /// This will select either an SGPR or VGPR operand and will save us from
1318 /// having to write an extra tablegen pattern.
1319 InstructionSelector::ComplexRendererFns
1320 AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
1321   return {{
1322       [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
1323   }};
1324 }
1325 
1326 InstructionSelector::ComplexRendererFns
1327 AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
1328   MachineRegisterInfo &MRI
1329     = Root.getParent()->getParent()->getParent()->getRegInfo();
1330 
1331   Register Src;
1332   unsigned Mods;
1333   std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(), MRI);
1334 
1335   return {{
1336       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
1337       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
1338       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },    // clamp
1339       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }     // omod
1340   }};
1341 }
1342 InstructionSelector::ComplexRendererFns
1343 AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
1344   return {{
1345       [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
1346       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
1347       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // omod
1348   }};
1349 }
1350 
1351 InstructionSelector::ComplexRendererFns
1352 AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
1353   MachineRegisterInfo &MRI
1354     = Root.getParent()->getParent()->getParent()->getRegInfo();
1355 
1356   Register Src;
1357   unsigned Mods;
1358   std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(), MRI);
1359 
1360   return {{
1361       [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
1362       [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }  // src_mods
1363   }};
1364 }
1365 
1366 InstructionSelector::ComplexRendererFns
1367 AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
1368   MachineRegisterInfo &MRI =
1369       Root.getParent()->getParent()->getParent()->getRegInfo();
1370 
1371   SmallVector<GEPInfo, 4> AddrInfo;
1372   getAddrModeInfo(*Root.getParent(), MRI, AddrInfo);
1373 
1374   if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
1375     return None;
1376 
1377   const GEPInfo &GEPInfo = AddrInfo[0];
1378 
1379   if (!AMDGPU::isLegalSMRDImmOffset(STI, GEPInfo.Imm))
1380     return None;
1381 
1382   unsigned PtrReg = GEPInfo.SgprParts[0];
1383   int64_t EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm);
1384   return {{
1385     [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
1386     [=](MachineInstrBuilder &MIB) { MIB.addImm(EncodedImm); }
1387   }};
1388 }
1389 
1390 InstructionSelector::ComplexRendererFns
1391 AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
1392   MachineRegisterInfo &MRI =
1393       Root.getParent()->getParent()->getParent()->getRegInfo();
1394 
1395   SmallVector<GEPInfo, 4> AddrInfo;
1396   getAddrModeInfo(*Root.getParent(), MRI, AddrInfo);
1397 
1398   if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
1399     return None;
1400 
1401   const GEPInfo &GEPInfo = AddrInfo[0];
1402   unsigned PtrReg = GEPInfo.SgprParts[0];
1403   int64_t EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm);
1404   if (!isUInt<32>(EncodedImm))
1405     return None;
1406 
1407   return {{
1408     [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
1409     [=](MachineInstrBuilder &MIB) { MIB.addImm(EncodedImm); }
1410   }};
1411 }
1412 
/// Match an SMRD address whose 32-bit immediate offset did not fit the _IMM
/// encodings: materialize the offset into an SGPR with S_MOV_B32 and render
/// the (base, offset register) operands for the _SGPR form.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
  MachineInstr *MI = Root.getParent();
  MachineBasicBlock *MBB = MI->getParent();
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();

  SmallVector<GEPInfo, 4> AddrInfo;
  getAddrModeInfo(*MI, MRI, AddrInfo);

  // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
  // then we can select all ptr + 32-bit offsets not just immediate offsets.
  if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
    return None;

  const GEPInfo &GEPInfo = AddrInfo[0];
  // Only nonzero offsets that fit in 32 bits are handled.
  if (!GEPInfo.Imm || !isUInt<32>(GEPInfo.Imm))
    return None;

  // If we make it this far we have a load with an 32-bit immediate offset.
  // It is OK to select this using a sgpr offset, because we have already
  // failed trying to select this load into one of the _IMM variants since
  // the _IMM Patterns are considered before the _SGPR patterns.
  unsigned PtrReg = GEPInfo.SgprParts[0];
  unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
  BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg)
          .addImm(GEPInfo.Imm);
  return {{
    [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
    [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); }
  }};
}
1444