//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the InstructionSelector class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPUInstructionSelector.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

#define DEBUG_TYPE "amdgpu-isel"

using namespace llvm;

#define GET_GLOBALISEL_IMPL
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_IMPL
#undef AMDGPUSubtarget

AMDGPUInstructionSelector::AMDGPUInstructionSelector(
    const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
    const AMDGPUTargetMachine &TM)
    : InstructionSelector(), TII(*STI.getInstrInfo()),
      TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
      STI(STI),
      EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG),
#define GET_GLOBALISEL_PREDICATES_INIT
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_INIT
#define GET_GLOBALISEL_TEMPORARIES_INIT
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_TEMPORARIES_INIT
{
}

const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }

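// Return true if \p Reg holds an scc value: either the physical SCC register
// itself, or a virtual register assigned to the SCC register bank (which is
// modeled as 1-bit values living in 32-bit SGPRs).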
static bool isSCC(Register Reg, const MachineRegisterInfo &MRI) {
  if (TargetRegisterInfo::isPhysicalRegister(Reg))
    return Reg == AMDGPU::SCC;

  auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
  const TargetRegisterClass *RC =
      RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
  if (RC) {
    if (RC->getID() != AMDGPU::SReg_32_XM0RegClassID)
      return false;
    const LLT Ty = MRI.getType(Reg);
    return Ty.isValid() && Ty.getSizeInBits() == 1;
  }

  const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
  return RB->getID() == AMDGPU::SCCRegBankID;
}

bool AMDGPUInstructionSelector::isVCC(Register Reg,
                                      const MachineRegisterInfo &MRI) const {
  if (TargetRegisterInfo::isPhysicalRegister(Reg))
    return Reg == TRI.getVCC();

  auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
  const TargetRegisterClass *RC =
      RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
  if (RC) {
    return RC->hasSuperClassEq(TRI.getBoolRC()) &&
           MRI.getType(Reg).getSizeInBits() == 1;
  }

  const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
  return RB->getID() == AMDGPU::VCCRegBankID;
}

bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  I.setDesc(TII.get(TargetOpcode::COPY));

  // Special case for COPY from the scc register bank.  The scc register bank
  // is modeled using 32-bit sgprs.
  const MachineOperand &Src = I.getOperand(1);
  unsigned SrcReg = Src.getReg();
  if (!TargetRegisterInfo::isPhysicalRegister(SrcReg) && isSCC(SrcReg, MRI)) {
    unsigned DstReg = I.getOperand(0).getReg();

    // Specially handle scc->vcc copies.
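    // An scc value is a 0/1 integer in a 32-bit SGPR, while vcc is a per-lane
    // mask, so compare the source against zero to materialize the mask.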
    if (isVCC(DstReg, MRI)) {
      const DebugLoc &DL = I.getDebugLoc();
      BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
        .addImm(0)
        .addReg(SrcReg);
      if (!MRI.getRegClassOrNull(SrcReg))
        MRI.setRegClass(SrcReg, TRI.getConstrainedRegClassForOperand(Src, MRI));
      I.eraseFromParent();
      return true;
    }
  }

  for (const MachineOperand &MO : I.operands()) {
    if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
      continue;

    const TargetRegisterClass *RC =
            TRI.getConstrainedRegClassForOperand(MO, MRI);
    if (!RC)
      continue;
    RBI.constrainGenericRegister(MO.getReg(), *RC, MRI);
  }
  return true;
}

bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  const Register DefReg = I.getOperand(0).getReg();
  const LLT DefTy = MRI.getType(DefReg);

  // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)

  const RegClassOrRegBank &RegClassOrBank =
    MRI.getRegClassOrRegBank(DefReg);

  const TargetRegisterClass *DefRC
    = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
  if (!DefRC) {
    if (!DefTy.isValid()) {
      LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
      return false;
    }

    const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
    if (RB.getID() == AMDGPU::SCCRegBankID) {
      LLVM_DEBUG(dbgs() << "illegal scc phi\n");
      return false;
    }

    DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, MRI);
    if (!DefRC) {
      LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
      return false;
    }
  }

  I.setDesc(TII.get(TargetOpcode::PHI));
  return RBI.constrainGenericRegister(DefReg, *DefRC, MRI);
}

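// Return a machine operand covering the 32-bit half \p SubIdx of the 64-bit
// operand \p MO: for register operands a COPY from the corresponding
// subregister is emitted, for immediates the matching half of the value is
// returned directly.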
MachineOperand
AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
                                           const TargetRegisterClass &SubRC,
                                           unsigned SubIdx) const {

  MachineInstr *MI = MO.getParent();
  MachineBasicBlock *BB = MO.getParent()->getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  Register DstReg = MRI.createVirtualRegister(&SubRC);

  if (MO.isReg()) {
    unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
    unsigned Reg = MO.getReg();
    BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
            .addReg(Reg, 0, ComposedSubIdx);

    return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
                                     MO.isKill(), MO.isDead(), MO.isUndef(),
                                     MO.isEarlyClobber(), 0, MO.isDebug(),
                                     MO.isInternalRead());
  }

  assert(MO.isImm());

  APInt Imm(64, MO.getImm());

  switch (SubIdx) {
  default:
    llvm_unreachable("do not know how to split immediate with this sub index.");
  case AMDGPU::sub0:
    return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
  case AMDGPU::sub1:
    return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
  }
}

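// Read the immediate value out of a G_CONSTANT definition.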
static int64_t getConstant(const MachineInstr *MI) {
  return MI->getOperand(1).getCImm()->getSExtValue();
}

bool AMDGPUInstructionSelector::selectG_ADD(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  Register DstReg = I.getOperand(0).getReg();
  const DebugLoc &DL = I.getDebugLoc();
  unsigned Size = RBI.getSizeInBits(DstReg, MRI, TRI);
  const RegisterBank *DstRB = RBI.getRegBank(DstReg, MRI, TRI);
  const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;

  if (Size == 32) {
    if (IsSALU) {
      MachineInstr *Add =
        BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstReg)
        .add(I.getOperand(1))
        .add(I.getOperand(2));
      I.eraseFromParent();
      return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
    }

    if (STI.hasAddNoCarry()) {
      I.setDesc(TII.get(AMDGPU::V_ADD_U32_e64));
      I.addOperand(*MF, MachineOperand::CreateImm(0));
      I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
      return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
    }

    Register UnusedCarry = MRI.createVirtualRegister(TRI.getWaveMaskRegClass());
    MachineInstr *Add
      = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_I32_e64), DstReg)
      .addDef(UnusedCarry, RegState::Dead)
      .add(I.getOperand(1))
      .add(I.getOperand(2))
      .addImm(0);
    I.eraseFromParent();
    return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
  }

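  // 64-bit add: split both operands into 32-bit halves, add the low halves
  // producing a carry, add the high halves consuming it, and recombine the
  // results with a REG_SEQUENCE.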
  const TargetRegisterClass &RC
    = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
  const TargetRegisterClass &HalfRC
    = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;

  MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
  MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
  MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
  MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));

  Register DstLo = MRI.createVirtualRegister(&HalfRC);
  Register DstHi = MRI.createVirtualRegister(&HalfRC);

  if (IsSALU) {
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
      .add(Lo1)
      .add(Lo2);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
      .add(Hi1)
      .add(Hi2);
  } else {
    const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
    Register CarryReg = MRI.createVirtualRegister(CarryRC);
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_I32_e64), DstLo)
      .addDef(CarryReg)
      .add(Lo1)
      .add(Lo2)
      .addImm(0);
    MachineInstr *Addc =
      BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
      .addDef(MRI.createVirtualRegister(CarryRC), RegState::Dead)
      .add(Hi1)
      .add(Hi2)
      .addReg(CarryReg, RegState::Kill)
      .addImm(0);

    if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
      return false;
  }

  BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
    .addReg(DstLo)
    .addImm(AMDGPU::sub0)
    .addReg(DstHi)
    .addImm(AMDGPU::sub1);

  if (!RBI.constrainGenericRegister(DstReg, RC, MRI))
    return false;

  I.eraseFromParent();
  return true;
}

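// G_EXTRACT of a 32-bit aligned chunk is selected as a plain subregister
// copy; the bit offset (operand 2) must be a multiple of 32.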
bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  assert(I.getOperand(2).getImm() % 32 == 0);
  unsigned SubReg = TRI.getSubRegFromChannel(I.getOperand(2).getImm() / 32);
  const DebugLoc &DL = I.getDebugLoc();
  MachineInstr *Copy = BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY),
                               I.getOperand(0).getReg())
                               .addReg(I.getOperand(1).getReg(), 0, SubReg);

  for (const MachineOperand &MO : Copy->operands()) {
    const TargetRegisterClass *RC =
            TRI.getConstrainedRegClassForOperand(MO, MRI);
    if (!RC)
      continue;
    RBI.constrainGenericRegister(MO.getReg(), *RC, MRI);
  }
  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_GEP(MachineInstr &I) const {
  return selectG_ADD(I);
}

bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  const MachineOperand &MO = I.getOperand(0);

  // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
  // regbank check here is to know why getConstrainedRegClassForOperand failed.
  const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, MRI);
  if ((!RC && !MRI.getRegBankOrNull(MO.getReg())) ||
      (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, MRI))) {
    I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
    return true;
  }

  return false;
}

bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  unsigned SubReg = TRI.getSubRegFromChannel(I.getOperand(3).getImm() / 32);
  DebugLoc DL = I.getDebugLoc();
  MachineInstr *Ins = BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG))
                               .addDef(I.getOperand(0).getReg())
                               .addReg(I.getOperand(1).getReg())
                               .addReg(I.getOperand(2).getReg())
                               .addImm(SubReg);

  for (const MachineOperand &MO : Ins->operands()) {
    if (!MO.isReg())
      continue;
    if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
      continue;

    const TargetRegisterClass *RC =
            TRI.getConstrainedRegClassForOperand(MO, MRI);
    if (!RC)
      continue;
    RBI.constrainGenericRegister(MO.getReg(), *RC, MRI);
  }
  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_INTRINSIC(
  MachineInstr &I, CodeGenCoverage &CoverageInfo) const {
  unsigned IntrinsicID = I.getOperand(I.getNumExplicitDefs()).getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::maxnum:
  case Intrinsic::minnum:
  case Intrinsic::amdgcn_cvt_pkrtz:
    return selectImpl(I, CoverageInfo);
  default:
    return selectImpl(I, CoverageInfo);
  }
}

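// Map an integer predicate onto the VALU compare (VOPC _e64 form) that writes
// a per-lane condition mask, or return -1 if there is no 32/64-bit encoding.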
static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size) {
  if (Size != 32 && Size != 64)
    return -1;
  switch (P) {
  default:
    llvm_unreachable("Unknown condition code!");
  case CmpInst::ICMP_NE:
    return Size == 32 ? AMDGPU::V_CMP_NE_U32_e64 : AMDGPU::V_CMP_NE_U64_e64;
  case CmpInst::ICMP_EQ:
    return Size == 32 ? AMDGPU::V_CMP_EQ_U32_e64 : AMDGPU::V_CMP_EQ_U64_e64;
  case CmpInst::ICMP_SGT:
    return Size == 32 ? AMDGPU::V_CMP_GT_I32_e64 : AMDGPU::V_CMP_GT_I64_e64;
  case CmpInst::ICMP_SGE:
    return Size == 32 ? AMDGPU::V_CMP_GE_I32_e64 : AMDGPU::V_CMP_GE_I64_e64;
  case CmpInst::ICMP_SLT:
    return Size == 32 ? AMDGPU::V_CMP_LT_I32_e64 : AMDGPU::V_CMP_LT_I64_e64;
  case CmpInst::ICMP_SLE:
    return Size == 32 ? AMDGPU::V_CMP_LE_I32_e64 : AMDGPU::V_CMP_LE_I64_e64;
  case CmpInst::ICMP_UGT:
    return Size == 32 ? AMDGPU::V_CMP_GT_U32_e64 : AMDGPU::V_CMP_GT_U64_e64;
  case CmpInst::ICMP_UGE:
    return Size == 32 ? AMDGPU::V_CMP_GE_U32_e64 : AMDGPU::V_CMP_GE_U64_e64;
  case CmpInst::ICMP_ULT:
    return Size == 32 ? AMDGPU::V_CMP_LT_U32_e64 : AMDGPU::V_CMP_LT_U64_e64;
  case CmpInst::ICMP_ULE:
    return Size == 32 ? AMDGPU::V_CMP_LE_U32_e64 : AMDGPU::V_CMP_LE_U64_e64;
  }
}

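// Map an integer predicate onto the scalar compare that writes SCC. 64-bit
// operands are only handled for EQ/NE, and only when the subtarget supports
// 64-bit scalar compares; all other predicates require a 32-bit operand.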
int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
                                              unsigned Size) const {
  if (Size == 64) {
    if (!STI.hasScalarCompareEq64())
      return -1;

    switch (P) {
    case CmpInst::ICMP_NE:
      return AMDGPU::S_CMP_LG_U64;
    case CmpInst::ICMP_EQ:
      return AMDGPU::S_CMP_EQ_U64;
    default:
      return -1;
    }
  }

  if (Size != 32)
    return -1;

  switch (P) {
  case CmpInst::ICMP_NE:
    return AMDGPU::S_CMP_LG_U32;
  case CmpInst::ICMP_EQ:
    return AMDGPU::S_CMP_EQ_U32;
  case CmpInst::ICMP_SGT:
    return AMDGPU::S_CMP_GT_I32;
  case CmpInst::ICMP_SGE:
    return AMDGPU::S_CMP_GE_I32;
  case CmpInst::ICMP_SLT:
    return AMDGPU::S_CMP_LT_I32;
  case CmpInst::ICMP_SLE:
    return AMDGPU::S_CMP_LE_I32;
  case CmpInst::ICMP_UGT:
    return AMDGPU::S_CMP_GT_U32;
  case CmpInst::ICMP_UGE:
    return AMDGPU::S_CMP_GE_U32;
  case CmpInst::ICMP_ULT:
    return AMDGPU::S_CMP_LT_U32;
  case CmpInst::ICMP_ULE:
    return AMDGPU::S_CMP_LE_U32;
  default:
    llvm_unreachable("Unknown condition code!");
  }
}

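// Compares whose result lives in scc are selected to scalar S_CMP_* followed
// by a copy out of SCC; compares producing a lane mask use the VALU
// V_CMP_*_e64 forms instead.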
bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  DebugLoc DL = I.getDebugLoc();

  unsigned SrcReg = I.getOperand(2).getReg();
  unsigned Size = RBI.getSizeInBits(SrcReg, MRI, TRI);

  auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();

  unsigned CCReg = I.getOperand(0).getReg();
  if (isSCC(CCReg, MRI)) {
    int Opcode = getS_CMPOpcode(Pred, Size);
    if (Opcode == -1)
      return false;
    MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
            .add(I.getOperand(2))
            .add(I.getOperand(3));
    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
      .addReg(AMDGPU::SCC);
    bool Ret =
        constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
        RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, MRI);
    I.eraseFromParent();
    return Ret;
  }

  int Opcode = getV_CMPOpcode(Pred, Size);
  if (Opcode == -1)
    return false;

  MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode),
            I.getOperand(0).getReg())
            .add(I.getOperand(2))
            .add(I.getOperand(3));
  RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
                               AMDGPU::SReg_64RegClass, MRI);
  bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
  I.eraseFromParent();
  return Ret;
}

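// Emit an EXP (or EXP_DONE when \p Done is set) exporting the four given
// registers to export target \p Tgt with the given enable/compression/VM
// flags.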
static MachineInstr *
buildEXP(const TargetInstrInfo &TII, MachineInstr *Insert, unsigned Tgt,
         unsigned Reg0, unsigned Reg1, unsigned Reg2, unsigned Reg3,
         unsigned VM, bool Compr, unsigned Enabled, bool Done) {
  const DebugLoc &DL = Insert->getDebugLoc();
  MachineBasicBlock &BB = *Insert->getParent();
  unsigned Opcode = Done ? AMDGPU::EXP_DONE : AMDGPU::EXP;
  return BuildMI(BB, Insert, DL, TII.get(Opcode))
          .addImm(Tgt)
          .addReg(Reg0)
          .addReg(Reg1)
          .addReg(Reg2)
          .addReg(Reg3)
          .addImm(VM)
          .addImm(Compr)
          .addImm(Enabled);
}

bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
  MachineInstr &I, CodeGenCoverage &CoverageInfo) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  unsigned IntrinsicID = I.getOperand(0).getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_exp: {
    int64_t Tgt = getConstant(MRI.getVRegDef(I.getOperand(1).getReg()));
    int64_t Enabled = getConstant(MRI.getVRegDef(I.getOperand(2).getReg()));
    int64_t Done = getConstant(MRI.getVRegDef(I.getOperand(7).getReg()));
    int64_t VM = getConstant(MRI.getVRegDef(I.getOperand(8).getReg()));

    MachineInstr *Exp = buildEXP(TII, &I, Tgt, I.getOperand(3).getReg(),
                                 I.getOperand(4).getReg(),
                                 I.getOperand(5).getReg(),
                                 I.getOperand(6).getReg(),
                                 VM, false, Enabled, Done);

    I.eraseFromParent();
    return constrainSelectedInstRegOperands(*Exp, TII, TRI, RBI);
  }
  case Intrinsic::amdgcn_exp_compr: {
    const DebugLoc &DL = I.getDebugLoc();
    int64_t Tgt = getConstant(MRI.getVRegDef(I.getOperand(1).getReg()));
    int64_t Enabled = getConstant(MRI.getVRegDef(I.getOperand(2).getReg()));
    unsigned Reg0 = I.getOperand(3).getReg();
    unsigned Reg1 = I.getOperand(4).getReg();
    unsigned Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    int64_t Done = getConstant(MRI.getVRegDef(I.getOperand(5).getReg()));
    int64_t VM = getConstant(MRI.getVRegDef(I.getOperand(6).getReg()));

    BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef);
    MachineInstr *Exp = buildEXP(TII, &I, Tgt, Reg0, Reg1, Undef, Undef, VM,
                                 true, Enabled, Done);

    I.eraseFromParent();
    return constrainSelectedInstRegOperands(*Exp, TII, TRI, RBI);
  }
  default:
    return selectImpl(I, CoverageInfo);
  }
}

bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  const DebugLoc &DL = I.getDebugLoc();

  unsigned DstReg = I.getOperand(0).getReg();
  unsigned Size = RBI.getSizeInBits(DstReg, MRI, TRI);
  assert(Size <= 32 || Size == 64);
  const MachineOperand &CCOp = I.getOperand(1);
  unsigned CCReg = CCOp.getReg();
  if (isSCC(CCReg, MRI)) {
    unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
                                         AMDGPU::S_CSELECT_B32;
    MachineInstr *CopySCC =
      BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
            .addReg(CCReg);

    // The generic constrainSelectedInstRegOperands doesn't work for the scc
    // register bank, because it does not cover the register class we use to
    // represent that bank, so manually set the register class here.
    if (!MRI.getRegClassOrNull(CCReg))
      MRI.setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, MRI));
    MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
            .add(I.getOperand(2))
            .add(I.getOperand(3));

    // Both instructions must be constrained; use the non-short-circuiting &
    // so each one is processed even if the other fails.
    bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI) &
               constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
    I.eraseFromParent();
    return Ret;
  }

  // Wide VGPR select should have been split in RegBankSelect.
  if (Size > 32)
    return false;

  MachineInstr *Select =
      BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
              .addImm(0)
              .add(I.getOperand(3))
              .addImm(0)
              .add(I.getOperand(2))
              .add(I.getOperand(1));

  bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
  I.eraseFromParent();
  return Ret;
}

bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  DebugLoc DL = I.getDebugLoc();
  unsigned PtrSize = RBI.getSizeInBits(I.getOperand(1).getReg(), MRI, TRI);
  if (PtrSize != 64) {
    LLVM_DEBUG(dbgs() << "Unhandled address space\n");
    return false;
  }

  unsigned StoreSize = RBI.getSizeInBits(I.getOperand(0).getReg(), MRI, TRI);
  unsigned Opcode;

  // FIXME: Select store instruction based on address space
  switch (StoreSize) {
  default:
    return false;
  case 32:
    Opcode = AMDGPU::FLAT_STORE_DWORD;
    break;
  case 64:
    Opcode = AMDGPU::FLAT_STORE_DWORDX2;
    break;
  case 96:
    Opcode = AMDGPU::FLAT_STORE_DWORDX3;
    break;
  case 128:
    Opcode = AMDGPU::FLAT_STORE_DWORDX4;
    break;
  }

  MachineInstr *Flat = BuildMI(*BB, &I, DL, TII.get(Opcode))
          .add(I.getOperand(1))
          .add(I.getOperand(0))
          .addImm(0)  // offset
          .addImm(0)  // glc
          .addImm(0)  // slc
          .addImm(0); // dlc

  // Now that we selected an opcode, we need to constrain the register
  // operands to use appropriate classes.
  bool Ret = constrainSelectedInstRegOperands(*Flat, TII, TRI, RBI);

  I.eraseFromParent();
  return Ret;
}

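// Return the subregister index that covers the low \p Size bits of a wider
// register, or -1 if the size is not supported.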
static int sizeToSubRegIndex(unsigned Size) {
  switch (Size) {
  case 32:
    return AMDGPU::sub0;
  case 64:
    return AMDGPU::sub0_sub1;
  case 96:
    return AMDGPU::sub0_sub1_sub2;
  case 128:
    return AMDGPU::sub0_sub1_sub2_sub3;
  case 256:
    return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
  default:
    if (Size < 32)
      return AMDGPU::sub0;
    if (Size > 256)
      return -1;
    return sizeToSubRegIndex(PowerOf2Ceil(Size));
  }
}

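// G_TRUNC is selected as a COPY; when the source is wider than 32 bits the
// low part is taken through the subregister index matching the result size.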
bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  unsigned DstReg = I.getOperand(0).getReg();
  unsigned SrcReg = I.getOperand(1).getReg();
  const LLT DstTy = MRI.getType(DstReg);
  const LLT SrcTy = MRI.getType(SrcReg);
  if (!DstTy.isScalar())
    return false;

  const RegisterBank *DstRB = RBI.getRegBank(DstReg, MRI, TRI);
  const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, MRI, TRI);
  if (SrcRB != DstRB)
    return false;

  unsigned DstSize = DstTy.getSizeInBits();
  unsigned SrcSize = SrcTy.getSizeInBits();

  const TargetRegisterClass *SrcRC
    = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, MRI);
  const TargetRegisterClass *DstRC
    = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, MRI);

  if (SrcSize > 32) {
    int SubRegIdx = sizeToSubRegIndex(DstSize);
    if (SubRegIdx == -1)
      return false;

    // Deal with weird cases where the class only partially supports the subreg
    // index.
    SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
    if (!SrcRC)
      return false;

    I.getOperand(1).setSubReg(SubRegIdx);
  }

  if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
      !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
    LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
    return false;
  }

  I.setDesc(TII.get(TargetOpcode::COPY));
  return true;
}

/// \returns true if a bitmask for \p Size bits will be an inline immediate.
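/// AMDGPU inline constants cover the integers -16..64, so this holds only for
/// masks of at most 6 bits (e.g. 0x3f for Size == 6); a 16-bit mask (0xffff)
/// needs a literal instead.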
static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
  Mask = maskTrailingOnes<unsigned>(Size);
  int SignedMask = static_cast<int>(Mask);
  return SignedMask >= -16 && SignedMask <= 64;
}

bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
  bool Signed = I.getOpcode() == AMDGPU::G_SEXT;
  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock &MBB = *I.getParent();
  MachineFunction &MF = *MBB.getParent();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const unsigned DstReg = I.getOperand(0).getReg();
  const unsigned SrcReg = I.getOperand(1).getReg();

  const LLT DstTy = MRI.getType(DstReg);
  const LLT SrcTy = MRI.getType(SrcReg);
  const LLT S1 = LLT::scalar(1);
  const unsigned SrcSize = SrcTy.getSizeInBits();
  const unsigned DstSize = DstTy.getSizeInBits();
  if (!DstTy.isScalar())
    return false;

  const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, TRI);

  if (SrcBank->getID() == AMDGPU::SCCRegBankID) {
    if (SrcTy != S1 || DstSize > 64) // Invalid
      return false;

    unsigned Opcode =
        DstSize > 32 ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
    const TargetRegisterClass *DstRC =
        DstSize > 32 ? &AMDGPU::SReg_64RegClass : &AMDGPU::SReg_32RegClass;

    // FIXME: Create an extra copy to avoid incorrectly constraining the result
    // of the scc producer.
    unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(MBB, I, DL, TII.get(AMDGPU::COPY), TmpReg)
      .addReg(SrcReg);
    BuildMI(MBB, I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
      .addReg(TmpReg);

    // The instruction operands are backwards from what you would expect.
    BuildMI(MBB, I, DL, TII.get(Opcode), DstReg)
      .addImm(0)
      .addImm(Signed ? -1 : 1);
    return RBI.constrainGenericRegister(DstReg, *DstRC, MRI);
  }

  if (SrcBank->getID() == AMDGPU::VCCRegBankID && DstSize <= 32) {
    if (SrcTy != S1) // Invalid
      return false;

    MachineInstr *ExtI =
      BuildMI(MBB, I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
      .addImm(0)               // src0_modifiers
      .addImm(0)               // src0
      .addImm(0)               // src1_modifiers
      .addImm(Signed ? -1 : 1) // src1
      .addUse(SrcReg);
    return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
  }

  if (I.getOpcode() == AMDGPU::G_ANYEXT)
    return selectCOPY(I);

  if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
    // 64-bit should have been split up in RegBankSelect

    // Try to use an and with a mask if it will save code size.
    unsigned Mask;
    if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
      MachineInstr *ExtI =
      BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
        .addImm(Mask)
        .addReg(SrcReg);
      return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
    }

    const unsigned BFE = Signed ? AMDGPU::V_BFE_I32 : AMDGPU::V_BFE_U32;
    MachineInstr *ExtI =
      BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
      .addReg(SrcReg)
      .addImm(0) // Offset
      .addImm(SrcSize); // Width
    return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
  }

  if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
    if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI))
      return false;

    if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
      const unsigned SextOpc = SrcSize == 8 ?
        AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
      BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
        .addReg(SrcReg);
      return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, MRI);
    }

    const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
    const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;

    // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16] = width.
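    // For a zero-offset field of SrcSize bits the immediate is SrcSize << 16,
    // e.g. a 16-bit source gives 0x100000.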
    if (DstSize > 32 && SrcSize <= 32) {
      // We need a 64-bit register source, but the high bits don't matter.
      unsigned ExtReg
        = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
      unsigned UndefReg
        = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
      BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
        .addReg(SrcReg)
        .addImm(AMDGPU::sub0)
        .addReg(UndefReg)
        .addImm(AMDGPU::sub1);

      BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
        .addReg(ExtReg)
        .addImm(SrcSize << 16);

      return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, MRI);
    }

    unsigned Mask;
    if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
      BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
        .addReg(SrcReg)
        .addImm(Mask);
    } else {
      BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
        .addReg(SrcReg)
        .addImm(SrcSize << 16);
    }

    return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, MRI);
  }

  return false;
}

bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineOperand &ImmOp = I.getOperand(1);

  // The AMDGPU backend only supports Imm operands and not CImm or FPImm.
  if (ImmOp.isFPImm()) {
    const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt();
    ImmOp.ChangeToImmediate(Imm.getZExtValue());
  } else if (ImmOp.isCImm()) {
    ImmOp.ChangeToImmediate(ImmOp.getCImm()->getZExtValue());
  }

  unsigned DstReg = I.getOperand(0).getReg();
  unsigned Size;
  bool IsSgpr;
  const RegisterBank *RB = MRI.getRegBankOrNull(I.getOperand(0).getReg());
  if (RB) {
    IsSgpr = RB->getID() == AMDGPU::SGPRRegBankID;
    Size = MRI.getType(DstReg).getSizeInBits();
  } else {
    const TargetRegisterClass *RC = TRI.getRegClassForReg(MRI, DstReg);
    IsSgpr = TRI.isSGPRClass(RC);
    Size = TRI.getRegSizeInBits(*RC);
  }

  if (Size != 32 && Size != 64)
    return false;

  unsigned Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
  if (Size == 32) {
    I.setDesc(TII.get(Opcode));
    I.addImplicitDefUseOperands(*MF);
    return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
  }

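  // 64-bit immediates are materialized as two 32-bit moves combined with a
  // REG_SEQUENCE.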
  DebugLoc DL = I.getDebugLoc();
  const TargetRegisterClass *RC = IsSgpr ? &AMDGPU::SReg_32_XM0RegClass :
                                           &AMDGPU::VGPR_32RegClass;
  unsigned LoReg = MRI.createVirtualRegister(RC);
  unsigned HiReg = MRI.createVirtualRegister(RC);
  const APInt &Imm = APInt(Size, I.getOperand(1).getImm());

  BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg)
          .addImm(Imm.trunc(32).getZExtValue());

  BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg)
          .addImm(Imm.ashr(32).getZExtValue());

  const MachineInstr *RS =
      BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
              .addReg(LoReg)
              .addImm(AMDGPU::sub0)
              .addReg(HiReg)
              .addImm(AMDGPU::sub1);

  // We can't call constrainSelectedInstRegOperands here, because it doesn't
  // work for target-independent opcodes.
  I.eraseFromParent();
  const TargetRegisterClass *DstRC =
      TRI.getConstrainedRegClassForOperand(RS->getOperand(0), MRI);
  if (!DstRC)
    return true;
  return RBI.constrainGenericRegister(DstReg, *DstRC, MRI);
}

static bool isConstant(const MachineInstr &MI) {
  return MI.getOpcode() == TargetOpcode::G_CONSTANT;
}

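// Walk the chain of G_GEPs feeding \p Load's address operand and record, for
// each one, the accumulated constant offset and which address pieces live in
// SGPRs vs. VGPRs.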
void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
    const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {

  const MachineInstr *PtrMI = MRI.getUniqueVRegDef(Load.getOperand(1).getReg());

  assert(PtrMI);

  if (PtrMI->getOpcode() != TargetOpcode::G_GEP)
    return;

  GEPInfo GEPInfo(*PtrMI);

  for (unsigned i = 1, e = 3; i < e; ++i) {
    const MachineOperand &GEPOp = PtrMI->getOperand(i);
    const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
    assert(OpDef);
    if (isConstant(*OpDef)) {
      // FIXME: Is it possible to have multiple Imm parts?  Maybe if we
      // are lacking other optimizations.
      assert(GEPInfo.Imm == 0);
      GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
      continue;
    }
    const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
    if (OpBank->getID() == AMDGPU::SGPRRegBankID)
      GEPInfo.SgprParts.push_back(GEPOp.getReg());
    else
      GEPInfo.VgprParts.push_back(GEPOp.getReg());
  }

  AddrInfo.push_back(GEPInfo);
  getAddrModeInfo(*PtrMI, MRI, AddrInfo);
}

bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
  if (!MI.hasOneMemOperand())
    return false;

  const MachineMemOperand *MMO = *MI.memoperands_begin();
  const Value *Ptr = MMO->getValue();

  // UndefValue means this is a load of a kernel input.  These are uniform.
  // Sometimes LDS instructions have constant pointers.
  // If Ptr is null, then that means this mem operand contains a
  // PseudoSourceValue like GOT.
  if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
      isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
    return true;

  if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return true;

  const Instruction *I = dyn_cast<Instruction>(Ptr);
  return I && I->getMetadata("amdgpu.uniform");
}

bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
  for (const GEPInfo &GEPInfo : AddrInfo) {
    if (!GEPInfo.VgprParts.empty())
      return true;
  }
  return false;
}

bool AMDGPUInstructionSelector::selectG_LOAD(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  const DebugLoc &DL = I.getDebugLoc();
  Register DstReg = I.getOperand(0).getReg();
  Register PtrReg = I.getOperand(1).getReg();
  unsigned LoadSize = RBI.getSizeInBits(DstReg, MRI, TRI);
  unsigned Opcode;

  if (MRI.getType(I.getOperand(1).getReg()).getSizeInBits() == 32) {
    LLVM_DEBUG(dbgs() << "Unhandled address space\n");
    return false;
  }

  SmallVector<GEPInfo, 4> AddrInfo;

  getAddrModeInfo(I, MRI, AddrInfo);

  switch (LoadSize) {
  case 32:
    Opcode = AMDGPU::FLAT_LOAD_DWORD;
    break;
  case 64:
    Opcode = AMDGPU::FLAT_LOAD_DWORDX2;
    break;
  default:
    LLVM_DEBUG(dbgs() << "Unhandled load size\n");
    return false;
  }

  MachineInstr *Flat = BuildMI(*BB, &I, DL, TII.get(Opcode))
                               .add(I.getOperand(0))
                               .addReg(PtrReg)
                               .addImm(0)  // offset
                               .addImm(0)  // glc
                               .addImm(0)  // slc
                               .addImm(0); // dlc

  bool Ret = constrainSelectedInstRegOperands(*Flat, TII, TRI, RBI);
  I.eraseFromParent();
  return Ret;
}

bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  MachineOperand &CondOp = I.getOperand(0);
  Register CondReg = CondOp.getReg();
  const DebugLoc &DL = I.getDebugLoc();

  unsigned BrOpcode;
  Register CondPhysReg;
  const TargetRegisterClass *ConstrainRC;

  // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
  // whether the branch is uniform when selecting the instruction. In
  // GlobalISel, that decision should eventually be pushed into RegBankSelect.
  // For now, assume RegBankSelect made the right call if the branch condition
  // is in scc, even though it does not yet reason about this.
  if (isSCC(CondReg, MRI)) {
    CondPhysReg = AMDGPU::SCC;
    BrOpcode = AMDGPU::S_CBRANCH_SCC1;
    ConstrainRC = &AMDGPU::SReg_32_XM0RegClass;
  } else if (isVCC(CondReg, MRI)) {
    // FIXME: Do we have to insert an and with exec here, like in SelectionDAG?
    // Based on the register bank, we sort of know that a VCC producer ands the
    // inactive lanes with 0. What if there was a logical operation with vcc
    // producers in different blocks/with different exec masks?
    // FIXME: Should scc->vcc copies and with exec?
    CondPhysReg = TRI.getVCC();
    BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
    ConstrainRC = TRI.getBoolRC();
  } else
    return false;

  if (!MRI.getRegClassOrNull(CondReg))
    MRI.setRegClass(CondReg, ConstrainRC);

  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
    .addReg(CondReg);
  BuildMI(*BB, &I, DL, TII.get(BrOpcode))
    .addMBB(I.getOperand(1).getMBB());

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_FRAME_INDEX(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineFunction *MF = BB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  Register DstReg = I.getOperand(0).getReg();
  const RegisterBank *DstRB = RBI.getRegBank(DstReg, MRI, TRI);
  const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
  I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
  if (IsVGPR)
    I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));

  return RBI.constrainGenericRegister(
    DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, MRI);
}

bool AMDGPUInstructionSelector::select(MachineInstr &I,
                                       CodeGenCoverage &CoverageInfo) const {
  if (I.isPHI())
    return selectPHI(I);

  if (!isPreISelGenericOpcode(I.getOpcode())) {
    if (I.isCopy())
      return selectCOPY(I);
    return true;
  }

  switch (I.getOpcode()) {
  case TargetOpcode::G_ADD:
    if (selectG_ADD(I))
      return true;
    LLVM_FALLTHROUGH;
  default:
    return selectImpl(I, CoverageInfo);
  case TargetOpcode::G_INTTOPTR:
  case TargetOpcode::G_BITCAST:
    return selectCOPY(I);
  case TargetOpcode::G_CONSTANT:
  case TargetOpcode::G_FCONSTANT:
    return selectG_CONSTANT(I);
  case TargetOpcode::G_EXTRACT:
    return selectG_EXTRACT(I);
  case TargetOpcode::G_GEP:
    return selectG_GEP(I);
  case TargetOpcode::G_IMPLICIT_DEF:
    return selectG_IMPLICIT_DEF(I);
  case TargetOpcode::G_INSERT:
    return selectG_INSERT(I);
  case TargetOpcode::G_INTRINSIC:
    return selectG_INTRINSIC(I, CoverageInfo);
  case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
    return selectG_INTRINSIC_W_SIDE_EFFECTS(I, CoverageInfo);
  case TargetOpcode::G_ICMP:
    if (selectG_ICMP(I))
      return true;
    return selectImpl(I, CoverageInfo);
  case TargetOpcode::G_LOAD:
    if (selectImpl(I, CoverageInfo))
      return true;
    return selectG_LOAD(I);
  case TargetOpcode::G_SELECT:
    return selectG_SELECT(I);
  case TargetOpcode::G_STORE:
    return selectG_STORE(I);
  case TargetOpcode::G_TRUNC:
    return selectG_TRUNC(I);
  case TargetOpcode::G_SEXT:
  case TargetOpcode::G_ZEXT:
  case TargetOpcode::G_ANYEXT:
    if (selectG_SZA_EXT(I)) {
      I.eraseFromParent();
      return true;
    }

    return false;
  case TargetOpcode::G_BRCOND:
    return selectG_BRCOND(I);
  case TargetOpcode::G_FRAME_INDEX:
    return selectG_FRAME_INDEX(I);
  case TargetOpcode::G_FENCE:
    // FIXME: Tablegen importer doesn't handle the imm operands correctly, and
    // is checking for G_CONSTANT
    I.setDesc(TII.get(AMDGPU::ATOMIC_FENCE));
    return true;
  }
  return false;
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
  }};
}

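// Look through G_FNEG / G_FABS feeding \p Src and fold them into VOP3 source
// modifier bits, returning the underlying source register together with the
// accumulated modifiers.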
std::pair<Register, unsigned>
AMDGPUInstructionSelector::selectVOP3ModsImpl(
  Register Src, const MachineRegisterInfo &MRI) const {
  unsigned Mods = 0;
  MachineInstr *MI = MRI.getVRegDef(Src);

  if (MI && MI->getOpcode() == AMDGPU::G_FNEG) {
    Src = MI->getOperand(1).getReg();
    Mods |= SISrcMods::NEG;
    MI = MRI.getVRegDef(Src);
  }

  if (MI && MI->getOpcode() == AMDGPU::G_FABS) {
    Src = MI->getOperand(1).getReg();
    Mods |= SISrcMods::ABS;
  }

  return std::make_pair(Src, Mods);
}

/// This will select either an SGPR or VGPR operand and will save us from
/// having to write an extra tablegen pattern.
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
  MachineRegisterInfo &MRI
    = Root.getParent()->getParent()->getParent()->getRegInfo();

  Register Src;
  unsigned Mods;
  std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(), MRI);

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },    // clamp
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }     // omod
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }  // omod
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
  MachineRegisterInfo &MRI
    = Root.getParent()->getParent()->getParent()->getRegInfo();

  Register Src;
  unsigned Mods;
  std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(), MRI);

  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }  // src_mods
  }};
}

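// Match an SMRD address of the form (SGPR base + constant offset) where the
// offset is legal as the instruction's encoded immediate field.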
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
  MachineRegisterInfo &MRI =
      Root.getParent()->getParent()->getParent()->getRegInfo();

  SmallVector<GEPInfo, 4> AddrInfo;
  getAddrModeInfo(*Root.getParent(), MRI, AddrInfo);

  if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
    return None;

  const GEPInfo &GEPInfo = AddrInfo[0];

  if (!AMDGPU::isLegalSMRDImmOffset(STI, GEPInfo.Imm))
    return None;

  unsigned PtrReg = GEPInfo.SgprParts[0];
  int64_t EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm);
  return {{
    [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
    [=](MachineInstrBuilder &MIB) { MIB.addImm(EncodedImm); }
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
  MachineRegisterInfo &MRI =
      Root.getParent()->getParent()->getParent()->getRegInfo();

  SmallVector<GEPInfo, 4> AddrInfo;
  getAddrModeInfo(*Root.getParent(), MRI, AddrInfo);

  if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
    return None;

  const GEPInfo &GEPInfo = AddrInfo[0];
  unsigned PtrReg = GEPInfo.SgprParts[0];
  int64_t EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm);
  if (!isUInt<32>(EncodedImm))
    return None;

  return {{
    [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
    [=](MachineInstrBuilder &MIB) { MIB.addImm(EncodedImm); }
  }};
}

InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
  MachineInstr *MI = Root.getParent();
  MachineBasicBlock *MBB = MI->getParent();
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();

  SmallVector<GEPInfo, 4> AddrInfo;
  getAddrModeInfo(*MI, MRI, AddrInfo);

  // FIXME: We should shrink the GEP if the offset is known to fit in 32 bits,
  // then we can select all ptr + 32-bit offsets not just immediate offsets.
  if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
    return None;

  const GEPInfo &GEPInfo = AddrInfo[0];
  if (!GEPInfo.Imm || !isUInt<32>(GEPInfo.Imm))
    return None;

  // If we make it this far we have a load with a 32-bit immediate offset.
  // It is OK to select this using an sgpr offset, because we have already
  // failed trying to select this load into one of the _IMM variants since
  // the _IMM Patterns are considered before the _SGPR patterns.
  unsigned PtrReg = GEPInfo.SgprParts[0];
  unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
  BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg)
          .addImm(GEPInfo.Imm);
  return {{
    [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
    [=](MachineInstrBuilder &MIB) { MIB.addReg(OffsetReg); }
  }};
}