1 //===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //==-----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Defines an instruction selector for the AMDGPU target.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPUISelDAGToDAG.h"
15 #include "AMDGPU.h"
16 #include "AMDGPUTargetMachine.h"
17 #include "MCTargetDesc/R600MCTargetDesc.h"
18 #include "R600RegisterInfo.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "llvm/Analysis/LegacyDivergenceAnalysis.h"
21 #include "llvm/Analysis/ValueTracking.h"
22 #include "llvm/CodeGen/FunctionLoweringInfo.h"
23 #include "llvm/CodeGen/SelectionDAG.h"
24 #include "llvm/CodeGen/SelectionDAGISel.h"
25 #include "llvm/CodeGen/SelectionDAGNodes.h"
26 #include "llvm/IR/IntrinsicsAMDGPU.h"
27 #include "llvm/InitializePasses.h"
28 
29 #ifdef EXPENSIVE_CHECKS
30 #include "llvm/Analysis/LoopInfo.h"
31 #include "llvm/IR/Dominators.h"
32 #endif
33 
34 #define DEBUG_TYPE "isel"
35 
36 using namespace llvm;
37 
38 //===----------------------------------------------------------------------===//
39 // Instruction Selector Implementation
40 //===----------------------------------------------------------------------===//
41 
42 namespace {
43 
44 static SDValue stripBitcast(SDValue Val) {
45   return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
46 }
47 
48 // Figure out if this is really an extract of the high 16-bits of a dword.
49 static bool isExtractHiElt(SDValue In, SDValue &Out) {
50   In = stripBitcast(In);
51 
52   if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
53     if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
54       if (!Idx->isOne())
55         return false;
56       Out = In.getOperand(0);
57       return true;
58     }
59   }
60 
61   if (In.getOpcode() != ISD::TRUNCATE)
62     return false;
63 
64   SDValue Srl = In.getOperand(0);
65   if (Srl.getOpcode() == ISD::SRL) {
66     if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
67       if (ShiftAmt->getZExtValue() == 16) {
68         Out = stripBitcast(Srl.getOperand(0));
69         return true;
70       }
71     }
72   }
73 
74   return false;
75 }
76 
77 // Look through operations that obscure just looking at the low 16-bits of the
78 // same register.
79 static SDValue stripExtractLoElt(SDValue In) {
80   if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
81     if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
82       if (Idx->isZero() && In.getValueSizeInBits() <= 32)
83         return In.getOperand(0);
84     }
85   }
86 
87   if (In.getOpcode() == ISD::TRUNCATE) {
88     SDValue Src = In.getOperand(0);
89     if (Src.getValueType().getSizeInBits() == 32)
90       return stripBitcast(Src);
91   }
92 
93   return In;
94 }
95 
96 }  // end anonymous namespace
97 
98 INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISel, "amdgpu-isel",
99                       "AMDGPU DAG->DAG Pattern Instruction Selection", false, false)
100 INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo)
101 INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysis)
102 INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
103 #ifdef EXPENSIVE_CHECKS
104 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
105 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
106 #endif
107 INITIALIZE_PASS_END(AMDGPUDAGToDAGISel, "amdgpu-isel",
108                     "AMDGPU DAG->DAG Pattern Instruction Selection", false, false)
109 
110 /// This pass converts a legalized DAG into a AMDGPU-specific
111 // DAG, ready for instruction scheduling.
112 FunctionPass *llvm::createAMDGPUISelDag(TargetMachine *TM,
113                                         CodeGenOpt::Level OptLevel) {
114   return new AMDGPUDAGToDAGISel(TM, OptLevel);
115 }
116 
117 AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(
118     TargetMachine *TM /*= nullptr*/,
119     CodeGenOpt::Level OptLevel /*= CodeGenOpt::Default*/)
120     : SelectionDAGISel(*TM, OptLevel) {
121   EnableLateStructurizeCFG = AMDGPUTargetMachine::EnableLateStructurizeCFG;
122 }
123 
124 bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
125 #ifdef EXPENSIVE_CHECKS
126   DominatorTree & DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
127   LoopInfo * LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
128   for (auto &L : LI->getLoopsInPreorder()) {
129     assert(L->isLCSSAForm(DT));
130   }
131 #endif
132   Subtarget = &MF.getSubtarget<GCNSubtarget>();
133   Mode = AMDGPU::SIModeRegisterDefaults(MF.getFunction());
134   return SelectionDAGISel::runOnMachineFunction(MF);
135 }
136 
137 bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const {
138   // XXX - only need to list legal operations.
139   switch (Opc) {
140   case ISD::FADD:
141   case ISD::FSUB:
142   case ISD::FMUL:
143   case ISD::FDIV:
144   case ISD::FREM:
145   case ISD::FCANONICALIZE:
146   case ISD::UINT_TO_FP:
147   case ISD::SINT_TO_FP:
148   case ISD::FABS:
149     // Fabs is lowered to a bit operation, but it's an and which will clear the
150     // high bits anyway.
151   case ISD::FSQRT:
152   case ISD::FSIN:
153   case ISD::FCOS:
154   case ISD::FPOWI:
155   case ISD::FPOW:
156   case ISD::FLOG:
157   case ISD::FLOG2:
158   case ISD::FLOG10:
159   case ISD::FEXP:
160   case ISD::FEXP2:
161   case ISD::FCEIL:
162   case ISD::FTRUNC:
163   case ISD::FRINT:
164   case ISD::FNEARBYINT:
165   case ISD::FROUND:
166   case ISD::FFLOOR:
167   case ISD::FMINNUM:
168   case ISD::FMAXNUM:
169   case AMDGPUISD::FRACT:
170   case AMDGPUISD::CLAMP:
171   case AMDGPUISD::COS_HW:
172   case AMDGPUISD::SIN_HW:
173   case AMDGPUISD::FMIN3:
174   case AMDGPUISD::FMAX3:
175   case AMDGPUISD::FMED3:
176   case AMDGPUISD::FMAD_FTZ:
177   case AMDGPUISD::RCP:
178   case AMDGPUISD::RSQ:
179   case AMDGPUISD::RCP_IFLAG:
180   case AMDGPUISD::LDEXP:
181     // On gfx10, all 16-bit instructions preserve the high bits.
182     return Subtarget->getGeneration() <= AMDGPUSubtarget::GFX9;
183   case ISD::FP_ROUND:
184     // We may select fptrunc (fma/mad) to mad_mixlo, which does not zero the
185     // high bits on gfx9.
186     // TODO: If we had the source node we could see if the source was fma/mad
187     return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
188   case ISD::FMA:
189   case ISD::FMAD:
190   case AMDGPUISD::DIV_FIXUP:
191     return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
192   default:
193     // fcopysign, select and others may be lowered to 32-bit bit operations
194     // which don't zero the high bits.
195     return false;
196   }
197 }
198 
199 void AMDGPUDAGToDAGISel::getAnalysisUsage(AnalysisUsage &AU) const {
200   AU.addRequired<AMDGPUArgumentUsageInfo>();
201   AU.addRequired<LegacyDivergenceAnalysis>();
202 #ifdef EXPENSIVE_CHECKS
203   AU.addRequired<DominatorTreeWrapperPass>();
204   AU.addRequired<LoopInfoWrapperPass>();
205 #endif
206   SelectionDAGISel::getAnalysisUsage(AU);
207 }
208 
209 bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
210   assert(Subtarget->d16PreservesUnusedBits());
211   MVT VT = N->getValueType(0).getSimpleVT();
212   if (VT != MVT::v2i16 && VT != MVT::v2f16)
213     return false;
214 
215   SDValue Lo = N->getOperand(0);
216   SDValue Hi = N->getOperand(1);
217 
218   LoadSDNode *LdHi = dyn_cast<LoadSDNode>(stripBitcast(Hi));
219 
220   // build_vector lo, (load ptr) -> load_d16_hi ptr, lo
221   // build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo
222   // build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo
223 
224   // Need to check for possible indirect dependencies on the other half of the
225   // vector to avoid introducing a cycle.
226   if (LdHi && Hi.hasOneUse() && !LdHi->isPredecessorOf(Lo.getNode())) {
227     SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
228 
229     SDValue TiedIn = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Lo);
230     SDValue Ops[] = {
231       LdHi->getChain(), LdHi->getBasePtr(), TiedIn
232     };
233 
234     unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
235     if (LdHi->getMemoryVT() == MVT::i8) {
236       LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
237         AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8;
238     } else {
239       assert(LdHi->getMemoryVT() == MVT::i16);
240     }
241 
242     SDValue NewLoadHi =
243       CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList,
244                                   Ops, LdHi->getMemoryVT(),
245                                   LdHi->getMemOperand());
246 
247     CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadHi);
248     CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdHi, 1), NewLoadHi.getValue(1));
249     return true;
250   }
251 
252   // build_vector (load ptr), hi -> load_d16_lo ptr, hi
253   // build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi
254   // build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi
255   LoadSDNode *LdLo = dyn_cast<LoadSDNode>(stripBitcast(Lo));
256   if (LdLo && Lo.hasOneUse()) {
257     SDValue TiedIn = getHi16Elt(Hi);
258     if (!TiedIn || LdLo->isPredecessorOf(TiedIn.getNode()))
259       return false;
260 
261     SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
262     unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
263     if (LdLo->getMemoryVT() == MVT::i8) {
264       LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
265         AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8;
266     } else {
267       assert(LdLo->getMemoryVT() == MVT::i16);
268     }
269 
270     TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, TiedIn);
271 
272     SDValue Ops[] = {
273       LdLo->getChain(), LdLo->getBasePtr(), TiedIn
274     };
275 
276     SDValue NewLoadLo =
277       CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList,
278                                   Ops, LdLo->getMemoryVT(),
279                                   LdLo->getMemOperand());
280 
281     CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadLo);
282     CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdLo, 1), NewLoadLo.getValue(1));
283     return true;
284   }
285 
286   return false;
287 }
288 
289 void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
290   if (!Subtarget->d16PreservesUnusedBits())
291     return;
292 
293   SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
294 
295   bool MadeChange = false;
296   while (Position != CurDAG->allnodes_begin()) {
297     SDNode *N = &*--Position;
298     if (N->use_empty())
299       continue;
300 
301     switch (N->getOpcode()) {
302     case ISD::BUILD_VECTOR:
303       MadeChange |= matchLoadD16FromBuildVector(N);
304       break;
305     default:
306       break;
307     }
308   }
309 
310   if (MadeChange) {
311     CurDAG->RemoveDeadNodes();
312     LLVM_DEBUG(dbgs() << "After PreProcess:\n";
313                CurDAG->dump(););
314   }
315 }
316 
317 bool AMDGPUDAGToDAGISel::isNoNanSrc(SDValue N) const {
318   if (TM.Options.NoNaNsFPMath)
319     return true;
320 
321   // TODO: Move into isKnownNeverNaN
322   if (N->getFlags().hasNoNaNs())
323     return true;
324 
325   return CurDAG->isKnownNeverNaN(N);
326 }
327 
328 bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N,
329                                            bool Negated) const {
330   if (N->isUndef())
331     return true;
332 
333   const SIInstrInfo *TII = Subtarget->getInstrInfo();
334   if (Negated) {
335     if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
336       return TII->isInlineConstant(-C->getAPIntValue());
337 
338     if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
339       return TII->isInlineConstant(-C->getValueAPF().bitcastToAPInt());
340 
341   } else {
342     if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
343       return TII->isInlineConstant(C->getAPIntValue());
344 
345     if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
346       return TII->isInlineConstant(C->getValueAPF().bitcastToAPInt());
347   }
348 
349   return false;
350 }
351 
352 /// Determine the register class for \p OpNo
353 /// \returns The register class of the virtual register that will be used for
354 /// the given operand number \OpNo or NULL if the register class cannot be
355 /// determined.
356 const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
357                                                           unsigned OpNo) const {
358   if (!N->isMachineOpcode()) {
359     if (N->getOpcode() == ISD::CopyToReg) {
360       Register Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
361       if (Reg.isVirtual()) {
362         MachineRegisterInfo &MRI = CurDAG->getMachineFunction().getRegInfo();
363         return MRI.getRegClass(Reg);
364       }
365 
366       const SIRegisterInfo *TRI
367         = static_cast<const GCNSubtarget *>(Subtarget)->getRegisterInfo();
368       return TRI->getPhysRegClass(Reg);
369     }
370 
371     return nullptr;
372   }
373 
374   switch (N->getMachineOpcode()) {
375   default: {
376     const MCInstrDesc &Desc =
377         Subtarget->getInstrInfo()->get(N->getMachineOpcode());
378     unsigned OpIdx = Desc.getNumDefs() + OpNo;
379     if (OpIdx >= Desc.getNumOperands())
380       return nullptr;
381     int RegClass = Desc.OpInfo[OpIdx].RegClass;
382     if (RegClass == -1)
383       return nullptr;
384 
385     return Subtarget->getRegisterInfo()->getRegClass(RegClass);
386   }
387   case AMDGPU::REG_SEQUENCE: {
388     unsigned RCID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
389     const TargetRegisterClass *SuperRC =
390         Subtarget->getRegisterInfo()->getRegClass(RCID);
391 
392     SDValue SubRegOp = N->getOperand(OpNo + 1);
393     unsigned SubRegIdx = cast<ConstantSDNode>(SubRegOp)->getZExtValue();
394     return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
395                                                               SubRegIdx);
396   }
397   }
398 }
399 
400 SDNode *AMDGPUDAGToDAGISel::glueCopyToOp(SDNode *N, SDValue NewChain,
401                                          SDValue Glue) const {
402   SmallVector <SDValue, 8> Ops;
403   Ops.push_back(NewChain); // Replace the chain.
404   for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
405     Ops.push_back(N->getOperand(i));
406 
407   Ops.push_back(Glue);
408   return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
409 }
410 
411 SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
412   const SITargetLowering& Lowering =
413     *static_cast<const SITargetLowering*>(getTargetLowering());
414 
415   assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain");
416 
417   SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N), Val);
418   return glueCopyToOp(N, M0, M0.getValue(1));
419 }
420 
421 SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
422   unsigned AS = cast<MemSDNode>(N)->getAddressSpace();
423   if (AS == AMDGPUAS::LOCAL_ADDRESS) {
424     if (Subtarget->ldsRequiresM0Init())
425       return glueCopyToM0(N, CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32));
426   } else if (AS == AMDGPUAS::REGION_ADDRESS) {
427     MachineFunction &MF = CurDAG->getMachineFunction();
428     unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize();
429     return
430         glueCopyToM0(N, CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i32));
431   }
432   return N;
433 }
434 
435 MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
436                                                   EVT VT) const {
437   SDNode *Lo = CurDAG->getMachineNode(
438       AMDGPU::S_MOV_B32, DL, MVT::i32,
439       CurDAG->getTargetConstant(Imm & 0xFFFFFFFF, DL, MVT::i32));
440   SDNode *Hi =
441       CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
442                              CurDAG->getTargetConstant(Imm >> 32, DL, MVT::i32));
443   const SDValue Ops[] = {
444       CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
445       SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
446       SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
447 
448   return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops);
449 }
450 
451 void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
452   EVT VT = N->getValueType(0);
453   unsigned NumVectorElts = VT.getVectorNumElements();
454   EVT EltVT = VT.getVectorElementType();
455   SDLoc DL(N);
456   SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
457 
458   if (NumVectorElts == 1) {
459     CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0),
460                          RegClass);
461     return;
462   }
463 
464   assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
465                                   "supported yet");
466   // 32 = Max Num Vector Elements
467   // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
468   // 1 = Vector Register Class
469   SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);
470 
471   bool IsGCN = CurDAG->getSubtarget().getTargetTriple().getArch() ==
472                Triple::amdgcn;
473   RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
474   bool IsRegSeq = true;
475   unsigned NOps = N->getNumOperands();
476   for (unsigned i = 0; i < NOps; i++) {
477     // XXX: Why is this here?
478     if (isa<RegisterSDNode>(N->getOperand(i))) {
479       IsRegSeq = false;
480       break;
481     }
482     unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
483                          : R600RegisterInfo::getSubRegFromChannel(i);
484     RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
485     RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32);
486   }
487   if (NOps != NumVectorElts) {
488     // Fill in the missing undef elements if this was a scalar_to_vector.
489     assert(N->getOpcode() == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
490     MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
491                                                    DL, EltVT);
492     for (unsigned i = NOps; i < NumVectorElts; ++i) {
493       unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
494                            : R600RegisterInfo::getSubRegFromChannel(i);
495       RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
496       RegSeqArgs[1 + (2 * i) + 1] =
497           CurDAG->getTargetConstant(Sub, DL, MVT::i32);
498     }
499   }
500 
501   if (!IsRegSeq)
502     SelectCode(N);
503   CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs);
504 }
505 
506 void AMDGPUDAGToDAGISel::Select(SDNode *N) {
507   unsigned int Opc = N->getOpcode();
508   if (N->isMachineOpcode()) {
509     N->setNodeId(-1);
510     return;   // Already selected.
511   }
512 
513   // isa<MemSDNode> almost works but is slightly too permissive for some DS
514   // intrinsics.
515   if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N) ||
516       (Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC ||
517        Opc == ISD::ATOMIC_LOAD_FADD ||
518        Opc == AMDGPUISD::ATOMIC_LOAD_FMIN ||
519        Opc == AMDGPUISD::ATOMIC_LOAD_FMAX)) {
520     N = glueCopyToM0LDSInit(N);
521     SelectCode(N);
522     return;
523   }
524 
525   switch (Opc) {
526   default:
527     break;
528   // We are selecting i64 ADD here instead of custom lower it during
529   // DAG legalization, so we can fold some i64 ADDs used for address
530   // calculation into the LOAD and STORE instructions.
531   case ISD::ADDC:
532   case ISD::ADDE:
533   case ISD::SUBC:
534   case ISD::SUBE: {
535     if (N->getValueType(0) != MVT::i64)
536       break;
537 
538     SelectADD_SUB_I64(N);
539     return;
540   }
541   case ISD::ADDCARRY:
542   case ISD::SUBCARRY:
543     if (N->getValueType(0) != MVT::i32)
544       break;
545 
546     SelectAddcSubb(N);
547     return;
548   case ISD::UADDO:
549   case ISD::USUBO: {
550     SelectUADDO_USUBO(N);
551     return;
552   }
553   case AMDGPUISD::FMUL_W_CHAIN: {
554     SelectFMUL_W_CHAIN(N);
555     return;
556   }
557   case AMDGPUISD::FMA_W_CHAIN: {
558     SelectFMA_W_CHAIN(N);
559     return;
560   }
561 
562   case ISD::SCALAR_TO_VECTOR:
563   case ISD::BUILD_VECTOR: {
564     EVT VT = N->getValueType(0);
565     unsigned NumVectorElts = VT.getVectorNumElements();
566     if (VT.getScalarSizeInBits() == 16) {
567       if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) {
568         if (SDNode *Packed = packConstantV2I16(N, *CurDAG)) {
569           ReplaceNode(N, Packed);
570           return;
571         }
572       }
573 
574       break;
575     }
576 
577     assert(VT.getVectorElementType().bitsEq(MVT::i32));
578     unsigned RegClassID =
579         SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32)->getID();
580     SelectBuildVector(N, RegClassID);
581     return;
582   }
583   case ISD::BUILD_PAIR: {
584     SDValue RC, SubReg0, SubReg1;
585     SDLoc DL(N);
586     if (N->getValueType(0) == MVT::i128) {
587       RC = CurDAG->getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32);
588       SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32);
589       SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32);
590     } else if (N->getValueType(0) == MVT::i64) {
591       RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32);
592       SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
593       SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
594     } else {
595       llvm_unreachable("Unhandled value type for BUILD_PAIR");
596     }
597     const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
598                             N->getOperand(1), SubReg1 };
599     ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
600                                           N->getValueType(0), Ops));
601     return;
602   }
603 
604   case ISD::Constant:
605   case ISD::ConstantFP: {
606     if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N))
607       break;
608 
609     uint64_t Imm;
610     if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N))
611       Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
612     else {
613       ConstantSDNode *C = cast<ConstantSDNode>(N);
614       Imm = C->getZExtValue();
615     }
616 
617     SDLoc DL(N);
618     ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));
619     return;
620   }
621   case AMDGPUISD::BFE_I32:
622   case AMDGPUISD::BFE_U32: {
623     // There is a scalar version available, but unlike the vector version which
624     // has a separate operand for the offset and width, the scalar version packs
625     // the width and offset into a single operand. Try to move to the scalar
626     // version if the offsets are constant, so that we can try to keep extended
627     // loads of kernel arguments in SGPRs.
628 
629     // TODO: Technically we could try to pattern match scalar bitshifts of
630     // dynamic values, but it's probably not useful.
631     ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
632     if (!Offset)
633       break;
634 
635     ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
636     if (!Width)
637       break;
638 
639     bool Signed = Opc == AMDGPUISD::BFE_I32;
640 
641     uint32_t OffsetVal = Offset->getZExtValue();
642     uint32_t WidthVal = Width->getZExtValue();
643 
644     ReplaceNode(N, getBFE32(Signed, SDLoc(N), N->getOperand(0), OffsetVal,
645                             WidthVal));
646     return;
647   }
648   case AMDGPUISD::DIV_SCALE: {
649     SelectDIV_SCALE(N);
650     return;
651   }
652   case AMDGPUISD::MAD_I64_I32:
653   case AMDGPUISD::MAD_U64_U32: {
654     SelectMAD_64_32(N);
655     return;
656   }
657   case ISD::SMUL_LOHI:
658   case ISD::UMUL_LOHI:
659     return SelectMUL_LOHI(N);
660   case ISD::CopyToReg: {
661     const SITargetLowering& Lowering =
662       *static_cast<const SITargetLowering*>(getTargetLowering());
663     N = Lowering.legalizeTargetIndependentNode(N, *CurDAG);
664     break;
665   }
666   case ISD::AND:
667   case ISD::SRL:
668   case ISD::SRA:
669   case ISD::SIGN_EXTEND_INREG:
670     if (N->getValueType(0) != MVT::i32)
671       break;
672 
673     SelectS_BFE(N);
674     return;
675   case ISD::BRCOND:
676     SelectBRCOND(N);
677     return;
678   case ISD::FMAD:
679   case ISD::FMA:
680     SelectFMAD_FMA(N);
681     return;
682   case AMDGPUISD::CVT_PKRTZ_F16_F32:
683   case AMDGPUISD::CVT_PKNORM_I16_F32:
684   case AMDGPUISD::CVT_PKNORM_U16_F32:
685   case AMDGPUISD::CVT_PK_U16_U32:
686   case AMDGPUISD::CVT_PK_I16_I32: {
687     // Hack around using a legal type if f16 is illegal.
688     if (N->getValueType(0) == MVT::i32) {
689       MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ? MVT::v2f16 : MVT::v2i16;
690       N = CurDAG->MorphNodeTo(N, N->getOpcode(), CurDAG->getVTList(NewVT),
691                               { N->getOperand(0), N->getOperand(1) });
692       SelectCode(N);
693       return;
694     }
695 
696     break;
697   }
698   case ISD::INTRINSIC_W_CHAIN: {
699     SelectINTRINSIC_W_CHAIN(N);
700     return;
701   }
702   case ISD::INTRINSIC_WO_CHAIN: {
703     SelectINTRINSIC_WO_CHAIN(N);
704     return;
705   }
706   case ISD::INTRINSIC_VOID: {
707     SelectINTRINSIC_VOID(N);
708     return;
709   }
710   }
711 
712   SelectCode(N);
713 }
714 
715 bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
716   const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
717   const Instruction *Term = BB->getTerminator();
718   return Term->getMetadata("amdgpu.uniform") ||
719          Term->getMetadata("structurizecfg.uniform");
720 }
721 
722 bool AMDGPUDAGToDAGISel::isUnneededShiftMask(const SDNode *N,
723                                              unsigned ShAmtBits) const {
724   assert(N->getOpcode() == ISD::AND);
725 
726   const APInt &RHS = cast<ConstantSDNode>(N->getOperand(1))->getAPIntValue();
727   if (RHS.countTrailingOnes() >= ShAmtBits)
728     return true;
729 
730   const APInt &LHSKnownZeros = CurDAG->computeKnownBits(N->getOperand(0)).Zero;
731   return (LHSKnownZeros | RHS).countTrailingOnes() >= ShAmtBits;
732 }
733 
734 static bool getBaseWithOffsetUsingSplitOR(SelectionDAG &DAG, SDValue Addr,
735                                           SDValue &N0, SDValue &N1) {
736   if (Addr.getValueType() == MVT::i64 && Addr.getOpcode() == ISD::BITCAST &&
737       Addr.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
738     // As we split 64-bit `or` earlier, it's complicated pattern to match, i.e.
739     // (i64 (bitcast (v2i32 (build_vector
740     //                        (or (extract_vector_elt V, 0), OFFSET),
741     //                        (extract_vector_elt V, 1)))))
742     SDValue Lo = Addr.getOperand(0).getOperand(0);
743     if (Lo.getOpcode() == ISD::OR && DAG.isBaseWithConstantOffset(Lo)) {
744       SDValue BaseLo = Lo.getOperand(0);
745       SDValue BaseHi = Addr.getOperand(0).getOperand(1);
746       // Check that split base (Lo and Hi) are extracted from the same one.
747       if (BaseLo.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
748           BaseHi.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
749           BaseLo.getOperand(0) == BaseHi.getOperand(0) &&
750           // Lo is statically extracted from index 0.
751           isa<ConstantSDNode>(BaseLo.getOperand(1)) &&
752           BaseLo.getConstantOperandVal(1) == 0 &&
753           // Hi is statically extracted from index 0.
754           isa<ConstantSDNode>(BaseHi.getOperand(1)) &&
755           BaseHi.getConstantOperandVal(1) == 1) {
756         N0 = BaseLo.getOperand(0).getOperand(0);
757         N1 = Lo.getOperand(1);
758         return true;
759       }
760     }
761   }
762   return false;
763 }
764 
765 bool AMDGPUDAGToDAGISel::isBaseWithConstantOffset64(SDValue Addr, SDValue &LHS,
766                                                     SDValue &RHS) const {
767   if (CurDAG->isBaseWithConstantOffset(Addr)) {
768     LHS = Addr.getOperand(0);
769     RHS = Addr.getOperand(1);
770     return true;
771   }
772 
773   if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, LHS, RHS)) {
774     assert(LHS && RHS && isa<ConstantSDNode>(RHS));
775     return true;
776   }
777 
778   return false;
779 }
780 
781 StringRef AMDGPUDAGToDAGISel::getPassName() const {
782   return "AMDGPU DAG->DAG Pattern Instruction Selection";
783 }
784 
785 //===----------------------------------------------------------------------===//
786 // Complex Patterns
787 //===----------------------------------------------------------------------===//
788 
789 bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
790                                             SDValue &Offset) {
791   return false;
792 }
793 
794 bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
795                                             SDValue &Offset) {
796   ConstantSDNode *C;
797   SDLoc DL(Addr);
798 
799   if ((C = dyn_cast<ConstantSDNode>(Addr))) {
800     Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
801     Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
802   } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
803              (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
804     Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
805     Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
806   } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
807             (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
808     Base = Addr.getOperand(0);
809     Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
810   } else {
811     Base = Addr;
812     Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
813   }
814 
815   return true;
816 }
817 
818 SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm32(int64_t Val,
819                                                        const SDLoc &DL) const {
820   SDNode *Mov = CurDAG->getMachineNode(
821     AMDGPU::S_MOV_B32, DL, MVT::i32,
822     CurDAG->getTargetConstant(Val, DL, MVT::i32));
823   return SDValue(Mov, 0);
824 }
825 
826 // FIXME: Should only handle addcarry/subcarry
827 void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
828   SDLoc DL(N);
829   SDValue LHS = N->getOperand(0);
830   SDValue RHS = N->getOperand(1);
831 
832   unsigned Opcode = N->getOpcode();
833   bool ConsumeCarry = (Opcode == ISD::ADDE || Opcode == ISD::SUBE);
834   bool ProduceCarry =
835       ConsumeCarry || Opcode == ISD::ADDC || Opcode == ISD::SUBC;
836   bool IsAdd = Opcode == ISD::ADD || Opcode == ISD::ADDC || Opcode == ISD::ADDE;
837 
838   SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
839   SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
840 
841   SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
842                                        DL, MVT::i32, LHS, Sub0);
843   SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
844                                        DL, MVT::i32, LHS, Sub1);
845 
846   SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
847                                        DL, MVT::i32, RHS, Sub0);
848   SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
849                                        DL, MVT::i32, RHS, Sub1);
850 
851   SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);
852 
853   static const unsigned OpcMap[2][2][2] = {
854       {{AMDGPU::S_SUB_U32, AMDGPU::S_ADD_U32},
855        {AMDGPU::V_SUB_CO_U32_e32, AMDGPU::V_ADD_CO_U32_e32}},
856       {{AMDGPU::S_SUBB_U32, AMDGPU::S_ADDC_U32},
857        {AMDGPU::V_SUBB_U32_e32, AMDGPU::V_ADDC_U32_e32}}};
858 
859   unsigned Opc = OpcMap[0][N->isDivergent()][IsAdd];
860   unsigned CarryOpc = OpcMap[1][N->isDivergent()][IsAdd];
861 
862   SDNode *AddLo;
863   if (!ConsumeCarry) {
864     SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) };
865     AddLo = CurDAG->getMachineNode(Opc, DL, VTList, Args);
866   } else {
867     SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0), N->getOperand(2) };
868     AddLo = CurDAG->getMachineNode(CarryOpc, DL, VTList, Args);
869   }
870   SDValue AddHiArgs[] = {
871     SDValue(Hi0, 0),
872     SDValue(Hi1, 0),
873     SDValue(AddLo, 1)
874   };
875   SDNode *AddHi = CurDAG->getMachineNode(CarryOpc, DL, VTList, AddHiArgs);
876 
877   SDValue RegSequenceArgs[] = {
878     CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
879     SDValue(AddLo,0),
880     Sub0,
881     SDValue(AddHi,0),
882     Sub1,
883   };
884   SDNode *RegSequence = CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
885                                                MVT::i64, RegSequenceArgs);
886 
887   if (ProduceCarry) {
888     // Replace the carry-use
889     ReplaceUses(SDValue(N, 1), SDValue(AddHi, 1));
890   }
891 
892   // Replace the remaining uses.
893   ReplaceNode(N, RegSequence);
894 }
895 
896 void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) {
897   SDLoc DL(N);
898   SDValue LHS = N->getOperand(0);
899   SDValue RHS = N->getOperand(1);
900   SDValue CI = N->getOperand(2);
901 
902   if (N->isDivergent()) {
903     unsigned Opc = N->getOpcode() == ISD::ADDCARRY ? AMDGPU::V_ADDC_U32_e64
904                                                    : AMDGPU::V_SUBB_U32_e64;
905     CurDAG->SelectNodeTo(
906         N, Opc, N->getVTList(),
907         {LHS, RHS, CI,
908          CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
909   } else {
910     unsigned Opc = N->getOpcode() == ISD::ADDCARRY ? AMDGPU::S_ADD_CO_PSEUDO
911                                                    : AMDGPU::S_SUB_CO_PSEUDO;
912     CurDAG->SelectNodeTo(N, Opc, N->getVTList(), {LHS, RHS, CI});
913   }
914 }
915 
916 void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
917   // The name of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned
918   // carry out despite the _i32 name. These were renamed in VI to _U32.
919   // FIXME: We should probably rename the opcodes here.
920   bool IsAdd = N->getOpcode() == ISD::UADDO;
921   bool IsVALU = N->isDivergent();
922 
923   for (SDNode::use_iterator UI = N->use_begin(), E = N->use_end(); UI != E;
924        ++UI)
925     if (UI.getUse().getResNo() == 1) {
926       if ((IsAdd && (UI->getOpcode() != ISD::ADDCARRY)) ||
927           (!IsAdd && (UI->getOpcode() != ISD::SUBCARRY))) {
928         IsVALU = true;
929         break;
930       }
931     }
932 
933   if (IsVALU) {
934     unsigned Opc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
935 
936     CurDAG->SelectNodeTo(
937         N, Opc, N->getVTList(),
938         {N->getOperand(0), N->getOperand(1),
939          CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
940   } else {
941     unsigned Opc = N->getOpcode() == ISD::UADDO ? AMDGPU::S_UADDO_PSEUDO
942                                                 : AMDGPU::S_USUBO_PSEUDO;
943 
944     CurDAG->SelectNodeTo(N, Opc, N->getVTList(),
945                          {N->getOperand(0), N->getOperand(1)});
946   }
947 }
948 
949 void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
950   SDLoc SL(N);
951   //  src0_modifiers, src0,  src1_modifiers, src1, src2_modifiers, src2, clamp, omod
952   SDValue Ops[10];
953 
954   SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]);
955   SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
956   SelectVOP3Mods(N->getOperand(3), Ops[5], Ops[4]);
957   Ops[8] = N->getOperand(0);
958   Ops[9] = N->getOperand(4);
959 
960   // If there are no source modifiers, prefer fmac over fma because it can use
961   // the smaller VOP2 encoding.
962   bool UseFMAC = Subtarget->hasDLInsts() &&
963                  cast<ConstantSDNode>(Ops[0])->isZero() &&
964                  cast<ConstantSDNode>(Ops[2])->isZero() &&
965                  cast<ConstantSDNode>(Ops[4])->isZero();
966   unsigned Opcode = UseFMAC ? AMDGPU::V_FMAC_F32_e64 : AMDGPU::V_FMA_F32_e64;
967   CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), Ops);
968 }
969 
970 void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
971   SDLoc SL(N);
972   //    src0_modifiers, src0,  src1_modifiers, src1, clamp, omod
973   SDValue Ops[8];
974 
975   SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]);
976   SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
977   Ops[6] = N->getOperand(0);
978   Ops[7] = N->getOperand(3);
979 
980   CurDAG->SelectNodeTo(N, AMDGPU::V_MUL_F32_e64, N->getVTList(), Ops);
981 }
982 
983 // We need to handle this here because tablegen doesn't support matching
984 // instructions with multiple outputs.
985 void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
986   SDLoc SL(N);
987   EVT VT = N->getValueType(0);
988 
989   assert(VT == MVT::f32 || VT == MVT::f64);
990 
991   unsigned Opc
992     = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64_e64 : AMDGPU::V_DIV_SCALE_F32_e64;
993 
994   // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
995   // omod
996   SDValue Ops[8];
997   SelectVOP3BMods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
998   SelectVOP3BMods(N->getOperand(1), Ops[3], Ops[2]);
999   SelectVOP3BMods(N->getOperand(2), Ops[5], Ops[4]);
1000   CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
1001 }
1002 
1003 // We need to handle this here because tablegen doesn't support matching
1004 // instructions with multiple outputs.
1005 void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
1006   SDLoc SL(N);
1007   bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
1008   unsigned Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
1009 
1010   SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
1011   SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
1012                     Clamp };
1013   CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
1014 }
1015 
1016 // We need to handle this here because tablegen doesn't support matching
1017 // instructions with multiple outputs.
1018 void AMDGPUDAGToDAGISel::SelectMUL_LOHI(SDNode *N) {
1019   SDLoc SL(N);
1020   bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
1021   unsigned Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
1022 
1023   SDValue Zero = CurDAG->getTargetConstant(0, SL, MVT::i64);
1024   SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
1025   SDValue Ops[] = {N->getOperand(0), N->getOperand(1), Zero, Clamp};
1026   SDNode *Mad = CurDAG->getMachineNode(Opc, SL, N->getVTList(), Ops);
1027   if (!SDValue(N, 0).use_empty()) {
1028     SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32);
1029     SDNode *Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
1030                                         MVT::i32, SDValue(Mad, 0), Sub0);
1031     ReplaceUses(SDValue(N, 0), SDValue(Lo, 0));
1032   }
1033   if (!SDValue(N, 1).use_empty()) {
1034     SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32);
1035     SDNode *Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
1036                                         MVT::i32, SDValue(Mad, 0), Sub1);
1037     ReplaceUses(SDValue(N, 1), SDValue(Hi, 0));
1038   }
1039   CurDAG->RemoveDeadNode(N);
1040 }
1041 
1042 bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset) const {
1043   if (!isUInt<16>(Offset))
1044     return false;
1045 
1046   if (!Base || Subtarget->hasUsableDSOffset() ||
1047       Subtarget->unsafeDSOffsetFoldingEnabled())
1048     return true;
1049 
1050   // On Southern Islands instruction with a negative base value and an offset
1051   // don't seem to work.
1052   return CurDAG->SignBitIsZero(Base);
1053 }
1054 
1055 bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
1056                                               SDValue &Offset) const {
1057   SDLoc DL(Addr);
1058   if (CurDAG->isBaseWithConstantOffset(Addr)) {
1059     SDValue N0 = Addr.getOperand(0);
1060     SDValue N1 = Addr.getOperand(1);
1061     ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
1062     if (isDSOffsetLegal(N0, C1->getSExtValue())) {
1063       // (add n0, c0)
1064       Base = N0;
1065       Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
1066       return true;
1067     }
1068   } else if (Addr.getOpcode() == ISD::SUB) {
1069     // sub C, x -> add (sub 0, x), C
1070     if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
1071       int64_t ByteOffset = C->getSExtValue();
1072       if (isDSOffsetLegal(SDValue(), ByteOffset)) {
1073         SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1074 
1075         // XXX - This is kind of hacky. Create a dummy sub node so we can check
1076         // the known bits in isDSOffsetLegal. We need to emit the selected node
1077         // here, so this is thrown away.
1078         SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
1079                                       Zero, Addr.getOperand(1));
1080 
1081         if (isDSOffsetLegal(Sub, ByteOffset)) {
1082           SmallVector<SDValue, 3> Opnds;
1083           Opnds.push_back(Zero);
1084           Opnds.push_back(Addr.getOperand(1));
1085 
1086           // FIXME: Select to VOP3 version for with-carry.
1087           unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
1088           if (Subtarget->hasAddNoCarry()) {
1089             SubOp = AMDGPU::V_SUB_U32_e64;
1090             Opnds.push_back(
1091                 CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
1092           }
1093 
1094           MachineSDNode *MachineSub =
1095               CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds);
1096 
1097           Base = SDValue(MachineSub, 0);
1098           Offset = CurDAG->getTargetConstant(ByteOffset, DL, MVT::i16);
1099           return true;
1100         }
1101       }
1102     }
1103   } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1104     // If we have a constant address, prefer to put the constant into the
1105     // offset. This can save moves to load the constant address since multiple
1106     // operations can share the zero base address register, and enables merging
1107     // into read2 / write2 instructions.
1108 
1109     SDLoc DL(Addr);
1110 
1111     if (isDSOffsetLegal(SDValue(), CAddr->getZExtValue())) {
1112       SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1113       MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
1114                                  DL, MVT::i32, Zero);
1115       Base = SDValue(MovZero, 0);
1116       Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
1117       return true;
1118     }
1119   }
1120 
1121   // default case
1122   Base = Addr;
1123   Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i16);
1124   return true;
1125 }
1126 
1127 bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
1128                                           unsigned Offset1,
1129                                           unsigned Size) const {
1130   if (Offset0 % Size != 0 || Offset1 % Size != 0)
1131     return false;
1132   if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
1133     return false;
1134 
1135   if (!Base || Subtarget->hasUsableDSOffset() ||
1136       Subtarget->unsafeDSOffsetFoldingEnabled())
1137     return true;
1138 
1139   // On Southern Islands instruction with a negative base value and an offset
1140   // don't seem to work.
1141   return CurDAG->SignBitIsZero(Base);
1142 }
1143 
1144 // TODO: If offset is too big, put low 16-bit into offset.
1145 bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
1146                                                    SDValue &Offset0,
1147                                                    SDValue &Offset1) const {
1148   return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 4);
1149 }
1150 
1151 bool AMDGPUDAGToDAGISel::SelectDS128Bit8ByteAligned(SDValue Addr, SDValue &Base,
1152                                                     SDValue &Offset0,
1153                                                     SDValue &Offset1) const {
1154   return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 8);
1155 }
1156 
1157 bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
1158                                             SDValue &Offset0, SDValue &Offset1,
1159                                             unsigned Size) const {
1160   SDLoc DL(Addr);
1161 
1162   if (CurDAG->isBaseWithConstantOffset(Addr)) {
1163     SDValue N0 = Addr.getOperand(0);
1164     SDValue N1 = Addr.getOperand(1);
1165     ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
1166     unsigned OffsetValue0 = C1->getZExtValue();
1167     unsigned OffsetValue1 = OffsetValue0 + Size;
1168 
1169     // (add n0, c0)
1170     if (isDSOffset2Legal(N0, OffsetValue0, OffsetValue1, Size)) {
1171       Base = N0;
1172       Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
1173       Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
1174       return true;
1175     }
1176   } else if (Addr.getOpcode() == ISD::SUB) {
1177     // sub C, x -> add (sub 0, x), C
1178     if (const ConstantSDNode *C =
1179             dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
1180       unsigned OffsetValue0 = C->getZExtValue();
1181       unsigned OffsetValue1 = OffsetValue0 + Size;
1182 
1183       if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
1184         SDLoc DL(Addr);
1185         SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1186 
1187         // XXX - This is kind of hacky. Create a dummy sub node so we can check
1188         // the known bits in isDSOffsetLegal. We need to emit the selected node
1189         // here, so this is thrown away.
1190         SDValue Sub =
1191             CurDAG->getNode(ISD::SUB, DL, MVT::i32, Zero, Addr.getOperand(1));
1192 
1193         if (isDSOffset2Legal(Sub, OffsetValue0, OffsetValue1, Size)) {
1194           SmallVector<SDValue, 3> Opnds;
1195           Opnds.push_back(Zero);
1196           Opnds.push_back(Addr.getOperand(1));
1197           unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
1198           if (Subtarget->hasAddNoCarry()) {
1199             SubOp = AMDGPU::V_SUB_U32_e64;
1200             Opnds.push_back(
1201                 CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
1202           }
1203 
1204           MachineSDNode *MachineSub = CurDAG->getMachineNode(
1205               SubOp, DL, MVT::getIntegerVT(Size * 8), Opnds);
1206 
1207           Base = SDValue(MachineSub, 0);
1208           Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
1209           Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
1210           return true;
1211         }
1212       }
1213     }
1214   } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1215     unsigned OffsetValue0 = CAddr->getZExtValue();
1216     unsigned OffsetValue1 = OffsetValue0 + Size;
1217 
1218     if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
1219       SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1220       MachineSDNode *MovZero =
1221           CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero);
1222       Base = SDValue(MovZero, 0);
1223       Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
1224       Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
1225       return true;
1226     }
1227   }
1228 
1229   // default case
1230 
1231   Base = Addr;
1232   Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i8);
1233   Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i8);
1234   return true;
1235 }
1236 
1237 bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr,
1238                                      SDValue &SOffset, SDValue &Offset,
1239                                      SDValue &Offen, SDValue &Idxen,
1240                                      SDValue &Addr64) const {
1241   // Subtarget prefers to use flat instruction
1242   // FIXME: This should be a pattern predicate and not reach here
1243   if (Subtarget->useFlatForGlobal())
1244     return false;
1245 
1246   SDLoc DL(Addr);
1247 
1248   Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);
1249   Offen = CurDAG->getTargetConstant(0, DL, MVT::i1);
1250   Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1);
1251   SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1252 
1253   ConstantSDNode *C1 = nullptr;
1254   SDValue N0 = Addr;
1255   if (CurDAG->isBaseWithConstantOffset(Addr)) {
1256     C1 = cast<ConstantSDNode>(Addr.getOperand(1));
1257     if (isUInt<32>(C1->getZExtValue()))
1258       N0 = Addr.getOperand(0);
1259     else
1260       C1 = nullptr;
1261   }
1262 
1263   if (N0.getOpcode() == ISD::ADD) {
1264     // (add N2, N3) -> addr64, or
1265     // (add (add N2, N3), C1) -> addr64
1266     SDValue N2 = N0.getOperand(0);
1267     SDValue N3 = N0.getOperand(1);
1268     Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
1269 
1270     if (N2->isDivergent()) {
1271       if (N3->isDivergent()) {
1272         // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
1273         // addr64, and construct the resource from a 0 address.
1274         Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
1275         VAddr = N0;
1276       } else {
1277         // N2 is divergent, N3 is not.
1278         Ptr = N3;
1279         VAddr = N2;
1280       }
1281     } else {
1282       // N2 is not divergent.
1283       Ptr = N2;
1284       VAddr = N3;
1285     }
1286     Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
1287   } else if (N0->isDivergent()) {
1288     // N0 is divergent. Use it as the addr64, and construct the resource from a
1289     // 0 address.
1290     Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
1291     VAddr = N0;
1292     Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
1293   } else {
1294     // N0 -> offset, or
1295     // (N0 + C1) -> offset
1296     VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
1297     Ptr = N0;
1298   }
1299 
1300   if (!C1) {
1301     // No offset.
1302     Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
1303     return true;
1304   }
1305 
1306   if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue())) {
1307     // Legal offset for instruction.
1308     Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
1309     return true;
1310   }
1311 
1312   // Illegal offset, store it in soffset.
1313   Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
1314   SOffset =
1315       SDValue(CurDAG->getMachineNode(
1316                   AMDGPU::S_MOV_B32, DL, MVT::i32,
1317                   CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
1318               0);
1319   return true;
1320 }
1321 
1322 bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
1323                                            SDValue &VAddr, SDValue &SOffset,
1324                                            SDValue &Offset) const {
1325   SDValue Ptr, Offen, Idxen, Addr64;
1326 
1327   // addr64 bit was removed for volcanic islands.
1328   // FIXME: This should be a pattern predicate and not reach here
1329   if (!Subtarget->hasAddr64())
1330     return false;
1331 
1332   if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
1333     return false;
1334 
1335   ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
1336   if (C->getSExtValue()) {
1337     SDLoc DL(Addr);
1338 
1339     const SITargetLowering& Lowering =
1340       *static_cast<const SITargetLowering*>(getTargetLowering());
1341 
1342     SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0);
1343     return true;
1344   }
1345 
1346   return false;
1347 }
1348 
1349 std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
1350   SDLoc DL(N);
1351 
1352   auto *FI = dyn_cast<FrameIndexSDNode>(N);
1353   SDValue TFI =
1354       FI ? CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)) : N;
1355 
1356   // We rebase the base address into an absolute stack address and hence
1357   // use constant 0 for soffset. This value must be retained until
1358   // frame elimination and eliminateFrameIndex will choose the appropriate
1359   // frame register if need be.
1360   return std::make_pair(TFI, CurDAG->getTargetConstant(0, DL, MVT::i32));
1361 }
1362 
1363 bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
1364                                                  SDValue Addr, SDValue &Rsrc,
1365                                                  SDValue &VAddr, SDValue &SOffset,
1366                                                  SDValue &ImmOffset) const {
1367 
1368   SDLoc DL(Addr);
1369   MachineFunction &MF = CurDAG->getMachineFunction();
1370   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1371 
1372   Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1373 
1374   if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1375     int64_t Imm = CAddr->getSExtValue();
1376     const int64_t NullPtr =
1377         AMDGPUTargetMachine::getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS);
1378     // Don't fold null pointer.
1379     if (Imm != NullPtr) {
1380       SDValue HighBits = CurDAG->getTargetConstant(Imm & ~4095, DL, MVT::i32);
1381       MachineSDNode *MovHighBits = CurDAG->getMachineNode(
1382         AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits);
1383       VAddr = SDValue(MovHighBits, 0);
1384 
1385       SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1386       ImmOffset = CurDAG->getTargetConstant(Imm & 4095, DL, MVT::i16);
1387       return true;
1388     }
1389   }
1390 
1391   if (CurDAG->isBaseWithConstantOffset(Addr)) {
1392     // (add n0, c1)
1393 
1394     SDValue N0 = Addr.getOperand(0);
1395     SDValue N1 = Addr.getOperand(1);
1396 
1397     // Offsets in vaddr must be positive if range checking is enabled.
1398     //
1399     // The total computation of vaddr + soffset + offset must not overflow.  If
1400     // vaddr is negative, even if offset is 0 the sgpr offset add will end up
1401     // overflowing.
1402     //
1403     // Prior to gfx9, MUBUF instructions with the vaddr offset enabled would
1404     // always perform a range check. If a negative vaddr base index was used,
1405     // this would fail the range check. The overall address computation would
1406     // compute a valid address, but this doesn't happen due to the range
1407     // check. For out-of-bounds MUBUF loads, a 0 is returned.
1408     //
1409     // Therefore it should be safe to fold any VGPR offset on gfx9 into the
1410     // MUBUF vaddr, but not on older subtargets which can only do this if the
1411     // sign bit is known 0.
1412     ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
1413     if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue()) &&
1414         (!Subtarget->privateMemoryResourceIsRangeChecked() ||
1415          CurDAG->SignBitIsZero(N0))) {
1416       std::tie(VAddr, SOffset) = foldFrameIndex(N0);
1417       ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
1418       return true;
1419     }
1420   }
1421 
1422   // (node)
1423   std::tie(VAddr, SOffset) = foldFrameIndex(Addr);
1424   ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i16);
1425   return true;
1426 }
1427 
1428 static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val) {
1429   if (Val.getOpcode() != ISD::CopyFromReg)
1430     return false;
1431   auto RC =
1432       TRI.getPhysRegClass(cast<RegisterSDNode>(Val.getOperand(1))->getReg());
1433   return RC && TRI.isSGPRClass(RC);
1434 }
1435 
1436 bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
1437                                                   SDValue Addr,
1438                                                   SDValue &SRsrc,
1439                                                   SDValue &SOffset,
1440                                                   SDValue &Offset) const {
1441   const SIRegisterInfo *TRI =
1442       static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
1443   MachineFunction &MF = CurDAG->getMachineFunction();
1444   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1445   SDLoc DL(Addr);
1446 
1447   // CopyFromReg <sgpr>
1448   if (IsCopyFromSGPR(*TRI, Addr)) {
1449     SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1450     SOffset = Addr;
1451     Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
1452     return true;
1453   }
1454 
1455   ConstantSDNode *CAddr;
1456   if (Addr.getOpcode() == ISD::ADD) {
1457     // Add (CopyFromReg <sgpr>) <constant>
1458     CAddr = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
1459     if (!CAddr || !SIInstrInfo::isLegalMUBUFImmOffset(CAddr->getZExtValue()))
1460       return false;
1461     if (!IsCopyFromSGPR(*TRI, Addr.getOperand(0)))
1462       return false;
1463 
1464     SOffset = Addr.getOperand(0);
1465   } else if ((CAddr = dyn_cast<ConstantSDNode>(Addr)) &&
1466              SIInstrInfo::isLegalMUBUFImmOffset(CAddr->getZExtValue())) {
1467     // <constant>
1468     SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1469   } else {
1470     return false;
1471   }
1472 
1473   SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1474 
1475   Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
1476   return true;
1477 }
1478 
1479 bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
1480                                            SDValue &SOffset, SDValue &Offset
1481                                            ) const {
1482   SDValue Ptr, VAddr, Offen, Idxen, Addr64;
1483   const SIInstrInfo *TII =
1484     static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
1485 
1486   if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
1487     return false;
1488 
1489   if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
1490       !cast<ConstantSDNode>(Idxen)->getSExtValue() &&
1491       !cast<ConstantSDNode>(Addr64)->getSExtValue()) {
1492     uint64_t Rsrc = TII->getDefaultRsrcDataFormat() |
1493                     APInt::getAllOnes(32).getZExtValue(); // Size
1494     SDLoc DL(Addr);
1495 
1496     const SITargetLowering& Lowering =
1497       *static_cast<const SITargetLowering*>(getTargetLowering());
1498 
1499     SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0);
1500     return true;
1501   }
1502   return false;
1503 }
1504 
1505 // Find a load or store from corresponding pattern root.
1506 // Roots may be build_vector, bitconvert or their combinations.
1507 static MemSDNode* findMemSDNode(SDNode *N) {
1508   N = AMDGPUTargetLowering::stripBitcast(SDValue(N,0)).getNode();
1509   if (MemSDNode *MN = dyn_cast<MemSDNode>(N))
1510     return MN;
1511   assert(isa<BuildVectorSDNode>(N));
1512   for (SDValue V : N->op_values())
1513     if (MemSDNode *MN =
1514           dyn_cast<MemSDNode>(AMDGPUTargetLowering::stripBitcast(V)))
1515       return MN;
1516   llvm_unreachable("cannot find MemSDNode in the pattern!");
1517 }
1518 
1519 bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
1520                                               SDValue &VAddr, SDValue &Offset,
1521                                               uint64_t FlatVariant) const {
1522   int64_t OffsetVal = 0;
1523 
1524   unsigned AS = findMemSDNode(N)->getAddressSpace();
1525 
1526   bool CanHaveFlatSegmentOffsetBug =
1527       Subtarget->hasFlatSegmentOffsetBug() &&
1528       FlatVariant == SIInstrFlags::FLAT &&
1529       (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::GLOBAL_ADDRESS);
1530 
1531   if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) {
1532     SDValue N0, N1;
1533     if (isBaseWithConstantOffset64(Addr, N0, N1)) {
1534       int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
1535 
1536       const SIInstrInfo *TII = Subtarget->getInstrInfo();
1537       if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
1538         Addr = N0;
1539         OffsetVal = COffsetVal;
1540       } else {
1541         // If the offset doesn't fit, put the low bits into the offset field and
1542         // add the rest.
1543         //
1544         // For a FLAT instruction the hardware decides whether to access
1545         // global/scratch/shared memory based on the high bits of vaddr,
1546         // ignoring the offset field, so we have to ensure that when we add
1547         // remainder to vaddr it still points into the same underlying object.
1548         // The easiest way to do that is to make sure that we split the offset
1549         // into two pieces that are both >= 0 or both <= 0.
1550 
1551         SDLoc DL(N);
1552         uint64_t RemainderOffset;
1553 
1554         std::tie(OffsetVal, RemainderOffset) =
1555             TII->splitFlatOffset(COffsetVal, AS, FlatVariant);
1556 
1557         SDValue AddOffsetLo =
1558             getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
1559         SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
1560 
1561         if (Addr.getValueType().getSizeInBits() == 32) {
1562           SmallVector<SDValue, 3> Opnds;
1563           Opnds.push_back(N0);
1564           Opnds.push_back(AddOffsetLo);
1565           unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
1566           if (Subtarget->hasAddNoCarry()) {
1567             AddOp = AMDGPU::V_ADD_U32_e64;
1568             Opnds.push_back(Clamp);
1569           }
1570           Addr = SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
1571         } else {
1572           // TODO: Should this try to use a scalar add pseudo if the base address
1573           // is uniform and saddr is usable?
1574           SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
1575           SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
1576 
1577           SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1578                                                 DL, MVT::i32, N0, Sub0);
1579           SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1580                                                 DL, MVT::i32, N0, Sub1);
1581 
1582           SDValue AddOffsetHi =
1583               getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
1584 
1585           SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
1586 
1587           SDNode *Add =
1588               CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
1589                                      {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
1590 
1591           SDNode *Addc = CurDAG->getMachineNode(
1592               AMDGPU::V_ADDC_U32_e64, DL, VTs,
1593               {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
1594 
1595           SDValue RegSequenceArgs[] = {
1596               CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
1597               SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};
1598 
1599           Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
1600                                                 MVT::i64, RegSequenceArgs),
1601                          0);
1602         }
1603       }
1604     }
1605   }
1606 
1607   VAddr = Addr;
1608   Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i16);
1609   return true;
1610 }
1611 
1612 bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, SDValue Addr,
1613                                           SDValue &VAddr,
1614                                           SDValue &Offset) const {
1615   return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FLAT);
1616 }
1617 
1618 bool AMDGPUDAGToDAGISel::SelectGlobalOffset(SDNode *N, SDValue Addr,
1619                                             SDValue &VAddr,
1620                                             SDValue &Offset) const {
1621   return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FlatGlobal);
1622 }
1623 
1624 bool AMDGPUDAGToDAGISel::SelectScratchOffset(SDNode *N, SDValue Addr,
1625                                              SDValue &VAddr,
1626                                              SDValue &Offset) const {
1627   return SelectFlatOffsetImpl(N, Addr, VAddr, Offset,
1628                               SIInstrFlags::FlatScratch);
1629 }
1630 
1631 // If this matches zero_extend i32:x, return x
1632 static SDValue matchZExtFromI32(SDValue Op) {
1633   if (Op.getOpcode() != ISD::ZERO_EXTEND)
1634     return SDValue();
1635 
1636   SDValue ExtSrc = Op.getOperand(0);
1637   return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue();
1638 }
1639 
1640 // Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
1641 bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
1642                                            SDValue Addr,
1643                                            SDValue &SAddr,
1644                                            SDValue &VOffset,
1645                                            SDValue &Offset) const {
1646   int64_t ImmOffset = 0;
1647 
1648   // Match the immediate offset first, which canonically is moved as low as
1649   // possible.
1650 
1651   SDValue LHS, RHS;
1652   if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
1653     int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
1654     const SIInstrInfo *TII = Subtarget->getInstrInfo();
1655 
1656     if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS,
1657                                SIInstrFlags::FlatGlobal)) {
1658       Addr = LHS;
1659       ImmOffset = COffsetVal;
1660     } else if (!LHS->isDivergent()) {
1661       if (COffsetVal > 0) {
1662         SDLoc SL(N);
1663         // saddr + large_offset -> saddr +
1664         //                         (voffset = large_offset & ~MaxOffset) +
1665         //                         (large_offset & MaxOffset);
1666         int64_t SplitImmOffset, RemainderOffset;
1667         std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
1668             COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
1669 
1670         if (isUInt<32>(RemainderOffset)) {
1671           SDNode *VMov = CurDAG->getMachineNode(
1672               AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
1673               CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
1674           VOffset = SDValue(VMov, 0);
1675           SAddr = LHS;
1676           Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i16);
1677           return true;
1678         }
1679       }
1680 
1681       // We are adding a 64 bit SGPR and a constant. If constant bus limit
1682       // is 1 we would need to perform 1 or 2 extra moves for each half of
1683       // the constant and it is better to do a scalar add and then issue a
1684       // single VALU instruction to materialize zero. Otherwise it is less
1685       // instructions to perform VALU adds with immediates or inline literals.
1686       unsigned NumLiterals =
1687           !TII->isInlineConstant(APInt(32, COffsetVal & 0xffffffff)) +
1688           !TII->isInlineConstant(APInt(32, COffsetVal >> 32));
1689       if (Subtarget->getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
1690         return false;
1691     }
1692   }
1693 
1694   // Match the variable offset.
1695   if (Addr.getOpcode() == ISD::ADD) {
1696     LHS = Addr.getOperand(0);
1697     RHS = Addr.getOperand(1);
1698 
1699     if (!LHS->isDivergent()) {
1700       // add (i64 sgpr), (zero_extend (i32 vgpr))
1701       if (SDValue ZextRHS = matchZExtFromI32(RHS)) {
1702         SAddr = LHS;
1703         VOffset = ZextRHS;
1704       }
1705     }
1706 
1707     if (!SAddr && !RHS->isDivergent()) {
1708       // add (zero_extend (i32 vgpr)), (i64 sgpr)
1709       if (SDValue ZextLHS = matchZExtFromI32(LHS)) {
1710         SAddr = RHS;
1711         VOffset = ZextLHS;
1712       }
1713     }
1714 
1715     if (SAddr) {
1716       Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
1717       return true;
1718     }
1719   }
1720 
1721   if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF ||
1722       isa<ConstantSDNode>(Addr))
1723     return false;
1724 
1725   // It's cheaper to materialize a single 32-bit zero for vaddr than the two
1726   // moves required to copy a 64-bit SGPR to VGPR.
1727   SAddr = Addr;
1728   SDNode *VMov =
1729       CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32,
1730                              CurDAG->getTargetConstant(0, SDLoc(), MVT::i32));
1731   VOffset = SDValue(VMov, 0);
1732   Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
1733   return true;
1734 }
1735 
1736 static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) {
1737   if (auto FI = dyn_cast<FrameIndexSDNode>(SAddr)) {
1738     SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
1739   } else if (SAddr.getOpcode() == ISD::ADD &&
1740              isa<FrameIndexSDNode>(SAddr.getOperand(0))) {
1741     // Materialize this into a scalar move for scalar address to avoid
1742     // readfirstlane.
1743     auto FI = cast<FrameIndexSDNode>(SAddr.getOperand(0));
1744     SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
1745                                               FI->getValueType(0));
1746     SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, SDLoc(SAddr),
1747                                            MVT::i32, TFI, SAddr.getOperand(1)),
1748                     0);
1749   }
1750 
1751   return SAddr;
1752 }
1753 
1754 // Match (32-bit SGPR base) + sext(imm offset)
1755 bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
1756                                             SDValue &SAddr,
1757                                             SDValue &Offset) const {
1758   if (Addr->isDivergent())
1759     return false;
1760 
1761   SDLoc DL(Addr);
1762 
1763   int64_t COffsetVal = 0;
1764 
1765   if (CurDAG->isBaseWithConstantOffset(Addr)) {
1766     COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
1767     SAddr = Addr.getOperand(0);
1768   } else {
1769     SAddr = Addr;
1770   }
1771 
1772   SAddr = SelectSAddrFI(CurDAG, SAddr);
1773 
1774   const SIInstrInfo *TII = Subtarget->getInstrInfo();
1775 
1776   if (!TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS,
1777                               SIInstrFlags::FlatScratch)) {
1778     int64_t SplitImmOffset, RemainderOffset;
1779     std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
1780         COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, SIInstrFlags::FlatScratch);
1781 
1782     COffsetVal = SplitImmOffset;
1783 
1784     SDValue AddOffset =
1785         SAddr.getOpcode() == ISD::TargetFrameIndex
1786             ? getMaterializedScalarImm32(Lo_32(RemainderOffset), DL)
1787             : CurDAG->getTargetConstant(RemainderOffset, DL, MVT::i32);
1788     SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, DL, MVT::i32,
1789                                            SAddr, AddOffset),
1790                     0);
1791   }
1792 
1793   Offset = CurDAG->getTargetConstant(COffsetVal, DL, MVT::i16);
1794 
1795   return true;
1796 }
1797 
1798 bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
1799                                              SDValue &VAddr, SDValue &SAddr,
1800                                              SDValue &Offset) const  {
1801   int64_t ImmOffset = 0;
1802 
1803   SDValue LHS, RHS;
1804   if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
1805     int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
1806     const SIInstrInfo *TII = Subtarget->getInstrInfo();
1807 
1808     if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true)) {
1809       Addr = LHS;
1810       ImmOffset = COffsetVal;
1811     } else if (!LHS->isDivergent() && COffsetVal > 0) {
1812       SDLoc SL(N);
1813       // saddr + large_offset -> saddr + (vaddr = large_offset & ~MaxOffset) +
1814       //                         (large_offset & MaxOffset);
1815       int64_t SplitImmOffset, RemainderOffset;
1816       std::tie(SplitImmOffset, RemainderOffset)
1817         = TII->splitFlatOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true);
1818 
1819       if (isUInt<32>(RemainderOffset)) {
1820         SDNode *VMov = CurDAG->getMachineNode(
1821           AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
1822           CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
1823         VAddr = SDValue(VMov, 0);
1824         SAddr = LHS;
1825         Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i16);
1826         return true;
1827       }
1828     }
1829   }
1830 
1831   if (Addr.getOpcode() != ISD::ADD)
1832     return false;
1833 
1834   LHS = Addr.getOperand(0);
1835   RHS = Addr.getOperand(1);
1836 
1837   if (!LHS->isDivergent() && RHS->isDivergent()) {
1838     SAddr = LHS;
1839     VAddr = RHS;
1840   } else if (!RHS->isDivergent() && LHS->isDivergent()) {
1841     SAddr = RHS;
1842     VAddr = LHS;
1843   } else {
1844     return false;
1845   }
1846 
1847   SAddr = SelectSAddrFI(CurDAG, SAddr);
1848   Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
1849   return true;
1850 }
1851 
1852 bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
1853                                           SDValue &Offset, bool &Imm) const {
1854   ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
1855   if (!C) {
1856     if (ByteOffsetNode.getValueType().isScalarInteger() &&
1857         ByteOffsetNode.getValueType().getSizeInBits() == 32) {
1858       Offset = ByteOffsetNode;
1859       Imm = false;
1860       return true;
1861     }
1862     if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) {
1863       if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) {
1864         Offset = ByteOffsetNode.getOperand(0);
1865         Imm = false;
1866         return true;
1867       }
1868     }
1869     return false;
1870   }
1871 
1872   SDLoc SL(ByteOffsetNode);
1873   // GFX9 and GFX10 have signed byte immediate offsets.
1874   int64_t ByteOffset = C->getSExtValue();
1875   Optional<int64_t> EncodedOffset =
1876       AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset, false);
1877   if (EncodedOffset) {
1878     Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
1879     Imm = true;
1880     return true;
1881   }
1882 
1883   // SGPR and literal offsets are unsigned.
1884   if (ByteOffset < 0)
1885     return false;
1886 
1887   EncodedOffset = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget, ByteOffset);
1888   if (EncodedOffset) {
1889     Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
1890     return true;
1891   }
1892 
1893   if (!isUInt<32>(ByteOffset) && !isInt<32>(ByteOffset))
1894     return false;
1895 
1896   SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
1897   Offset = SDValue(
1898       CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, C32Bit), 0);
1899 
1900   return true;
1901 }
1902 
1903 SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
1904   if (Addr.getValueType() != MVT::i32)
1905     return Addr;
1906 
1907   // Zero-extend a 32-bit address.
1908   SDLoc SL(Addr);
1909 
1910   const MachineFunction &MF = CurDAG->getMachineFunction();
1911   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1912   unsigned AddrHiVal = Info->get32BitAddressHighBits();
1913   SDValue AddrHi = CurDAG->getTargetConstant(AddrHiVal, SL, MVT::i32);
1914 
1915   const SDValue Ops[] = {
1916     CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32),
1917     Addr,
1918     CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
1919     SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi),
1920             0),
1921     CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32),
1922   };
1923 
1924   return SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64,
1925                                         Ops), 0);
1926 }
1927 
1928 bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
1929                                      SDValue &Offset, bool &Imm) const {
1930   SDLoc SL(Addr);
1931 
1932   // A 32-bit (address + offset) should not cause unsigned 32-bit integer
1933   // wraparound, because s_load instructions perform the addition in 64 bits.
1934   if ((Addr.getValueType() != MVT::i32 ||
1935        Addr->getFlags().hasNoUnsignedWrap())) {
1936     SDValue N0, N1;
1937     // Extract the base and offset if possible.
1938     if (CurDAG->isBaseWithConstantOffset(Addr) ||
1939         Addr.getOpcode() == ISD::ADD) {
1940       N0 = Addr.getOperand(0);
1941       N1 = Addr.getOperand(1);
1942     } else if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, N0, N1)) {
1943       assert(N0 && N1 && isa<ConstantSDNode>(N1));
1944     }
1945     if (N0 && N1) {
1946       if (SelectSMRDOffset(N1, Offset, Imm)) {
1947         SBase = Expand32BitAddress(N0);
1948         return true;
1949       }
1950     }
1951   }
1952   SBase = Expand32BitAddress(Addr);
1953   Offset = CurDAG->getTargetConstant(0, SL, MVT::i32);
1954   Imm = true;
1955   return true;
1956 }
1957 
1958 bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
1959                                        SDValue &Offset) const {
1960   bool Imm = false;
1961   return SelectSMRD(Addr, SBase, Offset, Imm) && Imm;
1962 }
1963 
1964 bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
1965                                          SDValue &Offset) const {
1966 
1967   assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
1968 
1969   bool Imm = false;
1970   if (!SelectSMRD(Addr, SBase, Offset, Imm))
1971     return false;
1972 
1973   return !Imm && isa<ConstantSDNode>(Offset);
1974 }
1975 
1976 bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase,
1977                                         SDValue &Offset) const {
1978   bool Imm = false;
1979   return SelectSMRD(Addr, SBase, Offset, Imm) && !Imm &&
1980          !isa<ConstantSDNode>(Offset);
1981 }
1982 
1983 bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue Addr,
1984                                              SDValue &Offset) const {
1985   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr)) {
1986     // The immediate offset for S_BUFFER instructions is unsigned.
1987     if (auto Imm =
1988             AMDGPU::getSMRDEncodedOffset(*Subtarget, C->getZExtValue(), true)) {
1989       Offset = CurDAG->getTargetConstant(*Imm, SDLoc(Addr), MVT::i32);
1990       return true;
1991     }
1992   }
1993 
1994   return false;
1995 }
1996 
1997 bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue Addr,
1998                                                SDValue &Offset) const {
1999   assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2000 
2001   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr)) {
2002     if (auto Imm = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget,
2003                                                          C->getZExtValue())) {
2004       Offset = CurDAG->getTargetConstant(*Imm, SDLoc(Addr), MVT::i32);
2005       return true;
2006     }
2007   }
2008 
2009   return false;
2010 }
2011 
2012 bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
2013                                             SDValue &Base,
2014                                             SDValue &Offset) const {
2015   SDLoc DL(Index);
2016 
2017   if (CurDAG->isBaseWithConstantOffset(Index)) {
2018     SDValue N0 = Index.getOperand(0);
2019     SDValue N1 = Index.getOperand(1);
2020     ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
2021 
2022     // (add n0, c0)
2023     // Don't peel off the offset (c0) if doing so could possibly lead
2024     // the base (n0) to be negative.
2025     // (or n0, |c0|) can never change a sign given isBaseWithConstantOffset.
2026     if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0) ||
2027         (Index->getOpcode() == ISD::OR && C1->getSExtValue() >= 0)) {
2028       Base = N0;
2029       Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
2030       return true;
2031     }
2032   }
2033 
2034   if (isa<ConstantSDNode>(Index))
2035     return false;
2036 
2037   Base = Index;
2038   Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
2039   return true;
2040 }
2041 
2042 SDNode *AMDGPUDAGToDAGISel::getBFE32(bool IsSigned, const SDLoc &DL,
2043                                      SDValue Val, uint32_t Offset,
2044                                      uint32_t Width) {
2045   if (Val->isDivergent()) {
2046     unsigned Opcode = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2047     SDValue Off = CurDAG->getTargetConstant(Offset, DL, MVT::i32);
2048     SDValue W = CurDAG->getTargetConstant(Width, DL, MVT::i32);
2049 
2050     return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, Off, W);
2051   }
2052   unsigned Opcode = IsSigned ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2053   // Transformation function, pack the offset and width of a BFE into
2054   // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
2055   // source, bits [5:0] contain the offset and bits [22:16] the width.
2056   uint32_t PackedVal = Offset | (Width << 16);
2057   SDValue PackedConst = CurDAG->getTargetConstant(PackedVal, DL, MVT::i32);
2058 
2059   return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst);
2060 }
2061 
2062 void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
2063   // "(a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c)
2064   // "(a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c)
2065   // Predicate: 0 < b <= c < 32
2066 
2067   const SDValue &Shl = N->getOperand(0);
2068   ConstantSDNode *B = dyn_cast<ConstantSDNode>(Shl->getOperand(1));
2069   ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
2070 
2071   if (B && C) {
2072     uint32_t BVal = B->getZExtValue();
2073     uint32_t CVal = C->getZExtValue();
2074 
2075     if (0 < BVal && BVal <= CVal && CVal < 32) {
2076       bool Signed = N->getOpcode() == ISD::SRA;
2077       ReplaceNode(N, getBFE32(Signed, SDLoc(N), Shl.getOperand(0), CVal - BVal,
2078                   32 - CVal));
2079       return;
2080     }
2081   }
2082   SelectCode(N);
2083 }
2084 
2085 void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
2086   switch (N->getOpcode()) {
2087   case ISD::AND:
2088     if (N->getOperand(0).getOpcode() == ISD::SRL) {
2089       // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)"
2090       // Predicate: isMask(mask)
2091       const SDValue &Srl = N->getOperand(0);
2092       ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Srl.getOperand(1));
2093       ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1));
2094 
2095       if (Shift && Mask) {
2096         uint32_t ShiftVal = Shift->getZExtValue();
2097         uint32_t MaskVal = Mask->getZExtValue();
2098 
2099         if (isMask_32(MaskVal)) {
2100           uint32_t WidthVal = countPopulation(MaskVal);
2101           ReplaceNode(N, getBFE32(false, SDLoc(N), Srl.getOperand(0), ShiftVal,
2102                                   WidthVal));
2103           return;
2104         }
2105       }
2106     }
2107     break;
2108   case ISD::SRL:
2109     if (N->getOperand(0).getOpcode() == ISD::AND) {
2110       // "(a & mask) srl b)" ---> "BFE_U32 a, b, popcount(mask >> b)"
2111       // Predicate: isMask(mask >> b)
2112       const SDValue &And = N->getOperand(0);
2113       ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(N->getOperand(1));
2114       ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(And->getOperand(1));
2115 
2116       if (Shift && Mask) {
2117         uint32_t ShiftVal = Shift->getZExtValue();
2118         uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal;
2119 
2120         if (isMask_32(MaskVal)) {
2121           uint32_t WidthVal = countPopulation(MaskVal);
2122           ReplaceNode(N, getBFE32(false, SDLoc(N), And.getOperand(0), ShiftVal,
2123                       WidthVal));
2124           return;
2125         }
2126       }
2127     } else if (N->getOperand(0).getOpcode() == ISD::SHL) {
2128       SelectS_BFEFromShifts(N);
2129       return;
2130     }
2131     break;
2132   case ISD::SRA:
2133     if (N->getOperand(0).getOpcode() == ISD::SHL) {
2134       SelectS_BFEFromShifts(N);
2135       return;
2136     }
2137     break;
2138 
2139   case ISD::SIGN_EXTEND_INREG: {
2140     // sext_inreg (srl x, 16), i8 -> bfe_i32 x, 16, 8
2141     SDValue Src = N->getOperand(0);
2142     if (Src.getOpcode() != ISD::SRL)
2143       break;
2144 
2145     const ConstantSDNode *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1));
2146     if (!Amt)
2147       break;
2148 
2149     unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
2150     ReplaceNode(N, getBFE32(true, SDLoc(N), Src.getOperand(0),
2151                             Amt->getZExtValue(), Width));
2152     return;
2153   }
2154   }
2155 
2156   SelectCode(N);
2157 }
2158 
2159 bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const {
2160   assert(N->getOpcode() == ISD::BRCOND);
2161   if (!N->hasOneUse())
2162     return false;
2163 
2164   SDValue Cond = N->getOperand(1);
2165   if (Cond.getOpcode() == ISD::CopyToReg)
2166     Cond = Cond.getOperand(2);
2167 
2168   if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
2169     return false;
2170 
2171   MVT VT = Cond.getOperand(0).getSimpleValueType();
2172   if (VT == MVT::i32)
2173     return true;
2174 
2175   if (VT == MVT::i64) {
2176     auto ST = static_cast<const GCNSubtarget *>(Subtarget);
2177 
2178     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
2179     return (CC == ISD::SETEQ || CC == ISD::SETNE) && ST->hasScalarCompareEq64();
2180   }
2181 
2182   return false;
2183 }
2184 
2185 void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
2186   SDValue Cond = N->getOperand(1);
2187 
2188   if (Cond.isUndef()) {
2189     CurDAG->SelectNodeTo(N, AMDGPU::SI_BR_UNDEF, MVT::Other,
2190                          N->getOperand(2), N->getOperand(0));
2191     return;
2192   }
2193 
2194   const GCNSubtarget *ST = static_cast<const GCNSubtarget *>(Subtarget);
2195   const SIRegisterInfo *TRI = ST->getRegisterInfo();
2196 
2197   bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
2198   unsigned BrOp = UseSCCBr ? AMDGPU::S_CBRANCH_SCC1 : AMDGPU::S_CBRANCH_VCCNZ;
2199   Register CondReg = UseSCCBr ? AMDGPU::SCC : TRI->getVCC();
2200   SDLoc SL(N);
2201 
2202   if (!UseSCCBr) {
2203     // This is the case that we are selecting to S_CBRANCH_VCCNZ.  We have not
2204     // analyzed what generates the vcc value, so we do not know whether vcc
2205     // bits for disabled lanes are 0.  Thus we need to mask out bits for
2206     // disabled lanes.
2207     //
2208     // For the case that we select S_CBRANCH_SCC1 and it gets
2209     // changed to S_CBRANCH_VCCNZ in SIFixSGPRCopies, SIFixSGPRCopies calls
2210     // SIInstrInfo::moveToVALU which inserts the S_AND).
2211     //
2212     // We could add an analysis of what generates the vcc value here and omit
2213     // the S_AND when is unnecessary. But it would be better to add a separate
2214     // pass after SIFixSGPRCopies to do the unnecessary S_AND removal, so it
2215     // catches both cases.
2216     Cond = SDValue(CurDAG->getMachineNode(ST->isWave32() ? AMDGPU::S_AND_B32
2217                                                          : AMDGPU::S_AND_B64,
2218                      SL, MVT::i1,
2219                      CurDAG->getRegister(ST->isWave32() ? AMDGPU::EXEC_LO
2220                                                         : AMDGPU::EXEC,
2221                                          MVT::i1),
2222                     Cond),
2223                    0);
2224   }
2225 
2226   SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, CondReg, Cond);
2227   CurDAG->SelectNodeTo(N, BrOp, MVT::Other,
2228                        N->getOperand(2), // Basic Block
2229                        VCC.getValue(0));
2230 }
2231 
2232 void AMDGPUDAGToDAGISel::SelectFMAD_FMA(SDNode *N) {
2233   MVT VT = N->getSimpleValueType(0);
2234   bool IsFMA = N->getOpcode() == ISD::FMA;
2235   if (VT != MVT::f32 || (!Subtarget->hasMadMixInsts() &&
2236                          !Subtarget->hasFmaMixInsts()) ||
2237       ((IsFMA && Subtarget->hasMadMixInsts()) ||
2238        (!IsFMA && Subtarget->hasFmaMixInsts()))) {
2239     SelectCode(N);
2240     return;
2241   }
2242 
2243   SDValue Src0 = N->getOperand(0);
2244   SDValue Src1 = N->getOperand(1);
2245   SDValue Src2 = N->getOperand(2);
2246   unsigned Src0Mods, Src1Mods, Src2Mods;
2247 
2248   // Avoid using v_mad_mix_f32/v_fma_mix_f32 unless there is actually an operand
2249   // using the conversion from f16.
2250   bool Sel0 = SelectVOP3PMadMixModsImpl(Src0, Src0, Src0Mods);
2251   bool Sel1 = SelectVOP3PMadMixModsImpl(Src1, Src1, Src1Mods);
2252   bool Sel2 = SelectVOP3PMadMixModsImpl(Src2, Src2, Src2Mods);
2253 
2254   assert((IsFMA || !Mode.allFP32Denormals()) &&
2255          "fmad selected with denormals enabled");
2256   // TODO: We can select this with f32 denormals enabled if all the sources are
2257   // converted from f16 (in which case fmad isn't legal).
2258 
2259   if (Sel0 || Sel1 || Sel2) {
2260     // For dummy operands.
2261     SDValue Zero = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
2262     SDValue Ops[] = {
2263       CurDAG->getTargetConstant(Src0Mods, SDLoc(), MVT::i32), Src0,
2264       CurDAG->getTargetConstant(Src1Mods, SDLoc(), MVT::i32), Src1,
2265       CurDAG->getTargetConstant(Src2Mods, SDLoc(), MVT::i32), Src2,
2266       CurDAG->getTargetConstant(0, SDLoc(), MVT::i1),
2267       Zero, Zero
2268     };
2269 
2270     CurDAG->SelectNodeTo(N,
2271                          IsFMA ? AMDGPU::V_FMA_MIX_F32 : AMDGPU::V_MAD_MIX_F32,
2272                          MVT::f32, Ops);
2273   } else {
2274     SelectCode(N);
2275   }
2276 }
2277 
2278 void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
2279   // The address is assumed to be uniform, so if it ends up in a VGPR, it will
2280   // be copied to an SGPR with readfirstlane.
2281   unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
2282     AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
2283 
2284   SDValue Chain = N->getOperand(0);
2285   SDValue Ptr = N->getOperand(2);
2286   MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2287   MachineMemOperand *MMO = M->getMemOperand();
2288   bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
2289 
2290   SDValue Offset;
2291   if (CurDAG->isBaseWithConstantOffset(Ptr)) {
2292     SDValue PtrBase = Ptr.getOperand(0);
2293     SDValue PtrOffset = Ptr.getOperand(1);
2294 
2295     const APInt &OffsetVal = cast<ConstantSDNode>(PtrOffset)->getAPIntValue();
2296     if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue())) {
2297       N = glueCopyToM0(N, PtrBase);
2298       Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
2299     }
2300   }
2301 
2302   if (!Offset) {
2303     N = glueCopyToM0(N, Ptr);
2304     Offset = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
2305   }
2306 
2307   SDValue Ops[] = {
2308     Offset,
2309     CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32),
2310     Chain,
2311     N->getOperand(N->getNumOperands() - 1) // New glue
2312   };
2313 
2314   SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2315   CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2316 }
2317 
2318 static unsigned gwsIntrinToOpcode(unsigned IntrID) {
2319   switch (IntrID) {
2320   case Intrinsic::amdgcn_ds_gws_init:
2321     return AMDGPU::DS_GWS_INIT;
2322   case Intrinsic::amdgcn_ds_gws_barrier:
2323     return AMDGPU::DS_GWS_BARRIER;
2324   case Intrinsic::amdgcn_ds_gws_sema_v:
2325     return AMDGPU::DS_GWS_SEMA_V;
2326   case Intrinsic::amdgcn_ds_gws_sema_br:
2327     return AMDGPU::DS_GWS_SEMA_BR;
2328   case Intrinsic::amdgcn_ds_gws_sema_p:
2329     return AMDGPU::DS_GWS_SEMA_P;
2330   case Intrinsic::amdgcn_ds_gws_sema_release_all:
2331     return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
2332   default:
2333     llvm_unreachable("not a gws intrinsic");
2334   }
2335 }
2336 
2337 void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
2338   if (IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
2339       !Subtarget->hasGWSSemaReleaseAll()) {
2340     // Let this error.
2341     SelectCode(N);
2342     return;
2343   }
2344 
2345   // Chain, intrinsic ID, vsrc, offset
2346   const bool HasVSrc = N->getNumOperands() == 4;
2347   assert(HasVSrc || N->getNumOperands() == 3);
2348 
2349   SDLoc SL(N);
2350   SDValue BaseOffset = N->getOperand(HasVSrc ? 3 : 2);
2351   int ImmOffset = 0;
2352   MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2353   MachineMemOperand *MMO = M->getMemOperand();
2354 
2355   // Don't worry if the offset ends up in a VGPR. Only one lane will have
2356   // effect, so SIFixSGPRCopies will validly insert readfirstlane.
2357 
2358   // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
2359   // offset field) % 64. Some versions of the programming guide omit the m0
2360   // part, or claim it's from offset 0.
2361   if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(BaseOffset)) {
2362     // If we have a constant offset, try to use the 0 in m0 as the base.
2363     // TODO: Look into changing the default m0 initialization value. If the
2364     // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
2365     // the immediate offset.
2366     glueCopyToM0(N, CurDAG->getTargetConstant(0, SL, MVT::i32));
2367     ImmOffset = ConstOffset->getZExtValue();
2368   } else {
2369     if (CurDAG->isBaseWithConstantOffset(BaseOffset)) {
2370       ImmOffset = BaseOffset.getConstantOperandVal(1);
2371       BaseOffset = BaseOffset.getOperand(0);
2372     }
2373 
2374     // Prefer to do the shift in an SGPR since it should be possible to use m0
2375     // as the result directly. If it's already an SGPR, it will be eliminated
2376     // later.
2377     SDNode *SGPROffset
2378       = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32,
2379                                BaseOffset);
2380     // Shift to offset in m0
2381     SDNode *M0Base
2382       = CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
2383                                SDValue(SGPROffset, 0),
2384                                CurDAG->getTargetConstant(16, SL, MVT::i32));
2385     glueCopyToM0(N, SDValue(M0Base, 0));
2386   }
2387 
2388   SDValue Chain = N->getOperand(0);
2389   SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32);
2390 
2391   const unsigned Opc = gwsIntrinToOpcode(IntrID);
2392   SmallVector<SDValue, 5> Ops;
2393   if (HasVSrc)
2394     Ops.push_back(N->getOperand(2));
2395   Ops.push_back(OffsetField);
2396   Ops.push_back(Chain);
2397 
2398   SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2399   CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2400 }
2401 
2402 void AMDGPUDAGToDAGISel::SelectInterpP1F16(SDNode *N) {
2403   if (Subtarget->getLDSBankCount() != 16) {
2404     // This is a single instruction with a pattern.
2405     SelectCode(N);
2406     return;
2407   }
2408 
2409   SDLoc DL(N);
2410 
2411   // This requires 2 instructions. It is possible to write a pattern to support
2412   // this, but the generated isel emitter doesn't correctly deal with multiple
2413   // output instructions using the same physical register input. The copy to m0
2414   // is incorrectly placed before the second instruction.
2415   //
2416   // TODO: Match source modifiers.
2417   //
2418   // def : Pat <
2419   //   (int_amdgcn_interp_p1_f16
2420   //    (VOP3Mods f32:$src0, i32:$src0_modifiers),
2421   //                             (i32 timm:$attrchan), (i32 timm:$attr),
2422   //                             (i1 timm:$high), M0),
2423   //   (V_INTERP_P1LV_F16 $src0_modifiers, VGPR_32:$src0, timm:$attr,
2424   //       timm:$attrchan, 0,
2425   //       (V_INTERP_MOV_F32 2, timm:$attr, timm:$attrchan), timm:$high)> {
2426   //   let Predicates = [has16BankLDS];
2427   // }
2428 
2429   // 16 bank LDS
2430   SDValue ToM0 = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, AMDGPU::M0,
2431                                       N->getOperand(5), SDValue());
2432 
2433   SDVTList VTs = CurDAG->getVTList(MVT::f32, MVT::Other);
2434 
2435   SDNode *InterpMov =
2436     CurDAG->getMachineNode(AMDGPU::V_INTERP_MOV_F32, DL, VTs, {
2437         CurDAG->getTargetConstant(2, DL, MVT::i32), // P0
2438         N->getOperand(3),  // Attr
2439         N->getOperand(2),  // Attrchan
2440         ToM0.getValue(1) // In glue
2441   });
2442 
2443   SDNode *InterpP1LV =
2444     CurDAG->getMachineNode(AMDGPU::V_INTERP_P1LV_F16, DL, MVT::f32, {
2445         CurDAG->getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
2446         N->getOperand(1), // Src0
2447         N->getOperand(3), // Attr
2448         N->getOperand(2), // Attrchan
2449         CurDAG->getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
2450         SDValue(InterpMov, 0), // Src2 - holds two f16 values selected by high
2451         N->getOperand(4), // high
2452         CurDAG->getTargetConstant(0, DL, MVT::i1), // $clamp
2453         CurDAG->getTargetConstant(0, DL, MVT::i32), // $omod
2454         SDValue(InterpMov, 1)
2455   });
2456 
2457   CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), SDValue(InterpP1LV, 0));
2458 }
2459 
2460 void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
2461   unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
2462   switch (IntrID) {
2463   case Intrinsic::amdgcn_ds_append:
2464   case Intrinsic::amdgcn_ds_consume: {
2465     if (N->getValueType(0) != MVT::i32)
2466       break;
2467     SelectDSAppendConsume(N, IntrID);
2468     return;
2469   }
2470   }
2471 
2472   SelectCode(N);
2473 }
2474 
2475 void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
2476   unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
2477   unsigned Opcode;
2478   switch (IntrID) {
2479   case Intrinsic::amdgcn_wqm:
2480     Opcode = AMDGPU::WQM;
2481     break;
2482   case Intrinsic::amdgcn_softwqm:
2483     Opcode = AMDGPU::SOFT_WQM;
2484     break;
2485   case Intrinsic::amdgcn_wwm:
2486   case Intrinsic::amdgcn_strict_wwm:
2487     Opcode = AMDGPU::STRICT_WWM;
2488     break;
2489   case Intrinsic::amdgcn_strict_wqm:
2490     Opcode = AMDGPU::STRICT_WQM;
2491     break;
2492   case Intrinsic::amdgcn_interp_p1_f16:
2493     SelectInterpP1F16(N);
2494     return;
2495   default:
2496     SelectCode(N);
2497     return;
2498   }
2499 
2500   SDValue Src = N->getOperand(1);
2501   CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), {Src});
2502 }
2503 
2504 void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
2505   unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
2506   switch (IntrID) {
2507   case Intrinsic::amdgcn_ds_gws_init:
2508   case Intrinsic::amdgcn_ds_gws_barrier:
2509   case Intrinsic::amdgcn_ds_gws_sema_v:
2510   case Intrinsic::amdgcn_ds_gws_sema_br:
2511   case Intrinsic::amdgcn_ds_gws_sema_p:
2512   case Intrinsic::amdgcn_ds_gws_sema_release_all:
2513     SelectDS_GWS(N, IntrID);
2514     return;
2515   default:
2516     break;
2517   }
2518 
2519   SelectCode(N);
2520 }
2521 
2522 bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
2523                                             unsigned &Mods,
2524                                             bool AllowAbs) const {
2525   Mods = 0;
2526   Src = In;
2527 
2528   if (Src.getOpcode() == ISD::FNEG) {
2529     Mods |= SISrcMods::NEG;
2530     Src = Src.getOperand(0);
2531   }
2532 
2533   if (AllowAbs && Src.getOpcode() == ISD::FABS) {
2534     Mods |= SISrcMods::ABS;
2535     Src = Src.getOperand(0);
2536   }
2537 
2538   return true;
2539 }
2540 
2541 bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
2542                                         SDValue &SrcMods) const {
2543   unsigned Mods;
2544   if (SelectVOP3ModsImpl(In, Src, Mods)) {
2545     SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2546     return true;
2547   }
2548 
2549   return false;
2550 }
2551 
2552 bool AMDGPUDAGToDAGISel::SelectVOP3BMods(SDValue In, SDValue &Src,
2553                                          SDValue &SrcMods) const {
2554   unsigned Mods;
2555   if (SelectVOP3ModsImpl(In, Src, Mods, /* AllowAbs */ false)) {
2556     SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2557     return true;
2558   }
2559 
2560   return false;
2561 }
2562 
2563 bool AMDGPUDAGToDAGISel::SelectVOP3Mods_NNaN(SDValue In, SDValue &Src,
2564                                              SDValue &SrcMods) const {
2565   SelectVOP3Mods(In, Src, SrcMods);
2566   return isNoNanSrc(Src);
2567 }
2568 
2569 bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
2570   if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG)
2571     return false;
2572 
2573   Src = In;
2574   return true;
2575 }
2576 
2577 bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
2578                                          SDValue &SrcMods, SDValue &Clamp,
2579                                          SDValue &Omod) const {
2580   SDLoc DL(In);
2581   Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
2582   Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
2583 
2584   return SelectVOP3Mods(In, Src, SrcMods);
2585 }
2586 
2587 bool AMDGPUDAGToDAGISel::SelectVOP3BMods0(SDValue In, SDValue &Src,
2588                                           SDValue &SrcMods, SDValue &Clamp,
2589                                           SDValue &Omod) const {
2590   SDLoc DL(In);
2591   Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
2592   Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
2593 
2594   return SelectVOP3BMods(In, Src, SrcMods);
2595 }
2596 
2597 bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
2598                                          SDValue &Clamp, SDValue &Omod) const {
2599   Src = In;
2600 
2601   SDLoc DL(In);
2602   Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
2603   Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
2604 
2605   return true;
2606 }
2607 
2608 bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
2609                                          SDValue &SrcMods, bool IsDOT) const {
2610   unsigned Mods = 0;
2611   Src = In;
2612 
2613   if (Src.getOpcode() == ISD::FNEG) {
2614     Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
2615     Src = Src.getOperand(0);
2616   }
2617 
2618   if (Src.getOpcode() == ISD::BUILD_VECTOR &&
2619       (!IsDOT || !Subtarget->hasDOTOpSelHazard())) {
2620     unsigned VecMods = Mods;
2621 
2622     SDValue Lo = stripBitcast(Src.getOperand(0));
2623     SDValue Hi = stripBitcast(Src.getOperand(1));
2624 
2625     if (Lo.getOpcode() == ISD::FNEG) {
2626       Lo = stripBitcast(Lo.getOperand(0));
2627       Mods ^= SISrcMods::NEG;
2628     }
2629 
2630     if (Hi.getOpcode() == ISD::FNEG) {
2631       Hi = stripBitcast(Hi.getOperand(0));
2632       Mods ^= SISrcMods::NEG_HI;
2633     }
2634 
2635     if (isExtractHiElt(Lo, Lo))
2636       Mods |= SISrcMods::OP_SEL_0;
2637 
2638     if (isExtractHiElt(Hi, Hi))
2639       Mods |= SISrcMods::OP_SEL_1;
2640 
2641     unsigned VecSize = Src.getValueSizeInBits();
2642     Lo = stripExtractLoElt(Lo);
2643     Hi = stripExtractLoElt(Hi);
2644 
2645     if (Lo.getValueSizeInBits() > VecSize) {
2646       Lo = CurDAG->getTargetExtractSubreg(
2647         (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
2648         MVT::getIntegerVT(VecSize), Lo);
2649     }
2650 
2651     if (Hi.getValueSizeInBits() > VecSize) {
2652       Hi = CurDAG->getTargetExtractSubreg(
2653         (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
2654         MVT::getIntegerVT(VecSize), Hi);
2655     }
2656 
2657     assert(Lo.getValueSizeInBits() <= VecSize &&
2658            Hi.getValueSizeInBits() <= VecSize);
2659 
2660     if (Lo == Hi && !isInlineImmediate(Lo.getNode())) {
2661       // Really a scalar input. Just select from the low half of the register to
2662       // avoid packing.
2663 
2664       if (VecSize == 32 || VecSize == Lo.getValueSizeInBits()) {
2665         Src = Lo;
2666       } else {
2667         assert(Lo.getValueSizeInBits() == 32 && VecSize == 64);
2668 
2669         SDLoc SL(In);
2670         SDValue Undef = SDValue(
2671           CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL,
2672                                  Lo.getValueType()), 0);
2673         auto RC = Lo->isDivergent() ? AMDGPU::VReg_64RegClassID
2674                                     : AMDGPU::SReg_64RegClassID;
2675         const SDValue Ops[] = {
2676           CurDAG->getTargetConstant(RC, SL, MVT::i32),
2677           Lo, CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
2678           Undef, CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32) };
2679 
2680         Src = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
2681                                              Src.getValueType(), Ops), 0);
2682       }
2683       SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2684       return true;
2685     }
2686 
2687     if (VecSize == 64 && Lo == Hi && isa<ConstantFPSDNode>(Lo)) {
2688       uint64_t Lit = cast<ConstantFPSDNode>(Lo)->getValueAPF()
2689                       .bitcastToAPInt().getZExtValue();
2690       if (AMDGPU::isInlinableLiteral32(Lit, Subtarget->hasInv2PiInlineImm())) {
2691         Src = CurDAG->getTargetConstant(Lit, SDLoc(In), MVT::i64);;
2692         SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2693         return true;
2694       }
2695     }
2696 
2697     Mods = VecMods;
2698   }
2699 
2700   // Packed instructions do not have abs modifiers.
2701   Mods |= SISrcMods::OP_SEL_1;
2702 
2703   SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2704   return true;
2705 }
2706 
2707 bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src,
2708                                             SDValue &SrcMods) const {
2709   return SelectVOP3PMods(In, Src, SrcMods, true);
2710 }
2711 
2712 bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
2713                                          SDValue &SrcMods) const {
2714   Src = In;
2715   // FIXME: Handle op_sel
2716   SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
2717   return true;
2718 }
2719 
2720 bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
2721                                              SDValue &SrcMods) const {
2722   // FIXME: Handle op_sel
2723   return SelectVOP3Mods(In, Src, SrcMods);
2724 }
2725 
2726 // The return value is not whether the match is possible (which it always is),
2727 // but whether or not it a conversion is really used.
2728 bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
2729                                                    unsigned &Mods) const {
2730   Mods = 0;
2731   SelectVOP3ModsImpl(In, Src, Mods);
2732 
2733   if (Src.getOpcode() == ISD::FP_EXTEND) {
2734     Src = Src.getOperand(0);
2735     assert(Src.getValueType() == MVT::f16);
2736     Src = stripBitcast(Src);
2737 
2738     // Be careful about folding modifiers if we already have an abs. fneg is
2739     // applied last, so we don't want to apply an earlier fneg.
2740     if ((Mods & SISrcMods::ABS) == 0) {
2741       unsigned ModsTmp;
2742       SelectVOP3ModsImpl(Src, Src, ModsTmp);
2743 
2744       if ((ModsTmp & SISrcMods::NEG) != 0)
2745         Mods ^= SISrcMods::NEG;
2746 
2747       if ((ModsTmp & SISrcMods::ABS) != 0)
2748         Mods |= SISrcMods::ABS;
2749     }
2750 
2751     // op_sel/op_sel_hi decide the source type and source.
2752     // If the source's op_sel_hi is set, it indicates to do a conversion from fp16.
2753     // If the sources's op_sel is set, it picks the high half of the source
2754     // register.
2755 
2756     Mods |= SISrcMods::OP_SEL_1;
2757     if (isExtractHiElt(Src, Src)) {
2758       Mods |= SISrcMods::OP_SEL_0;
2759 
2760       // TODO: Should we try to look for neg/abs here?
2761     }
2762 
2763     return true;
2764   }
2765 
2766   return false;
2767 }
2768 
2769 bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
2770                                                SDValue &SrcMods) const {
2771   unsigned Mods = 0;
2772   SelectVOP3PMadMixModsImpl(In, Src, Mods);
2773   SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2774   return true;
2775 }
2776 
2777 SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
2778   if (In.isUndef())
2779     return CurDAG->getUNDEF(MVT::i32);
2780 
2781   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) {
2782     SDLoc SL(In);
2783     return CurDAG->getConstant(C->getZExtValue() << 16, SL, MVT::i32);
2784   }
2785 
2786   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) {
2787     SDLoc SL(In);
2788     return CurDAG->getConstant(
2789       C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
2790   }
2791 
2792   SDValue Src;
2793   if (isExtractHiElt(In, Src))
2794     return Src;
2795 
2796   return SDValue();
2797 }
2798 
2799 bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
2800   assert(CurDAG->getTarget().getTargetTriple().getArch() == Triple::amdgcn);
2801 
2802   const SIRegisterInfo *SIRI =
2803     static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
2804   const SIInstrInfo * SII =
2805     static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
2806 
2807   unsigned Limit = 0;
2808   bool AllUsesAcceptSReg = true;
2809   for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
2810     Limit < 10 && U != E; ++U, ++Limit) {
2811     const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo());
2812 
2813     // If the register class is unknown, it could be an unknown
2814     // register class that needs to be an SGPR, e.g. an inline asm
2815     // constraint
2816     if (!RC || SIRI->isSGPRClass(RC))
2817       return false;
2818 
2819     if (RC != &AMDGPU::VS_32RegClass) {
2820       AllUsesAcceptSReg = false;
2821       SDNode * User = *U;
2822       if (User->isMachineOpcode()) {
2823         unsigned Opc = User->getMachineOpcode();
2824         MCInstrDesc Desc = SII->get(Opc);
2825         if (Desc.isCommutable()) {
2826           unsigned OpIdx = Desc.getNumDefs() + U.getOperandNo();
2827           unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
2828           if (SII->findCommutedOpIndices(Desc, OpIdx, CommuteIdx1)) {
2829             unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs();
2830             const TargetRegisterClass *CommutedRC = getOperandRegClass(*U, CommutedOpNo);
2831             if (CommutedRC == &AMDGPU::VS_32RegClass)
2832               AllUsesAcceptSReg = true;
2833           }
2834         }
2835       }
2836       // If "AllUsesAcceptSReg == false" so far we haven't succeeded
2837       // commuting current user. This means have at least one use
2838       // that strictly require VGPR. Thus, we will not attempt to commute
2839       // other user instructions.
2840       if (!AllUsesAcceptSReg)
2841         break;
2842     }
2843   }
2844   return !AllUsesAcceptSReg && (Limit < 10);
2845 }
2846 
2847 bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode * N) const {
2848   auto Ld = cast<LoadSDNode>(N);
2849 
2850   return Ld->getAlignment() >= 4 &&
2851         (
2852           (
2853             (
2854               Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS       ||
2855               Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT
2856             )
2857             &&
2858             !N->isDivergent()
2859           )
2860           ||
2861           (
2862             Subtarget->getScalarizeGlobalBehavior() &&
2863             Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
2864             Ld->isSimple() &&
2865             !N->isDivergent() &&
2866             static_cast<const SITargetLowering *>(
2867               getTargetLowering())->isMemOpHasNoClobberedMemOperand(N)
2868           )
2869         );
2870 }
2871 
2872 void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
2873   const AMDGPUTargetLowering& Lowering =
2874     *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
2875   bool IsModified = false;
2876   do {
2877     IsModified = false;
2878 
2879     // Go over all selected nodes and try to fold them a bit more
2880     SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_begin();
2881     while (Position != CurDAG->allnodes_end()) {
2882       SDNode *Node = &*Position++;
2883       MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(Node);
2884       if (!MachineNode)
2885         continue;
2886 
2887       SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
2888       if (ResNode != Node) {
2889         if (ResNode)
2890           ReplaceUses(Node, ResNode);
2891         IsModified = true;
2892       }
2893     }
2894     CurDAG->RemoveDeadNodes();
2895   } while (IsModified);
2896 }
2897