1 //===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //==-----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Defines an instruction selector for the AMDGPU target.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPUISelDAGToDAG.h"
15 #include "AMDGPU.h"
16 #include "AMDGPUTargetMachine.h"
17 #include "MCTargetDesc/R600MCTargetDesc.h"
18 #include "R600RegisterInfo.h"
19 #include "SIMachineFunctionInfo.h"
20 #include "llvm/Analysis/LegacyDivergenceAnalysis.h"
21 #include "llvm/Analysis/ValueTracking.h"
22 #include "llvm/CodeGen/FunctionLoweringInfo.h"
23 #include "llvm/CodeGen/SelectionDAG.h"
24 #include "llvm/CodeGen/SelectionDAGISel.h"
25 #include "llvm/CodeGen/SelectionDAGNodes.h"
26 #include "llvm/IR/IntrinsicsAMDGPU.h"
27 #include "llvm/InitializePasses.h"
28 
29 #ifdef EXPENSIVE_CHECKS
30 #include "llvm/Analysis/LoopInfo.h"
31 #include "llvm/IR/Dominators.h"
32 #endif
33 
34 #define DEBUG_TYPE "isel"
35 
36 using namespace llvm;
37 
38 //===----------------------------------------------------------------------===//
39 // Instruction Selector Implementation
40 //===----------------------------------------------------------------------===//
41 
42 namespace {
43 
44 static SDValue stripBitcast(SDValue Val) {
45   return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
46 }
47 
48 // Figure out if this is really an extract of the high 16-bits of a dword.
49 static bool isExtractHiElt(SDValue In, SDValue &Out) {
50   In = stripBitcast(In);
51 
52   if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
53     if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
54       if (!Idx->isOne())
55         return false;
56       Out = In.getOperand(0);
57       return true;
58     }
59   }
60 
61   if (In.getOpcode() != ISD::TRUNCATE)
62     return false;
63 
64   SDValue Srl = In.getOperand(0);
65   if (Srl.getOpcode() == ISD::SRL) {
66     if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
67       if (ShiftAmt->getZExtValue() == 16) {
68         Out = stripBitcast(Srl.getOperand(0));
69         return true;
70       }
71     }
72   }
73 
74   return false;
75 }
76 
77 // Look through operations that obscure just looking at the low 16-bits of the
78 // same register.
79 static SDValue stripExtractLoElt(SDValue In) {
80   if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
81     if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
82       if (Idx->isZero() && In.getValueSizeInBits() <= 32)
83         return In.getOperand(0);
84     }
85   }
86 
87   if (In.getOpcode() == ISD::TRUNCATE) {
88     SDValue Src = In.getOperand(0);
89     if (Src.getValueType().getSizeInBits() == 32)
90       return stripBitcast(Src);
91   }
92 
93   return In;
94 }
95 
96 }  // end anonymous namespace
97 
98 INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISel, "amdgpu-isel",
99                       "AMDGPU DAG->DAG Pattern Instruction Selection", false, false)
100 INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo)
101 INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysis)
102 INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
103 #ifdef EXPENSIVE_CHECKS
104 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
105 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
106 #endif
107 INITIALIZE_PASS_END(AMDGPUDAGToDAGISel, "amdgpu-isel",
108                     "AMDGPU DAG->DAG Pattern Instruction Selection", false, false)
109 
110 /// This pass converts a legalized DAG into a AMDGPU-specific
111 // DAG, ready for instruction scheduling.
112 FunctionPass *llvm::createAMDGPUISelDag(TargetMachine *TM,
113                                         CodeGenOpt::Level OptLevel) {
114   return new AMDGPUDAGToDAGISel(TM, OptLevel);
115 }
116 
117 AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(
118     TargetMachine *TM /*= nullptr*/,
119     CodeGenOpt::Level OptLevel /*= CodeGenOpt::Default*/)
120     : SelectionDAGISel(*TM, OptLevel) {
121   EnableLateStructurizeCFG = AMDGPUTargetMachine::EnableLateStructurizeCFG;
122 }
123 
124 bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
125 #ifdef EXPENSIVE_CHECKS
126   DominatorTree & DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
127   LoopInfo * LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
128   for (auto &L : LI->getLoopsInPreorder()) {
129     assert(L->isLCSSAForm(DT));
130   }
131 #endif
132   Subtarget = &MF.getSubtarget<GCNSubtarget>();
133   Mode = AMDGPU::SIModeRegisterDefaults(MF.getFunction());
134   return SelectionDAGISel::runOnMachineFunction(MF);
135 }
136 
137 bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const {
138   // XXX - only need to list legal operations.
139   switch (Opc) {
140   case ISD::FADD:
141   case ISD::FSUB:
142   case ISD::FMUL:
143   case ISD::FDIV:
144   case ISD::FREM:
145   case ISD::FCANONICALIZE:
146   case ISD::UINT_TO_FP:
147   case ISD::SINT_TO_FP:
148   case ISD::FABS:
149     // Fabs is lowered to a bit operation, but it's an and which will clear the
150     // high bits anyway.
151   case ISD::FSQRT:
152   case ISD::FSIN:
153   case ISD::FCOS:
154   case ISD::FPOWI:
155   case ISD::FPOW:
156   case ISD::FLOG:
157   case ISD::FLOG2:
158   case ISD::FLOG10:
159   case ISD::FEXP:
160   case ISD::FEXP2:
161   case ISD::FCEIL:
162   case ISD::FTRUNC:
163   case ISD::FRINT:
164   case ISD::FNEARBYINT:
165   case ISD::FROUND:
166   case ISD::FFLOOR:
167   case ISD::FMINNUM:
168   case ISD::FMAXNUM:
169   case AMDGPUISD::FRACT:
170   case AMDGPUISD::CLAMP:
171   case AMDGPUISD::COS_HW:
172   case AMDGPUISD::SIN_HW:
173   case AMDGPUISD::FMIN3:
174   case AMDGPUISD::FMAX3:
175   case AMDGPUISD::FMED3:
176   case AMDGPUISD::FMAD_FTZ:
177   case AMDGPUISD::RCP:
178   case AMDGPUISD::RSQ:
179   case AMDGPUISD::RCP_IFLAG:
180   case AMDGPUISD::LDEXP:
181     // On gfx10, all 16-bit instructions preserve the high bits.
182     return Subtarget->getGeneration() <= AMDGPUSubtarget::GFX9;
183   case ISD::FP_ROUND:
184     // We may select fptrunc (fma/mad) to mad_mixlo, which does not zero the
185     // high bits on gfx9.
186     // TODO: If we had the source node we could see if the source was fma/mad
187     return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
188   case ISD::FMA:
189   case ISD::FMAD:
190   case AMDGPUISD::DIV_FIXUP:
191     return Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
192   default:
193     // fcopysign, select and others may be lowered to 32-bit bit operations
194     // which don't zero the high bits.
195     return false;
196   }
197 }
198 
199 void AMDGPUDAGToDAGISel::getAnalysisUsage(AnalysisUsage &AU) const {
200   AU.addRequired<AMDGPUArgumentUsageInfo>();
201   AU.addRequired<LegacyDivergenceAnalysis>();
202 #ifdef EXPENSIVE_CHECKS
203   AU.addRequired<DominatorTreeWrapperPass>();
204   AU.addRequired<LoopInfoWrapperPass>();
205 #endif
206   SelectionDAGISel::getAnalysisUsage(AU);
207 }
208 
209 bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
210   assert(Subtarget->d16PreservesUnusedBits());
211   MVT VT = N->getValueType(0).getSimpleVT();
212   if (VT != MVT::v2i16 && VT != MVT::v2f16)
213     return false;
214 
215   SDValue Lo = N->getOperand(0);
216   SDValue Hi = N->getOperand(1);
217 
218   LoadSDNode *LdHi = dyn_cast<LoadSDNode>(stripBitcast(Hi));
219 
220   // build_vector lo, (load ptr) -> load_d16_hi ptr, lo
221   // build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo
222   // build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo
223 
224   // Need to check for possible indirect dependencies on the other half of the
225   // vector to avoid introducing a cycle.
226   if (LdHi && Hi.hasOneUse() && !LdHi->isPredecessorOf(Lo.getNode())) {
227     SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
228 
229     SDValue TiedIn = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Lo);
230     SDValue Ops[] = {
231       LdHi->getChain(), LdHi->getBasePtr(), TiedIn
232     };
233 
234     unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
235     if (LdHi->getMemoryVT() == MVT::i8) {
236       LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
237         AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8;
238     } else {
239       assert(LdHi->getMemoryVT() == MVT::i16);
240     }
241 
242     SDValue NewLoadHi =
243       CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList,
244                                   Ops, LdHi->getMemoryVT(),
245                                   LdHi->getMemOperand());
246 
247     CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadHi);
248     CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdHi, 1), NewLoadHi.getValue(1));
249     return true;
250   }
251 
252   // build_vector (load ptr), hi -> load_d16_lo ptr, hi
253   // build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi
254   // build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi
255   LoadSDNode *LdLo = dyn_cast<LoadSDNode>(stripBitcast(Lo));
256   if (LdLo && Lo.hasOneUse()) {
257     SDValue TiedIn = getHi16Elt(Hi);
258     if (!TiedIn || LdLo->isPredecessorOf(TiedIn.getNode()))
259       return false;
260 
261     SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
262     unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
263     if (LdLo->getMemoryVT() == MVT::i8) {
264       LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
265         AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8;
266     } else {
267       assert(LdLo->getMemoryVT() == MVT::i16);
268     }
269 
270     TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, TiedIn);
271 
272     SDValue Ops[] = {
273       LdLo->getChain(), LdLo->getBasePtr(), TiedIn
274     };
275 
276     SDValue NewLoadLo =
277       CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList,
278                                   Ops, LdLo->getMemoryVT(),
279                                   LdLo->getMemOperand());
280 
281     CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadLo);
282     CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdLo, 1), NewLoadLo.getValue(1));
283     return true;
284   }
285 
286   return false;
287 }
288 
289 void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
290   if (!Subtarget->d16PreservesUnusedBits())
291     return;
292 
293   SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
294 
295   bool MadeChange = false;
296   while (Position != CurDAG->allnodes_begin()) {
297     SDNode *N = &*--Position;
298     if (N->use_empty())
299       continue;
300 
301     switch (N->getOpcode()) {
302     case ISD::BUILD_VECTOR:
303       MadeChange |= matchLoadD16FromBuildVector(N);
304       break;
305     default:
306       break;
307     }
308   }
309 
310   if (MadeChange) {
311     CurDAG->RemoveDeadNodes();
312     LLVM_DEBUG(dbgs() << "After PreProcess:\n";
313                CurDAG->dump(););
314   }
315 }
316 
317 bool AMDGPUDAGToDAGISel::isNoNanSrc(SDValue N) const {
318   if (TM.Options.NoNaNsFPMath)
319     return true;
320 
321   // TODO: Move into isKnownNeverNaN
322   if (N->getFlags().hasNoNaNs())
323     return true;
324 
325   return CurDAG->isKnownNeverNaN(N);
326 }
327 
328 bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N,
329                                            bool Negated) const {
330   if (N->isUndef())
331     return true;
332 
333   const SIInstrInfo *TII = Subtarget->getInstrInfo();
334   if (Negated) {
335     if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
336       return TII->isInlineConstant(-C->getAPIntValue());
337 
338     if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
339       return TII->isInlineConstant(-C->getValueAPF().bitcastToAPInt());
340 
341   } else {
342     if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
343       return TII->isInlineConstant(C->getAPIntValue());
344 
345     if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
346       return TII->isInlineConstant(C->getValueAPF().bitcastToAPInt());
347   }
348 
349   return false;
350 }
351 
352 /// Determine the register class for \p OpNo
353 /// \returns The register class of the virtual register that will be used for
354 /// the given operand number \OpNo or NULL if the register class cannot be
355 /// determined.
356 const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
357                                                           unsigned OpNo) const {
358   if (!N->isMachineOpcode()) {
359     if (N->getOpcode() == ISD::CopyToReg) {
360       Register Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
361       if (Reg.isVirtual()) {
362         MachineRegisterInfo &MRI = CurDAG->getMachineFunction().getRegInfo();
363         return MRI.getRegClass(Reg);
364       }
365 
366       const SIRegisterInfo *TRI
367         = static_cast<const GCNSubtarget *>(Subtarget)->getRegisterInfo();
368       return TRI->getPhysRegClass(Reg);
369     }
370 
371     return nullptr;
372   }
373 
374   switch (N->getMachineOpcode()) {
375   default: {
376     const MCInstrDesc &Desc =
377         Subtarget->getInstrInfo()->get(N->getMachineOpcode());
378     unsigned OpIdx = Desc.getNumDefs() + OpNo;
379     if (OpIdx >= Desc.getNumOperands())
380       return nullptr;
381     int RegClass = Desc.OpInfo[OpIdx].RegClass;
382     if (RegClass == -1)
383       return nullptr;
384 
385     return Subtarget->getRegisterInfo()->getRegClass(RegClass);
386   }
387   case AMDGPU::REG_SEQUENCE: {
388     unsigned RCID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
389     const TargetRegisterClass *SuperRC =
390         Subtarget->getRegisterInfo()->getRegClass(RCID);
391 
392     SDValue SubRegOp = N->getOperand(OpNo + 1);
393     unsigned SubRegIdx = cast<ConstantSDNode>(SubRegOp)->getZExtValue();
394     return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
395                                                               SubRegIdx);
396   }
397   }
398 }
399 
400 SDNode *AMDGPUDAGToDAGISel::glueCopyToOp(SDNode *N, SDValue NewChain,
401                                          SDValue Glue) const {
402   SmallVector <SDValue, 8> Ops;
403   Ops.push_back(NewChain); // Replace the chain.
404   for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
405     Ops.push_back(N->getOperand(i));
406 
407   Ops.push_back(Glue);
408   return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
409 }
410 
411 SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
412   const SITargetLowering& Lowering =
413     *static_cast<const SITargetLowering*>(getTargetLowering());
414 
415   assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain");
416 
417   SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N), Val);
418   return glueCopyToOp(N, M0, M0.getValue(1));
419 }
420 
421 SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
422   unsigned AS = cast<MemSDNode>(N)->getAddressSpace();
423   if (AS == AMDGPUAS::LOCAL_ADDRESS) {
424     if (Subtarget->ldsRequiresM0Init())
425       return glueCopyToM0(N, CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32));
426   } else if (AS == AMDGPUAS::REGION_ADDRESS) {
427     MachineFunction &MF = CurDAG->getMachineFunction();
428     unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize();
429     return
430         glueCopyToM0(N, CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i32));
431   }
432   return N;
433 }
434 
435 MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
436                                                   EVT VT) const {
437   SDNode *Lo = CurDAG->getMachineNode(
438       AMDGPU::S_MOV_B32, DL, MVT::i32,
439       CurDAG->getTargetConstant(Imm & 0xFFFFFFFF, DL, MVT::i32));
440   SDNode *Hi =
441       CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
442                              CurDAG->getTargetConstant(Imm >> 32, DL, MVT::i32));
443   const SDValue Ops[] = {
444       CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
445       SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
446       SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
447 
448   return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops);
449 }
450 
451 void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
452   EVT VT = N->getValueType(0);
453   unsigned NumVectorElts = VT.getVectorNumElements();
454   EVT EltVT = VT.getVectorElementType();
455   SDLoc DL(N);
456   SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
457 
458   if (NumVectorElts == 1) {
459     CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0),
460                          RegClass);
461     return;
462   }
463 
464   assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
465                                   "supported yet");
466   // 32 = Max Num Vector Elements
467   // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
468   // 1 = Vector Register Class
469   SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);
470 
471   bool IsGCN = CurDAG->getSubtarget().getTargetTriple().getArch() ==
472                Triple::amdgcn;
473   RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
474   bool IsRegSeq = true;
475   unsigned NOps = N->getNumOperands();
476   for (unsigned i = 0; i < NOps; i++) {
477     // XXX: Why is this here?
478     if (isa<RegisterSDNode>(N->getOperand(i))) {
479       IsRegSeq = false;
480       break;
481     }
482     unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
483                          : R600RegisterInfo::getSubRegFromChannel(i);
484     RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
485     RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32);
486   }
487   if (NOps != NumVectorElts) {
488     // Fill in the missing undef elements if this was a scalar_to_vector.
489     assert(N->getOpcode() == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
490     MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
491                                                    DL, EltVT);
492     for (unsigned i = NOps; i < NumVectorElts; ++i) {
493       unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
494                            : R600RegisterInfo::getSubRegFromChannel(i);
495       RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
496       RegSeqArgs[1 + (2 * i) + 1] =
497           CurDAG->getTargetConstant(Sub, DL, MVT::i32);
498     }
499   }
500 
501   if (!IsRegSeq)
502     SelectCode(N);
503   CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs);
504 }
505 
506 void AMDGPUDAGToDAGISel::Select(SDNode *N) {
507   unsigned int Opc = N->getOpcode();
508   if (N->isMachineOpcode()) {
509     N->setNodeId(-1);
510     return;   // Already selected.
511   }
512 
513   // isa<MemSDNode> almost works but is slightly too permissive for some DS
514   // intrinsics.
515   if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N) ||
516       (Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC ||
517        Opc == ISD::ATOMIC_LOAD_FADD ||
518        Opc == AMDGPUISD::ATOMIC_LOAD_FMIN ||
519        Opc == AMDGPUISD::ATOMIC_LOAD_FMAX)) {
520     N = glueCopyToM0LDSInit(N);
521     SelectCode(N);
522     return;
523   }
524 
525   switch (Opc) {
526   default:
527     break;
528   // We are selecting i64 ADD here instead of custom lower it during
529   // DAG legalization, so we can fold some i64 ADDs used for address
530   // calculation into the LOAD and STORE instructions.
531   case ISD::ADDC:
532   case ISD::ADDE:
533   case ISD::SUBC:
534   case ISD::SUBE: {
535     if (N->getValueType(0) != MVT::i64)
536       break;
537 
538     SelectADD_SUB_I64(N);
539     return;
540   }
541   case ISD::ADDCARRY:
542   case ISD::SUBCARRY:
543     if (N->getValueType(0) != MVT::i32)
544       break;
545 
546     SelectAddcSubb(N);
547     return;
548   case ISD::UADDO:
549   case ISD::USUBO: {
550     SelectUADDO_USUBO(N);
551     return;
552   }
553   case AMDGPUISD::FMUL_W_CHAIN: {
554     SelectFMUL_W_CHAIN(N);
555     return;
556   }
557   case AMDGPUISD::FMA_W_CHAIN: {
558     SelectFMA_W_CHAIN(N);
559     return;
560   }
561 
562   case ISD::SCALAR_TO_VECTOR:
563   case ISD::BUILD_VECTOR: {
564     EVT VT = N->getValueType(0);
565     unsigned NumVectorElts = VT.getVectorNumElements();
566     if (VT.getScalarSizeInBits() == 16) {
567       if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) {
568         if (SDNode *Packed = packConstantV2I16(N, *CurDAG)) {
569           ReplaceNode(N, Packed);
570           return;
571         }
572       }
573 
574       break;
575     }
576 
577     assert(VT.getVectorElementType().bitsEq(MVT::i32));
578     unsigned RegClassID =
579         SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32)->getID();
580     SelectBuildVector(N, RegClassID);
581     return;
582   }
583   case ISD::BUILD_PAIR: {
584     SDValue RC, SubReg0, SubReg1;
585     SDLoc DL(N);
586     if (N->getValueType(0) == MVT::i128) {
587       RC = CurDAG->getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32);
588       SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32);
589       SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32);
590     } else if (N->getValueType(0) == MVT::i64) {
591       RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32);
592       SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
593       SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
594     } else {
595       llvm_unreachable("Unhandled value type for BUILD_PAIR");
596     }
597     const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
598                             N->getOperand(1), SubReg1 };
599     ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
600                                           N->getValueType(0), Ops));
601     return;
602   }
603 
604   case ISD::Constant:
605   case ISD::ConstantFP: {
606     if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N))
607       break;
608 
609     uint64_t Imm;
610     if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N))
611       Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
612     else {
613       ConstantSDNode *C = cast<ConstantSDNode>(N);
614       Imm = C->getZExtValue();
615     }
616 
617     SDLoc DL(N);
618     ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));
619     return;
620   }
621   case AMDGPUISD::BFE_I32:
622   case AMDGPUISD::BFE_U32: {
623     // There is a scalar version available, but unlike the vector version which
624     // has a separate operand for the offset and width, the scalar version packs
625     // the width and offset into a single operand. Try to move to the scalar
626     // version if the offsets are constant, so that we can try to keep extended
627     // loads of kernel arguments in SGPRs.
628 
629     // TODO: Technically we could try to pattern match scalar bitshifts of
630     // dynamic values, but it's probably not useful.
631     ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
632     if (!Offset)
633       break;
634 
635     ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
636     if (!Width)
637       break;
638 
639     bool Signed = Opc == AMDGPUISD::BFE_I32;
640 
641     uint32_t OffsetVal = Offset->getZExtValue();
642     uint32_t WidthVal = Width->getZExtValue();
643 
644     ReplaceNode(N, getBFE32(Signed, SDLoc(N), N->getOperand(0), OffsetVal,
645                             WidthVal));
646     return;
647   }
648   case AMDGPUISD::DIV_SCALE: {
649     SelectDIV_SCALE(N);
650     return;
651   }
652   case AMDGPUISD::MAD_I64_I32:
653   case AMDGPUISD::MAD_U64_U32: {
654     SelectMAD_64_32(N);
655     return;
656   }
657   case ISD::SMUL_LOHI:
658   case ISD::UMUL_LOHI:
659     return SelectMUL_LOHI(N);
660   case ISD::CopyToReg: {
661     const SITargetLowering& Lowering =
662       *static_cast<const SITargetLowering*>(getTargetLowering());
663     N = Lowering.legalizeTargetIndependentNode(N, *CurDAG);
664     break;
665   }
666   case ISD::AND:
667   case ISD::SRL:
668   case ISD::SRA:
669   case ISD::SIGN_EXTEND_INREG:
670     if (N->getValueType(0) != MVT::i32)
671       break;
672 
673     SelectS_BFE(N);
674     return;
675   case ISD::BRCOND:
676     SelectBRCOND(N);
677     return;
678   case ISD::FMAD:
679   case ISD::FMA:
680     SelectFMAD_FMA(N);
681     return;
682   case AMDGPUISD::ATOMIC_CMP_SWAP:
683     SelectATOMIC_CMP_SWAP(N);
684     return;
685   case AMDGPUISD::CVT_PKRTZ_F16_F32:
686   case AMDGPUISD::CVT_PKNORM_I16_F32:
687   case AMDGPUISD::CVT_PKNORM_U16_F32:
688   case AMDGPUISD::CVT_PK_U16_U32:
689   case AMDGPUISD::CVT_PK_I16_I32: {
690     // Hack around using a legal type if f16 is illegal.
691     if (N->getValueType(0) == MVT::i32) {
692       MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ? MVT::v2f16 : MVT::v2i16;
693       N = CurDAG->MorphNodeTo(N, N->getOpcode(), CurDAG->getVTList(NewVT),
694                               { N->getOperand(0), N->getOperand(1) });
695       SelectCode(N);
696       return;
697     }
698 
699     break;
700   }
701   case ISD::INTRINSIC_W_CHAIN: {
702     SelectINTRINSIC_W_CHAIN(N);
703     return;
704   }
705   case ISD::INTRINSIC_WO_CHAIN: {
706     SelectINTRINSIC_WO_CHAIN(N);
707     return;
708   }
709   case ISD::INTRINSIC_VOID: {
710     SelectINTRINSIC_VOID(N);
711     return;
712   }
713   }
714 
715   SelectCode(N);
716 }
717 
718 bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
719   const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
720   const Instruction *Term = BB->getTerminator();
721   return Term->getMetadata("amdgpu.uniform") ||
722          Term->getMetadata("structurizecfg.uniform");
723 }
724 
725 bool AMDGPUDAGToDAGISel::isUnneededShiftMask(const SDNode *N,
726                                              unsigned ShAmtBits) const {
727   assert(N->getOpcode() == ISD::AND);
728 
729   const APInt &RHS = cast<ConstantSDNode>(N->getOperand(1))->getAPIntValue();
730   if (RHS.countTrailingOnes() >= ShAmtBits)
731     return true;
732 
733   const APInt &LHSKnownZeros = CurDAG->computeKnownBits(N->getOperand(0)).Zero;
734   return (LHSKnownZeros | RHS).countTrailingOnes() >= ShAmtBits;
735 }
736 
737 static bool getBaseWithOffsetUsingSplitOR(SelectionDAG &DAG, SDValue Addr,
738                                           SDValue &N0, SDValue &N1) {
739   if (Addr.getValueType() == MVT::i64 && Addr.getOpcode() == ISD::BITCAST &&
740       Addr.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
741     // As we split 64-bit `or` earlier, it's complicated pattern to match, i.e.
742     // (i64 (bitcast (v2i32 (build_vector
743     //                        (or (extract_vector_elt V, 0), OFFSET),
744     //                        (extract_vector_elt V, 1)))))
745     SDValue Lo = Addr.getOperand(0).getOperand(0);
746     if (Lo.getOpcode() == ISD::OR && DAG.isBaseWithConstantOffset(Lo)) {
747       SDValue BaseLo = Lo.getOperand(0);
748       SDValue BaseHi = Addr.getOperand(0).getOperand(1);
749       // Check that split base (Lo and Hi) are extracted from the same one.
750       if (BaseLo.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
751           BaseHi.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
752           BaseLo.getOperand(0) == BaseHi.getOperand(0) &&
753           // Lo is statically extracted from index 0.
754           isa<ConstantSDNode>(BaseLo.getOperand(1)) &&
755           BaseLo.getConstantOperandVal(1) == 0 &&
756           // Hi is statically extracted from index 0.
757           isa<ConstantSDNode>(BaseHi.getOperand(1)) &&
758           BaseHi.getConstantOperandVal(1) == 1) {
759         N0 = BaseLo.getOperand(0).getOperand(0);
760         N1 = Lo.getOperand(1);
761         return true;
762       }
763     }
764   }
765   return false;
766 }
767 
768 bool AMDGPUDAGToDAGISel::isBaseWithConstantOffset64(SDValue Addr, SDValue &LHS,
769                                                     SDValue &RHS) const {
770   if (CurDAG->isBaseWithConstantOffset(Addr)) {
771     LHS = Addr.getOperand(0);
772     RHS = Addr.getOperand(1);
773     return true;
774   }
775 
776   if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, LHS, RHS)) {
777     assert(LHS && RHS && isa<ConstantSDNode>(RHS));
778     return true;
779   }
780 
781   return false;
782 }
783 
784 StringRef AMDGPUDAGToDAGISel::getPassName() const {
785   return "AMDGPU DAG->DAG Pattern Instruction Selection";
786 }
787 
788 //===----------------------------------------------------------------------===//
789 // Complex Patterns
790 //===----------------------------------------------------------------------===//
791 
792 bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
793                                             SDValue &Offset) {
794   return false;
795 }
796 
797 bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
798                                             SDValue &Offset) {
799   ConstantSDNode *C;
800   SDLoc DL(Addr);
801 
802   if ((C = dyn_cast<ConstantSDNode>(Addr))) {
803     Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
804     Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
805   } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
806              (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
807     Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
808     Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
809   } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
810             (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
811     Base = Addr.getOperand(0);
812     Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
813   } else {
814     Base = Addr;
815     Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
816   }
817 
818   return true;
819 }
820 
821 SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm32(int64_t Val,
822                                                        const SDLoc &DL) const {
823   SDNode *Mov = CurDAG->getMachineNode(
824     AMDGPU::S_MOV_B32, DL, MVT::i32,
825     CurDAG->getTargetConstant(Val, DL, MVT::i32));
826   return SDValue(Mov, 0);
827 }
828 
829 // FIXME: Should only handle addcarry/subcarry
830 void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
831   SDLoc DL(N);
832   SDValue LHS = N->getOperand(0);
833   SDValue RHS = N->getOperand(1);
834 
835   unsigned Opcode = N->getOpcode();
836   bool ConsumeCarry = (Opcode == ISD::ADDE || Opcode == ISD::SUBE);
837   bool ProduceCarry =
838       ConsumeCarry || Opcode == ISD::ADDC || Opcode == ISD::SUBC;
839   bool IsAdd = Opcode == ISD::ADD || Opcode == ISD::ADDC || Opcode == ISD::ADDE;
840 
841   SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
842   SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
843 
844   SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
845                                        DL, MVT::i32, LHS, Sub0);
846   SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
847                                        DL, MVT::i32, LHS, Sub1);
848 
849   SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
850                                        DL, MVT::i32, RHS, Sub0);
851   SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
852                                        DL, MVT::i32, RHS, Sub1);
853 
854   SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);
855 
856   static const unsigned OpcMap[2][2][2] = {
857       {{AMDGPU::S_SUB_U32, AMDGPU::S_ADD_U32},
858        {AMDGPU::V_SUB_CO_U32_e32, AMDGPU::V_ADD_CO_U32_e32}},
859       {{AMDGPU::S_SUBB_U32, AMDGPU::S_ADDC_U32},
860        {AMDGPU::V_SUBB_U32_e32, AMDGPU::V_ADDC_U32_e32}}};
861 
862   unsigned Opc = OpcMap[0][N->isDivergent()][IsAdd];
863   unsigned CarryOpc = OpcMap[1][N->isDivergent()][IsAdd];
864 
865   SDNode *AddLo;
866   if (!ConsumeCarry) {
867     SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) };
868     AddLo = CurDAG->getMachineNode(Opc, DL, VTList, Args);
869   } else {
870     SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0), N->getOperand(2) };
871     AddLo = CurDAG->getMachineNode(CarryOpc, DL, VTList, Args);
872   }
873   SDValue AddHiArgs[] = {
874     SDValue(Hi0, 0),
875     SDValue(Hi1, 0),
876     SDValue(AddLo, 1)
877   };
878   SDNode *AddHi = CurDAG->getMachineNode(CarryOpc, DL, VTList, AddHiArgs);
879 
880   SDValue RegSequenceArgs[] = {
881     CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
882     SDValue(AddLo,0),
883     Sub0,
884     SDValue(AddHi,0),
885     Sub1,
886   };
887   SDNode *RegSequence = CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
888                                                MVT::i64, RegSequenceArgs);
889 
890   if (ProduceCarry) {
891     // Replace the carry-use
892     ReplaceUses(SDValue(N, 1), SDValue(AddHi, 1));
893   }
894 
895   // Replace the remaining uses.
896   ReplaceNode(N, RegSequence);
897 }
898 
899 void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) {
900   SDLoc DL(N);
901   SDValue LHS = N->getOperand(0);
902   SDValue RHS = N->getOperand(1);
903   SDValue CI = N->getOperand(2);
904 
905   if (N->isDivergent()) {
906     unsigned Opc = N->getOpcode() == ISD::ADDCARRY ? AMDGPU::V_ADDC_U32_e64
907                                                    : AMDGPU::V_SUBB_U32_e64;
908     CurDAG->SelectNodeTo(
909         N, Opc, N->getVTList(),
910         {LHS, RHS, CI,
911          CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
912   } else {
913     unsigned Opc = N->getOpcode() == ISD::ADDCARRY ? AMDGPU::S_ADD_CO_PSEUDO
914                                                    : AMDGPU::S_SUB_CO_PSEUDO;
915     CurDAG->SelectNodeTo(N, Opc, N->getVTList(), {LHS, RHS, CI});
916   }
917 }
918 
919 void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
920   // The name of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned
921   // carry out despite the _i32 name. These were renamed in VI to _U32.
922   // FIXME: We should probably rename the opcodes here.
923   bool IsAdd = N->getOpcode() == ISD::UADDO;
924   bool IsVALU = N->isDivergent();
925 
926   for (SDNode::use_iterator UI = N->use_begin(), E = N->use_end(); UI != E;
927        ++UI)
928     if (UI.getUse().getResNo() == 1) {
929       if ((IsAdd && (UI->getOpcode() != ISD::ADDCARRY)) ||
930           (!IsAdd && (UI->getOpcode() != ISD::SUBCARRY))) {
931         IsVALU = true;
932         break;
933       }
934     }
935 
936   if (IsVALU) {
937     unsigned Opc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
938 
939     CurDAG->SelectNodeTo(
940         N, Opc, N->getVTList(),
941         {N->getOperand(0), N->getOperand(1),
942          CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
943   } else {
944     unsigned Opc = N->getOpcode() == ISD::UADDO ? AMDGPU::S_UADDO_PSEUDO
945                                                 : AMDGPU::S_USUBO_PSEUDO;
946 
947     CurDAG->SelectNodeTo(N, Opc, N->getVTList(),
948                          {N->getOperand(0), N->getOperand(1)});
949   }
950 }
951 
952 void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
953   SDLoc SL(N);
954   //  src0_modifiers, src0,  src1_modifiers, src1, src2_modifiers, src2, clamp, omod
955   SDValue Ops[10];
956 
957   SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]);
958   SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
959   SelectVOP3Mods(N->getOperand(3), Ops[5], Ops[4]);
960   Ops[8] = N->getOperand(0);
961   Ops[9] = N->getOperand(4);
962 
963   // If there are no source modifiers, prefer fmac over fma because it can use
964   // the smaller VOP2 encoding.
965   bool UseFMAC = Subtarget->hasDLInsts() &&
966                  cast<ConstantSDNode>(Ops[0])->isZero() &&
967                  cast<ConstantSDNode>(Ops[2])->isZero() &&
968                  cast<ConstantSDNode>(Ops[4])->isZero();
969   unsigned Opcode = UseFMAC ? AMDGPU::V_FMAC_F32_e64 : AMDGPU::V_FMA_F32_e64;
970   CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), Ops);
971 }
972 
973 void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
974   SDLoc SL(N);
975   //    src0_modifiers, src0,  src1_modifiers, src1, clamp, omod
976   SDValue Ops[8];
977 
978   SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]);
979   SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
980   Ops[6] = N->getOperand(0);
981   Ops[7] = N->getOperand(3);
982 
983   CurDAG->SelectNodeTo(N, AMDGPU::V_MUL_F32_e64, N->getVTList(), Ops);
984 }
985 
986 // We need to handle this here because tablegen doesn't support matching
987 // instructions with multiple outputs.
988 void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
989   SDLoc SL(N);
990   EVT VT = N->getValueType(0);
991 
992   assert(VT == MVT::f32 || VT == MVT::f64);
993 
994   unsigned Opc
995     = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64_e64 : AMDGPU::V_DIV_SCALE_F32_e64;
996 
997   // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
998   // omod
999   SDValue Ops[8];
1000   SelectVOP3BMods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
1001   SelectVOP3BMods(N->getOperand(1), Ops[3], Ops[2]);
1002   SelectVOP3BMods(N->getOperand(2), Ops[5], Ops[4]);
1003   CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
1004 }
1005 
1006 // We need to handle this here because tablegen doesn't support matching
1007 // instructions with multiple outputs.
1008 void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
1009   SDLoc SL(N);
1010   bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
1011   unsigned Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
1012 
1013   SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
1014   SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
1015                     Clamp };
1016   CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
1017 }
1018 
1019 // We need to handle this here because tablegen doesn't support matching
1020 // instructions with multiple outputs.
1021 void AMDGPUDAGToDAGISel::SelectMUL_LOHI(SDNode *N) {
1022   SDLoc SL(N);
1023   bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
1024   unsigned Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
1025 
1026   SDValue Zero = CurDAG->getTargetConstant(0, SL, MVT::i64);
1027   SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
1028   SDValue Ops[] = {N->getOperand(0), N->getOperand(1), Zero, Clamp};
1029   SDNode *Mad = CurDAG->getMachineNode(Opc, SL, N->getVTList(), Ops);
1030   if (!SDValue(N, 0).use_empty()) {
1031     SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32);
1032     SDNode *Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
1033                                         MVT::i32, SDValue(Mad, 0), Sub0);
1034     ReplaceUses(SDValue(N, 0), SDValue(Lo, 0));
1035   }
1036   if (!SDValue(N, 1).use_empty()) {
1037     SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32);
1038     SDNode *Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SL,
1039                                         MVT::i32, SDValue(Mad, 0), Sub1);
1040     ReplaceUses(SDValue(N, 1), SDValue(Hi, 0));
1041   }
1042   CurDAG->RemoveDeadNode(N);
1043 }
1044 
1045 bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset) const {
1046   if (!isUInt<16>(Offset))
1047     return false;
1048 
1049   if (!Base || Subtarget->hasUsableDSOffset() ||
1050       Subtarget->unsafeDSOffsetFoldingEnabled())
1051     return true;
1052 
1053   // On Southern Islands instruction with a negative base value and an offset
1054   // don't seem to work.
1055   return CurDAG->SignBitIsZero(Base);
1056 }
1057 
1058 bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
1059                                               SDValue &Offset) const {
1060   SDLoc DL(Addr);
1061   if (CurDAG->isBaseWithConstantOffset(Addr)) {
1062     SDValue N0 = Addr.getOperand(0);
1063     SDValue N1 = Addr.getOperand(1);
1064     ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
1065     if (isDSOffsetLegal(N0, C1->getSExtValue())) {
1066       // (add n0, c0)
1067       Base = N0;
1068       Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
1069       return true;
1070     }
1071   } else if (Addr.getOpcode() == ISD::SUB) {
1072     // sub C, x -> add (sub 0, x), C
1073     if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
1074       int64_t ByteOffset = C->getSExtValue();
1075       if (isDSOffsetLegal(SDValue(), ByteOffset)) {
1076         SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1077 
1078         // XXX - This is kind of hacky. Create a dummy sub node so we can check
1079         // the known bits in isDSOffsetLegal. We need to emit the selected node
1080         // here, so this is thrown away.
1081         SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
1082                                       Zero, Addr.getOperand(1));
1083 
1084         if (isDSOffsetLegal(Sub, ByteOffset)) {
1085           SmallVector<SDValue, 3> Opnds;
1086           Opnds.push_back(Zero);
1087           Opnds.push_back(Addr.getOperand(1));
1088 
1089           // FIXME: Select to VOP3 version for with-carry.
1090           unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
1091           if (Subtarget->hasAddNoCarry()) {
1092             SubOp = AMDGPU::V_SUB_U32_e64;
1093             Opnds.push_back(
1094                 CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
1095           }
1096 
1097           MachineSDNode *MachineSub =
1098               CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds);
1099 
1100           Base = SDValue(MachineSub, 0);
1101           Offset = CurDAG->getTargetConstant(ByteOffset, DL, MVT::i16);
1102           return true;
1103         }
1104       }
1105     }
1106   } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1107     // If we have a constant address, prefer to put the constant into the
1108     // offset. This can save moves to load the constant address since multiple
1109     // operations can share the zero base address register, and enables merging
1110     // into read2 / write2 instructions.
1111 
1112     SDLoc DL(Addr);
1113 
1114     if (isDSOffsetLegal(SDValue(), CAddr->getZExtValue())) {
1115       SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1116       MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
1117                                  DL, MVT::i32, Zero);
1118       Base = SDValue(MovZero, 0);
1119       Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
1120       return true;
1121     }
1122   }
1123 
1124   // default case
1125   Base = Addr;
1126   Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i16);
1127   return true;
1128 }
1129 
1130 bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
1131                                           unsigned Offset1,
1132                                           unsigned Size) const {
1133   if (Offset0 % Size != 0 || Offset1 % Size != 0)
1134     return false;
1135   if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
1136     return false;
1137 
1138   if (!Base || Subtarget->hasUsableDSOffset() ||
1139       Subtarget->unsafeDSOffsetFoldingEnabled())
1140     return true;
1141 
1142   // On Southern Islands instruction with a negative base value and an offset
1143   // don't seem to work.
1144   return CurDAG->SignBitIsZero(Base);
1145 }
1146 
1147 // TODO: If offset is too big, put low 16-bit into offset.
1148 bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
1149                                                    SDValue &Offset0,
1150                                                    SDValue &Offset1) const {
1151   return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 4);
1152 }
1153 
1154 bool AMDGPUDAGToDAGISel::SelectDS128Bit8ByteAligned(SDValue Addr, SDValue &Base,
1155                                                     SDValue &Offset0,
1156                                                     SDValue &Offset1) const {
1157   return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 8);
1158 }
1159 
1160 bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
1161                                             SDValue &Offset0, SDValue &Offset1,
1162                                             unsigned Size) const {
1163   SDLoc DL(Addr);
1164 
1165   if (CurDAG->isBaseWithConstantOffset(Addr)) {
1166     SDValue N0 = Addr.getOperand(0);
1167     SDValue N1 = Addr.getOperand(1);
1168     ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
1169     unsigned OffsetValue0 = C1->getZExtValue();
1170     unsigned OffsetValue1 = OffsetValue0 + Size;
1171 
1172     // (add n0, c0)
1173     if (isDSOffset2Legal(N0, OffsetValue0, OffsetValue1, Size)) {
1174       Base = N0;
1175       Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
1176       Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
1177       return true;
1178     }
1179   } else if (Addr.getOpcode() == ISD::SUB) {
1180     // sub C, x -> add (sub 0, x), C
1181     if (const ConstantSDNode *C =
1182             dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
1183       unsigned OffsetValue0 = C->getZExtValue();
1184       unsigned OffsetValue1 = OffsetValue0 + Size;
1185 
1186       if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
1187         SDLoc DL(Addr);
1188         SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1189 
1190         // XXX - This is kind of hacky. Create a dummy sub node so we can check
1191         // the known bits in isDSOffsetLegal. We need to emit the selected node
1192         // here, so this is thrown away.
1193         SDValue Sub =
1194             CurDAG->getNode(ISD::SUB, DL, MVT::i32, Zero, Addr.getOperand(1));
1195 
1196         if (isDSOffset2Legal(Sub, OffsetValue0, OffsetValue1, Size)) {
1197           SmallVector<SDValue, 3> Opnds;
1198           Opnds.push_back(Zero);
1199           Opnds.push_back(Addr.getOperand(1));
1200           unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
1201           if (Subtarget->hasAddNoCarry()) {
1202             SubOp = AMDGPU::V_SUB_U32_e64;
1203             Opnds.push_back(
1204                 CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
1205           }
1206 
1207           MachineSDNode *MachineSub = CurDAG->getMachineNode(
1208               SubOp, DL, MVT::getIntegerVT(Size * 8), Opnds);
1209 
1210           Base = SDValue(MachineSub, 0);
1211           Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
1212           Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
1213           return true;
1214         }
1215       }
1216     }
1217   } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1218     unsigned OffsetValue0 = CAddr->getZExtValue();
1219     unsigned OffsetValue1 = OffsetValue0 + Size;
1220 
1221     if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
1222       SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1223       MachineSDNode *MovZero =
1224           CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero);
1225       Base = SDValue(MovZero, 0);
1226       Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
1227       Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
1228       return true;
1229     }
1230   }
1231 
1232   // default case
1233 
1234   Base = Addr;
1235   Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i8);
1236   Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i8);
1237   return true;
1238 }
1239 
1240 bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr,
1241                                      SDValue &SOffset, SDValue &Offset,
1242                                      SDValue &Offen, SDValue &Idxen,
1243                                      SDValue &Addr64) const {
1244   // Subtarget prefers to use flat instruction
1245   // FIXME: This should be a pattern predicate and not reach here
1246   if (Subtarget->useFlatForGlobal())
1247     return false;
1248 
1249   SDLoc DL(Addr);
1250 
1251   Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);
1252   Offen = CurDAG->getTargetConstant(0, DL, MVT::i1);
1253   Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1);
1254   SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1255 
1256   ConstantSDNode *C1 = nullptr;
1257   SDValue N0 = Addr;
1258   if (CurDAG->isBaseWithConstantOffset(Addr)) {
1259     C1 = cast<ConstantSDNode>(Addr.getOperand(1));
1260     if (isUInt<32>(C1->getZExtValue()))
1261       N0 = Addr.getOperand(0);
1262     else
1263       C1 = nullptr;
1264   }
1265 
1266   if (N0.getOpcode() == ISD::ADD) {
1267     // (add N2, N3) -> addr64, or
1268     // (add (add N2, N3), C1) -> addr64
1269     SDValue N2 = N0.getOperand(0);
1270     SDValue N3 = N0.getOperand(1);
1271     Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
1272 
1273     if (N2->isDivergent()) {
1274       if (N3->isDivergent()) {
1275         // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
1276         // addr64, and construct the resource from a 0 address.
1277         Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
1278         VAddr = N0;
1279       } else {
1280         // N2 is divergent, N3 is not.
1281         Ptr = N3;
1282         VAddr = N2;
1283       }
1284     } else {
1285       // N2 is not divergent.
1286       Ptr = N2;
1287       VAddr = N3;
1288     }
1289     Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
1290   } else if (N0->isDivergent()) {
1291     // N0 is divergent. Use it as the addr64, and construct the resource from a
1292     // 0 address.
1293     Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
1294     VAddr = N0;
1295     Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
1296   } else {
1297     // N0 -> offset, or
1298     // (N0 + C1) -> offset
1299     VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
1300     Ptr = N0;
1301   }
1302 
1303   if (!C1) {
1304     // No offset.
1305     Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
1306     return true;
1307   }
1308 
1309   if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue())) {
1310     // Legal offset for instruction.
1311     Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
1312     return true;
1313   }
1314 
1315   // Illegal offset, store it in soffset.
1316   Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
1317   SOffset =
1318       SDValue(CurDAG->getMachineNode(
1319                   AMDGPU::S_MOV_B32, DL, MVT::i32,
1320                   CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
1321               0);
1322   return true;
1323 }
1324 
1325 bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
1326                                            SDValue &VAddr, SDValue &SOffset,
1327                                            SDValue &Offset) const {
1328   SDValue Ptr, Offen, Idxen, Addr64;
1329 
1330   // addr64 bit was removed for volcanic islands.
1331   // FIXME: This should be a pattern predicate and not reach here
1332   if (!Subtarget->hasAddr64())
1333     return false;
1334 
1335   if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
1336     return false;
1337 
1338   ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
1339   if (C->getSExtValue()) {
1340     SDLoc DL(Addr);
1341 
1342     const SITargetLowering& Lowering =
1343       *static_cast<const SITargetLowering*>(getTargetLowering());
1344 
1345     SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0);
1346     return true;
1347   }
1348 
1349   return false;
1350 }
1351 
1352 std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
1353   SDLoc DL(N);
1354 
1355   auto *FI = dyn_cast<FrameIndexSDNode>(N);
1356   SDValue TFI =
1357       FI ? CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)) : N;
1358 
1359   // We rebase the base address into an absolute stack address and hence
1360   // use constant 0 for soffset. This value must be retained until
1361   // frame elimination and eliminateFrameIndex will choose the appropriate
1362   // frame register if need be.
1363   return std::make_pair(TFI, CurDAG->getTargetConstant(0, DL, MVT::i32));
1364 }
1365 
1366 bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
1367                                                  SDValue Addr, SDValue &Rsrc,
1368                                                  SDValue &VAddr, SDValue &SOffset,
1369                                                  SDValue &ImmOffset) const {
1370 
1371   SDLoc DL(Addr);
1372   MachineFunction &MF = CurDAG->getMachineFunction();
1373   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1374 
1375   Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1376 
1377   if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1378     int64_t Imm = CAddr->getSExtValue();
1379     const int64_t NullPtr =
1380         AMDGPUTargetMachine::getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS);
1381     // Don't fold null pointer.
1382     if (Imm != NullPtr) {
1383       SDValue HighBits = CurDAG->getTargetConstant(Imm & ~4095, DL, MVT::i32);
1384       MachineSDNode *MovHighBits = CurDAG->getMachineNode(
1385         AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits);
1386       VAddr = SDValue(MovHighBits, 0);
1387 
1388       SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1389       ImmOffset = CurDAG->getTargetConstant(Imm & 4095, DL, MVT::i16);
1390       return true;
1391     }
1392   }
1393 
1394   if (CurDAG->isBaseWithConstantOffset(Addr)) {
1395     // (add n0, c1)
1396 
1397     SDValue N0 = Addr.getOperand(0);
1398     SDValue N1 = Addr.getOperand(1);
1399 
1400     // Offsets in vaddr must be positive if range checking is enabled.
1401     //
1402     // The total computation of vaddr + soffset + offset must not overflow.  If
1403     // vaddr is negative, even if offset is 0 the sgpr offset add will end up
1404     // overflowing.
1405     //
1406     // Prior to gfx9, MUBUF instructions with the vaddr offset enabled would
1407     // always perform a range check. If a negative vaddr base index was used,
1408     // this would fail the range check. The overall address computation would
1409     // compute a valid address, but this doesn't happen due to the range
1410     // check. For out-of-bounds MUBUF loads, a 0 is returned.
1411     //
1412     // Therefore it should be safe to fold any VGPR offset on gfx9 into the
1413     // MUBUF vaddr, but not on older subtargets which can only do this if the
1414     // sign bit is known 0.
1415     ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
1416     if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue()) &&
1417         (!Subtarget->privateMemoryResourceIsRangeChecked() ||
1418          CurDAG->SignBitIsZero(N0))) {
1419       std::tie(VAddr, SOffset) = foldFrameIndex(N0);
1420       ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
1421       return true;
1422     }
1423   }
1424 
1425   // (node)
1426   std::tie(VAddr, SOffset) = foldFrameIndex(Addr);
1427   ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i16);
1428   return true;
1429 }
1430 
1431 static bool IsCopyFromSGPR(const SIRegisterInfo &TRI, SDValue Val) {
1432   if (Val.getOpcode() != ISD::CopyFromReg)
1433     return false;
1434   auto RC =
1435       TRI.getPhysRegClass(cast<RegisterSDNode>(Val.getOperand(1))->getReg());
1436   return RC && TRI.isSGPRClass(RC);
1437 }
1438 
1439 bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
1440                                                   SDValue Addr,
1441                                                   SDValue &SRsrc,
1442                                                   SDValue &SOffset,
1443                                                   SDValue &Offset) const {
1444   const SIRegisterInfo *TRI =
1445       static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
1446   MachineFunction &MF = CurDAG->getMachineFunction();
1447   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1448   SDLoc DL(Addr);
1449 
1450   // CopyFromReg <sgpr>
1451   if (IsCopyFromSGPR(*TRI, Addr)) {
1452     SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1453     SOffset = Addr;
1454     Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
1455     return true;
1456   }
1457 
1458   ConstantSDNode *CAddr;
1459   if (Addr.getOpcode() == ISD::ADD) {
1460     // Add (CopyFromReg <sgpr>) <constant>
1461     CAddr = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
1462     if (!CAddr || !SIInstrInfo::isLegalMUBUFImmOffset(CAddr->getZExtValue()))
1463       return false;
1464     if (!IsCopyFromSGPR(*TRI, Addr.getOperand(0)))
1465       return false;
1466 
1467     SOffset = Addr.getOperand(0);
1468   } else if ((CAddr = dyn_cast<ConstantSDNode>(Addr)) &&
1469              SIInstrInfo::isLegalMUBUFImmOffset(CAddr->getZExtValue())) {
1470     // <constant>
1471     SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1472   } else {
1473     return false;
1474   }
1475 
1476   SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1477 
1478   Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
1479   return true;
1480 }
1481 
1482 bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
1483                                            SDValue &SOffset, SDValue &Offset
1484                                            ) const {
1485   SDValue Ptr, VAddr, Offen, Idxen, Addr64;
1486   const SIInstrInfo *TII =
1487     static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
1488 
1489   if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
1490     return false;
1491 
1492   if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
1493       !cast<ConstantSDNode>(Idxen)->getSExtValue() &&
1494       !cast<ConstantSDNode>(Addr64)->getSExtValue()) {
1495     uint64_t Rsrc = TII->getDefaultRsrcDataFormat() |
1496                     APInt::getAllOnes(32).getZExtValue(); // Size
1497     SDLoc DL(Addr);
1498 
1499     const SITargetLowering& Lowering =
1500       *static_cast<const SITargetLowering*>(getTargetLowering());
1501 
1502     SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0);
1503     return true;
1504   }
1505   return false;
1506 }
1507 
1508 // Find a load or store from corresponding pattern root.
1509 // Roots may be build_vector, bitconvert or their combinations.
1510 static MemSDNode* findMemSDNode(SDNode *N) {
1511   N = AMDGPUTargetLowering::stripBitcast(SDValue(N,0)).getNode();
1512   if (MemSDNode *MN = dyn_cast<MemSDNode>(N))
1513     return MN;
1514   assert(isa<BuildVectorSDNode>(N));
1515   for (SDValue V : N->op_values())
1516     if (MemSDNode *MN =
1517           dyn_cast<MemSDNode>(AMDGPUTargetLowering::stripBitcast(V)))
1518       return MN;
1519   llvm_unreachable("cannot find MemSDNode in the pattern!");
1520 }
1521 
1522 bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
1523                                               SDValue &VAddr, SDValue &Offset,
1524                                               uint64_t FlatVariant) const {
1525   int64_t OffsetVal = 0;
1526 
1527   unsigned AS = findMemSDNode(N)->getAddressSpace();
1528 
1529   bool CanHaveFlatSegmentOffsetBug =
1530       Subtarget->hasFlatSegmentOffsetBug() &&
1531       FlatVariant == SIInstrFlags::FLAT &&
1532       (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::GLOBAL_ADDRESS);
1533 
1534   if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) {
1535     SDValue N0, N1;
1536     if (isBaseWithConstantOffset64(Addr, N0, N1)) {
1537       int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
1538 
1539       const SIInstrInfo *TII = Subtarget->getInstrInfo();
1540       if (TII->isLegalFLATOffset(COffsetVal, AS, FlatVariant)) {
1541         Addr = N0;
1542         OffsetVal = COffsetVal;
1543       } else {
1544         // If the offset doesn't fit, put the low bits into the offset field and
1545         // add the rest.
1546         //
1547         // For a FLAT instruction the hardware decides whether to access
1548         // global/scratch/shared memory based on the high bits of vaddr,
1549         // ignoring the offset field, so we have to ensure that when we add
1550         // remainder to vaddr it still points into the same underlying object.
1551         // The easiest way to do that is to make sure that we split the offset
1552         // into two pieces that are both >= 0 or both <= 0.
1553 
1554         SDLoc DL(N);
1555         uint64_t RemainderOffset;
1556 
1557         std::tie(OffsetVal, RemainderOffset) =
1558             TII->splitFlatOffset(COffsetVal, AS, FlatVariant);
1559 
1560         SDValue AddOffsetLo =
1561             getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
1562         SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
1563 
1564         if (Addr.getValueType().getSizeInBits() == 32) {
1565           SmallVector<SDValue, 3> Opnds;
1566           Opnds.push_back(N0);
1567           Opnds.push_back(AddOffsetLo);
1568           unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
1569           if (Subtarget->hasAddNoCarry()) {
1570             AddOp = AMDGPU::V_ADD_U32_e64;
1571             Opnds.push_back(Clamp);
1572           }
1573           Addr = SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
1574         } else {
1575           // TODO: Should this try to use a scalar add pseudo if the base address
1576           // is uniform and saddr is usable?
1577           SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
1578           SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
1579 
1580           SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1581                                                 DL, MVT::i32, N0, Sub0);
1582           SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1583                                                 DL, MVT::i32, N0, Sub1);
1584 
1585           SDValue AddOffsetHi =
1586               getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
1587 
1588           SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
1589 
1590           SDNode *Add =
1591               CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
1592                                      {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
1593 
1594           SDNode *Addc = CurDAG->getMachineNode(
1595               AMDGPU::V_ADDC_U32_e64, DL, VTs,
1596               {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
1597 
1598           SDValue RegSequenceArgs[] = {
1599               CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
1600               SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};
1601 
1602           Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
1603                                                 MVT::i64, RegSequenceArgs),
1604                          0);
1605         }
1606       }
1607     }
1608   }
1609 
1610   VAddr = Addr;
1611   Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i16);
1612   return true;
1613 }
1614 
1615 bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N, SDValue Addr,
1616                                           SDValue &VAddr,
1617                                           SDValue &Offset) const {
1618   return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FLAT);
1619 }
1620 
1621 bool AMDGPUDAGToDAGISel::SelectGlobalOffset(SDNode *N, SDValue Addr,
1622                                             SDValue &VAddr,
1623                                             SDValue &Offset) const {
1624   return SelectFlatOffsetImpl(N, Addr, VAddr, Offset, SIInstrFlags::FlatGlobal);
1625 }
1626 
1627 bool AMDGPUDAGToDAGISel::SelectScratchOffset(SDNode *N, SDValue Addr,
1628                                              SDValue &VAddr,
1629                                              SDValue &Offset) const {
1630   return SelectFlatOffsetImpl(N, Addr, VAddr, Offset,
1631                               SIInstrFlags::FlatScratch);
1632 }
1633 
1634 // If this matches zero_extend i32:x, return x
1635 static SDValue matchZExtFromI32(SDValue Op) {
1636   if (Op.getOpcode() != ISD::ZERO_EXTEND)
1637     return SDValue();
1638 
1639   SDValue ExtSrc = Op.getOperand(0);
1640   return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue();
1641 }
1642 
1643 // Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
1644 bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
1645                                            SDValue Addr,
1646                                            SDValue &SAddr,
1647                                            SDValue &VOffset,
1648                                            SDValue &Offset) const {
1649   int64_t ImmOffset = 0;
1650 
1651   // Match the immediate offset first, which canonically is moved as low as
1652   // possible.
1653 
1654   SDValue LHS, RHS;
1655   if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
1656     int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
1657     const SIInstrInfo *TII = Subtarget->getInstrInfo();
1658 
1659     if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS,
1660                                SIInstrFlags::FlatGlobal)) {
1661       Addr = LHS;
1662       ImmOffset = COffsetVal;
1663     } else if (!LHS->isDivergent()) {
1664       if (COffsetVal > 0) {
1665         SDLoc SL(N);
1666         // saddr + large_offset -> saddr +
1667         //                         (voffset = large_offset & ~MaxOffset) +
1668         //                         (large_offset & MaxOffset);
1669         int64_t SplitImmOffset, RemainderOffset;
1670         std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
1671             COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
1672 
1673         if (isUInt<32>(RemainderOffset)) {
1674           SDNode *VMov = CurDAG->getMachineNode(
1675               AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
1676               CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
1677           VOffset = SDValue(VMov, 0);
1678           SAddr = LHS;
1679           Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i16);
1680           return true;
1681         }
1682       }
1683 
1684       // We are adding a 64 bit SGPR and a constant. If constant bus limit
1685       // is 1 we would need to perform 1 or 2 extra moves for each half of
1686       // the constant and it is better to do a scalar add and then issue a
1687       // single VALU instruction to materialize zero. Otherwise it is less
1688       // instructions to perform VALU adds with immediates or inline literals.
1689       unsigned NumLiterals =
1690           !TII->isInlineConstant(APInt(32, COffsetVal & 0xffffffff)) +
1691           !TII->isInlineConstant(APInt(32, COffsetVal >> 32));
1692       if (Subtarget->getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
1693         return false;
1694     }
1695   }
1696 
1697   // Match the variable offset.
1698   if (Addr.getOpcode() == ISD::ADD) {
1699     LHS = Addr.getOperand(0);
1700     RHS = Addr.getOperand(1);
1701 
1702     if (!LHS->isDivergent()) {
1703       // add (i64 sgpr), (zero_extend (i32 vgpr))
1704       if (SDValue ZextRHS = matchZExtFromI32(RHS)) {
1705         SAddr = LHS;
1706         VOffset = ZextRHS;
1707       }
1708     }
1709 
1710     if (!SAddr && !RHS->isDivergent()) {
1711       // add (zero_extend (i32 vgpr)), (i64 sgpr)
1712       if (SDValue ZextLHS = matchZExtFromI32(LHS)) {
1713         SAddr = RHS;
1714         VOffset = ZextLHS;
1715       }
1716     }
1717 
1718     if (SAddr) {
1719       Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
1720       return true;
1721     }
1722   }
1723 
1724   if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF ||
1725       isa<ConstantSDNode>(Addr))
1726     return false;
1727 
1728   // It's cheaper to materialize a single 32-bit zero for vaddr than the two
1729   // moves required to copy a 64-bit SGPR to VGPR.
1730   SAddr = Addr;
1731   SDNode *VMov =
1732       CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32,
1733                              CurDAG->getTargetConstant(0, SDLoc(), MVT::i32));
1734   VOffset = SDValue(VMov, 0);
1735   Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
1736   return true;
1737 }
1738 
1739 static SDValue SelectSAddrFI(SelectionDAG *CurDAG, SDValue SAddr) {
1740   if (auto FI = dyn_cast<FrameIndexSDNode>(SAddr)) {
1741     SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
1742   } else if (SAddr.getOpcode() == ISD::ADD &&
1743              isa<FrameIndexSDNode>(SAddr.getOperand(0))) {
1744     // Materialize this into a scalar move for scalar address to avoid
1745     // readfirstlane.
1746     auto FI = cast<FrameIndexSDNode>(SAddr.getOperand(0));
1747     SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
1748                                               FI->getValueType(0));
1749     SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, SDLoc(SAddr),
1750                                            MVT::i32, TFI, SAddr.getOperand(1)),
1751                     0);
1752   }
1753 
1754   return SAddr;
1755 }
1756 
1757 // Match (32-bit SGPR base) + sext(imm offset)
1758 bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
1759                                             SDValue &SAddr,
1760                                             SDValue &Offset) const {
1761   if (Addr->isDivergent())
1762     return false;
1763 
1764   SDLoc DL(Addr);
1765 
1766   int64_t COffsetVal = 0;
1767 
1768   if (CurDAG->isBaseWithConstantOffset(Addr)) {
1769     COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
1770     SAddr = Addr.getOperand(0);
1771   } else {
1772     SAddr = Addr;
1773   }
1774 
1775   SAddr = SelectSAddrFI(CurDAG, SAddr);
1776 
1777   const SIInstrInfo *TII = Subtarget->getInstrInfo();
1778 
1779   if (!TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS,
1780                               SIInstrFlags::FlatScratch)) {
1781     int64_t SplitImmOffset, RemainderOffset;
1782     std::tie(SplitImmOffset, RemainderOffset) = TII->splitFlatOffset(
1783         COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, SIInstrFlags::FlatScratch);
1784 
1785     COffsetVal = SplitImmOffset;
1786 
1787     SDValue AddOffset =
1788         SAddr.getOpcode() == ISD::TargetFrameIndex
1789             ? getMaterializedScalarImm32(Lo_32(RemainderOffset), DL)
1790             : CurDAG->getTargetConstant(RemainderOffset, DL, MVT::i32);
1791     SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_I32, DL, MVT::i32,
1792                                            SAddr, AddOffset),
1793                     0);
1794   }
1795 
1796   Offset = CurDAG->getTargetConstant(COffsetVal, DL, MVT::i16);
1797 
1798   return true;
1799 }
1800 
1801 bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
1802                                              SDValue &VAddr, SDValue &SAddr,
1803                                              SDValue &Offset) const  {
1804   int64_t ImmOffset = 0;
1805 
1806   SDValue LHS, RHS;
1807   if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
1808     int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
1809     const SIInstrInfo *TII = Subtarget->getInstrInfo();
1810 
1811     if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true)) {
1812       Addr = LHS;
1813       ImmOffset = COffsetVal;
1814     } else if (!LHS->isDivergent() && COffsetVal > 0) {
1815       SDLoc SL(N);
1816       // saddr + large_offset -> saddr + (vaddr = large_offset & ~MaxOffset) +
1817       //                         (large_offset & MaxOffset);
1818       int64_t SplitImmOffset, RemainderOffset;
1819       std::tie(SplitImmOffset, RemainderOffset)
1820         = TII->splitFlatOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true);
1821 
1822       if (isUInt<32>(RemainderOffset)) {
1823         SDNode *VMov = CurDAG->getMachineNode(
1824           AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
1825           CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
1826         VAddr = SDValue(VMov, 0);
1827         SAddr = LHS;
1828         Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i16);
1829         return true;
1830       }
1831     }
1832   }
1833 
1834   if (Addr.getOpcode() != ISD::ADD)
1835     return false;
1836 
1837   LHS = Addr.getOperand(0);
1838   RHS = Addr.getOperand(1);
1839 
1840   if (!LHS->isDivergent() && RHS->isDivergent()) {
1841     SAddr = LHS;
1842     VAddr = RHS;
1843   } else if (!RHS->isDivergent() && LHS->isDivergent()) {
1844     SAddr = RHS;
1845     VAddr = LHS;
1846   } else {
1847     return false;
1848   }
1849 
1850   SAddr = SelectSAddrFI(CurDAG, SAddr);
1851   Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
1852   return true;
1853 }
1854 
1855 bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
1856                                           SDValue &Offset, bool &Imm) const {
1857   ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
1858   if (!C) {
1859     if (ByteOffsetNode.getValueType().isScalarInteger() &&
1860         ByteOffsetNode.getValueType().getSizeInBits() == 32) {
1861       Offset = ByteOffsetNode;
1862       Imm = false;
1863       return true;
1864     }
1865     if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) {
1866       if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) {
1867         Offset = ByteOffsetNode.getOperand(0);
1868         Imm = false;
1869         return true;
1870       }
1871     }
1872     return false;
1873   }
1874 
1875   SDLoc SL(ByteOffsetNode);
1876   // GFX9 and GFX10 have signed byte immediate offsets.
1877   int64_t ByteOffset = C->getSExtValue();
1878   Optional<int64_t> EncodedOffset =
1879       AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset, false);
1880   if (EncodedOffset) {
1881     Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
1882     Imm = true;
1883     return true;
1884   }
1885 
1886   // SGPR and literal offsets are unsigned.
1887   if (ByteOffset < 0)
1888     return false;
1889 
1890   EncodedOffset = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget, ByteOffset);
1891   if (EncodedOffset) {
1892     Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
1893     return true;
1894   }
1895 
1896   if (!isUInt<32>(ByteOffset) && !isInt<32>(ByteOffset))
1897     return false;
1898 
1899   SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
1900   Offset = SDValue(
1901       CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, C32Bit), 0);
1902 
1903   return true;
1904 }
1905 
1906 SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
1907   if (Addr.getValueType() != MVT::i32)
1908     return Addr;
1909 
1910   // Zero-extend a 32-bit address.
1911   SDLoc SL(Addr);
1912 
1913   const MachineFunction &MF = CurDAG->getMachineFunction();
1914   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1915   unsigned AddrHiVal = Info->get32BitAddressHighBits();
1916   SDValue AddrHi = CurDAG->getTargetConstant(AddrHiVal, SL, MVT::i32);
1917 
1918   const SDValue Ops[] = {
1919     CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32),
1920     Addr,
1921     CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
1922     SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi),
1923             0),
1924     CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32),
1925   };
1926 
1927   return SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64,
1928                                         Ops), 0);
1929 }
1930 
1931 bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
1932                                      SDValue &Offset, bool &Imm) const {
1933   SDLoc SL(Addr);
1934 
1935   // A 32-bit (address + offset) should not cause unsigned 32-bit integer
1936   // wraparound, because s_load instructions perform the addition in 64 bits.
1937   if ((Addr.getValueType() != MVT::i32 ||
1938        Addr->getFlags().hasNoUnsignedWrap())) {
1939     SDValue N0, N1;
1940     // Extract the base and offset if possible.
1941     if (CurDAG->isBaseWithConstantOffset(Addr) ||
1942         Addr.getOpcode() == ISD::ADD) {
1943       N0 = Addr.getOperand(0);
1944       N1 = Addr.getOperand(1);
1945     } else if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, N0, N1)) {
1946       assert(N0 && N1 && isa<ConstantSDNode>(N1));
1947     }
1948     if (N0 && N1) {
1949       if (SelectSMRDOffset(N1, Offset, Imm)) {
1950         SBase = Expand32BitAddress(N0);
1951         return true;
1952       }
1953     }
1954   }
1955   SBase = Expand32BitAddress(Addr);
1956   Offset = CurDAG->getTargetConstant(0, SL, MVT::i32);
1957   Imm = true;
1958   return true;
1959 }
1960 
1961 bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
1962                                        SDValue &Offset) const {
1963   bool Imm = false;
1964   return SelectSMRD(Addr, SBase, Offset, Imm) && Imm;
1965 }
1966 
1967 bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
1968                                          SDValue &Offset) const {
1969 
1970   assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
1971 
1972   bool Imm = false;
1973   if (!SelectSMRD(Addr, SBase, Offset, Imm))
1974     return false;
1975 
1976   return !Imm && isa<ConstantSDNode>(Offset);
1977 }
1978 
1979 bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase,
1980                                         SDValue &Offset) const {
1981   bool Imm = false;
1982   return SelectSMRD(Addr, SBase, Offset, Imm) && !Imm &&
1983          !isa<ConstantSDNode>(Offset);
1984 }
1985 
1986 bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue Addr,
1987                                              SDValue &Offset) const {
1988   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr)) {
1989     // The immediate offset for S_BUFFER instructions is unsigned.
1990     if (auto Imm =
1991             AMDGPU::getSMRDEncodedOffset(*Subtarget, C->getZExtValue(), true)) {
1992       Offset = CurDAG->getTargetConstant(*Imm, SDLoc(Addr), MVT::i32);
1993       return true;
1994     }
1995   }
1996 
1997   return false;
1998 }
1999 
2000 bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue Addr,
2001                                                SDValue &Offset) const {
2002   assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2003 
2004   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr)) {
2005     if (auto Imm = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget,
2006                                                          C->getZExtValue())) {
2007       Offset = CurDAG->getTargetConstant(*Imm, SDLoc(Addr), MVT::i32);
2008       return true;
2009     }
2010   }
2011 
2012   return false;
2013 }
2014 
2015 bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
2016                                             SDValue &Base,
2017                                             SDValue &Offset) const {
2018   SDLoc DL(Index);
2019 
2020   if (CurDAG->isBaseWithConstantOffset(Index)) {
2021     SDValue N0 = Index.getOperand(0);
2022     SDValue N1 = Index.getOperand(1);
2023     ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
2024 
2025     // (add n0, c0)
2026     // Don't peel off the offset (c0) if doing so could possibly lead
2027     // the base (n0) to be negative.
2028     // (or n0, |c0|) can never change a sign given isBaseWithConstantOffset.
2029     if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0) ||
2030         (Index->getOpcode() == ISD::OR && C1->getSExtValue() >= 0)) {
2031       Base = N0;
2032       Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
2033       return true;
2034     }
2035   }
2036 
2037   if (isa<ConstantSDNode>(Index))
2038     return false;
2039 
2040   Base = Index;
2041   Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
2042   return true;
2043 }
2044 
2045 SDNode *AMDGPUDAGToDAGISel::getBFE32(bool IsSigned, const SDLoc &DL,
2046                                      SDValue Val, uint32_t Offset,
2047                                      uint32_t Width) {
2048   if (Val->isDivergent()) {
2049     unsigned Opcode = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2050     SDValue Off = CurDAG->getTargetConstant(Offset, DL, MVT::i32);
2051     SDValue W = CurDAG->getTargetConstant(Width, DL, MVT::i32);
2052 
2053     return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, Off, W);
2054   }
2055   unsigned Opcode = IsSigned ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2056   // Transformation function, pack the offset and width of a BFE into
2057   // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
2058   // source, bits [5:0] contain the offset and bits [22:16] the width.
2059   uint32_t PackedVal = Offset | (Width << 16);
2060   SDValue PackedConst = CurDAG->getTargetConstant(PackedVal, DL, MVT::i32);
2061 
2062   return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst);
2063 }
2064 
2065 void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
2066   // "(a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c)
2067   // "(a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c)
2068   // Predicate: 0 < b <= c < 32
2069 
2070   const SDValue &Shl = N->getOperand(0);
2071   ConstantSDNode *B = dyn_cast<ConstantSDNode>(Shl->getOperand(1));
2072   ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
2073 
2074   if (B && C) {
2075     uint32_t BVal = B->getZExtValue();
2076     uint32_t CVal = C->getZExtValue();
2077 
2078     if (0 < BVal && BVal <= CVal && CVal < 32) {
2079       bool Signed = N->getOpcode() == ISD::SRA;
2080       ReplaceNode(N, getBFE32(Signed, SDLoc(N), Shl.getOperand(0), CVal - BVal,
2081                   32 - CVal));
2082       return;
2083     }
2084   }
2085   SelectCode(N);
2086 }
2087 
2088 void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
2089   switch (N->getOpcode()) {
2090   case ISD::AND:
2091     if (N->getOperand(0).getOpcode() == ISD::SRL) {
2092       // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)"
2093       // Predicate: isMask(mask)
2094       const SDValue &Srl = N->getOperand(0);
2095       ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Srl.getOperand(1));
2096       ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1));
2097 
2098       if (Shift && Mask) {
2099         uint32_t ShiftVal = Shift->getZExtValue();
2100         uint32_t MaskVal = Mask->getZExtValue();
2101 
2102         if (isMask_32(MaskVal)) {
2103           uint32_t WidthVal = countPopulation(MaskVal);
2104           ReplaceNode(N, getBFE32(false, SDLoc(N), Srl.getOperand(0), ShiftVal,
2105                                   WidthVal));
2106           return;
2107         }
2108       }
2109     }
2110     break;
2111   case ISD::SRL:
2112     if (N->getOperand(0).getOpcode() == ISD::AND) {
2113       // "(a & mask) srl b)" ---> "BFE_U32 a, b, popcount(mask >> b)"
2114       // Predicate: isMask(mask >> b)
2115       const SDValue &And = N->getOperand(0);
2116       ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(N->getOperand(1));
2117       ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(And->getOperand(1));
2118 
2119       if (Shift && Mask) {
2120         uint32_t ShiftVal = Shift->getZExtValue();
2121         uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal;
2122 
2123         if (isMask_32(MaskVal)) {
2124           uint32_t WidthVal = countPopulation(MaskVal);
2125           ReplaceNode(N, getBFE32(false, SDLoc(N), And.getOperand(0), ShiftVal,
2126                       WidthVal));
2127           return;
2128         }
2129       }
2130     } else if (N->getOperand(0).getOpcode() == ISD::SHL) {
2131       SelectS_BFEFromShifts(N);
2132       return;
2133     }
2134     break;
2135   case ISD::SRA:
2136     if (N->getOperand(0).getOpcode() == ISD::SHL) {
2137       SelectS_BFEFromShifts(N);
2138       return;
2139     }
2140     break;
2141 
2142   case ISD::SIGN_EXTEND_INREG: {
2143     // sext_inreg (srl x, 16), i8 -> bfe_i32 x, 16, 8
2144     SDValue Src = N->getOperand(0);
2145     if (Src.getOpcode() != ISD::SRL)
2146       break;
2147 
2148     const ConstantSDNode *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1));
2149     if (!Amt)
2150       break;
2151 
2152     unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
2153     ReplaceNode(N, getBFE32(true, SDLoc(N), Src.getOperand(0),
2154                             Amt->getZExtValue(), Width));
2155     return;
2156   }
2157   }
2158 
2159   SelectCode(N);
2160 }
2161 
2162 bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const {
2163   assert(N->getOpcode() == ISD::BRCOND);
2164   if (!N->hasOneUse())
2165     return false;
2166 
2167   SDValue Cond = N->getOperand(1);
2168   if (Cond.getOpcode() == ISD::CopyToReg)
2169     Cond = Cond.getOperand(2);
2170 
2171   if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
2172     return false;
2173 
2174   MVT VT = Cond.getOperand(0).getSimpleValueType();
2175   if (VT == MVT::i32)
2176     return true;
2177 
2178   if (VT == MVT::i64) {
2179     auto ST = static_cast<const GCNSubtarget *>(Subtarget);
2180 
2181     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
2182     return (CC == ISD::SETEQ || CC == ISD::SETNE) && ST->hasScalarCompareEq64();
2183   }
2184 
2185   return false;
2186 }
2187 
2188 void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
2189   SDValue Cond = N->getOperand(1);
2190 
2191   if (Cond.isUndef()) {
2192     CurDAG->SelectNodeTo(N, AMDGPU::SI_BR_UNDEF, MVT::Other,
2193                          N->getOperand(2), N->getOperand(0));
2194     return;
2195   }
2196 
2197   const GCNSubtarget *ST = static_cast<const GCNSubtarget *>(Subtarget);
2198   const SIRegisterInfo *TRI = ST->getRegisterInfo();
2199 
2200   bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
2201   unsigned BrOp = UseSCCBr ? AMDGPU::S_CBRANCH_SCC1 : AMDGPU::S_CBRANCH_VCCNZ;
2202   Register CondReg = UseSCCBr ? AMDGPU::SCC : TRI->getVCC();
2203   SDLoc SL(N);
2204 
2205   if (!UseSCCBr) {
2206     // This is the case that we are selecting to S_CBRANCH_VCCNZ.  We have not
2207     // analyzed what generates the vcc value, so we do not know whether vcc
2208     // bits for disabled lanes are 0.  Thus we need to mask out bits for
2209     // disabled lanes.
2210     //
2211     // For the case that we select S_CBRANCH_SCC1 and it gets
2212     // changed to S_CBRANCH_VCCNZ in SIFixSGPRCopies, SIFixSGPRCopies calls
2213     // SIInstrInfo::moveToVALU which inserts the S_AND).
2214     //
2215     // We could add an analysis of what generates the vcc value here and omit
2216     // the S_AND when is unnecessary. But it would be better to add a separate
2217     // pass after SIFixSGPRCopies to do the unnecessary S_AND removal, so it
2218     // catches both cases.
2219     Cond = SDValue(CurDAG->getMachineNode(ST->isWave32() ? AMDGPU::S_AND_B32
2220                                                          : AMDGPU::S_AND_B64,
2221                      SL, MVT::i1,
2222                      CurDAG->getRegister(ST->isWave32() ? AMDGPU::EXEC_LO
2223                                                         : AMDGPU::EXEC,
2224                                          MVT::i1),
2225                     Cond),
2226                    0);
2227   }
2228 
2229   SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, CondReg, Cond);
2230   CurDAG->SelectNodeTo(N, BrOp, MVT::Other,
2231                        N->getOperand(2), // Basic Block
2232                        VCC.getValue(0));
2233 }
2234 
2235 void AMDGPUDAGToDAGISel::SelectFMAD_FMA(SDNode *N) {
2236   MVT VT = N->getSimpleValueType(0);
2237   bool IsFMA = N->getOpcode() == ISD::FMA;
2238   if (VT != MVT::f32 || (!Subtarget->hasMadMixInsts() &&
2239                          !Subtarget->hasFmaMixInsts()) ||
2240       ((IsFMA && Subtarget->hasMadMixInsts()) ||
2241        (!IsFMA && Subtarget->hasFmaMixInsts()))) {
2242     SelectCode(N);
2243     return;
2244   }
2245 
2246   SDValue Src0 = N->getOperand(0);
2247   SDValue Src1 = N->getOperand(1);
2248   SDValue Src2 = N->getOperand(2);
2249   unsigned Src0Mods, Src1Mods, Src2Mods;
2250 
2251   // Avoid using v_mad_mix_f32/v_fma_mix_f32 unless there is actually an operand
2252   // using the conversion from f16.
2253   bool Sel0 = SelectVOP3PMadMixModsImpl(Src0, Src0, Src0Mods);
2254   bool Sel1 = SelectVOP3PMadMixModsImpl(Src1, Src1, Src1Mods);
2255   bool Sel2 = SelectVOP3PMadMixModsImpl(Src2, Src2, Src2Mods);
2256 
2257   assert((IsFMA || !Mode.allFP32Denormals()) &&
2258          "fmad selected with denormals enabled");
2259   // TODO: We can select this with f32 denormals enabled if all the sources are
2260   // converted from f16 (in which case fmad isn't legal).
2261 
2262   if (Sel0 || Sel1 || Sel2) {
2263     // For dummy operands.
2264     SDValue Zero = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
2265     SDValue Ops[] = {
2266       CurDAG->getTargetConstant(Src0Mods, SDLoc(), MVT::i32), Src0,
2267       CurDAG->getTargetConstant(Src1Mods, SDLoc(), MVT::i32), Src1,
2268       CurDAG->getTargetConstant(Src2Mods, SDLoc(), MVT::i32), Src2,
2269       CurDAG->getTargetConstant(0, SDLoc(), MVT::i1),
2270       Zero, Zero
2271     };
2272 
2273     CurDAG->SelectNodeTo(N,
2274                          IsFMA ? AMDGPU::V_FMA_MIX_F32 : AMDGPU::V_MAD_MIX_F32,
2275                          MVT::f32, Ops);
2276   } else {
2277     SelectCode(N);
2278   }
2279 }
2280 
2281 // This is here because there isn't a way to use the generated sub0_sub1 as the
2282 // subreg index to EXTRACT_SUBREG in tablegen.
2283 void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) {
2284   MemSDNode *Mem = cast<MemSDNode>(N);
2285   unsigned AS = Mem->getAddressSpace();
2286   if (AS == AMDGPUAS::FLAT_ADDRESS) {
2287     SelectCode(N);
2288     return;
2289   }
2290 
2291   MVT VT = N->getSimpleValueType(0);
2292   bool Is32 = (VT == MVT::i32);
2293   SDLoc SL(N);
2294 
2295   MachineSDNode *CmpSwap = nullptr;
2296   if (Subtarget->hasAddr64()) {
2297     SDValue SRsrc, VAddr, SOffset, Offset;
2298 
2299     if (SelectMUBUFAddr64(Mem->getBasePtr(), SRsrc, VAddr, SOffset, Offset)) {
2300       unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN :
2301         AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN;
2302       SDValue CmpVal = Mem->getOperand(2);
2303       SDValue CPol = CurDAG->getTargetConstant(AMDGPU::CPol::GLC, SL, MVT::i32);
2304 
2305       // XXX - Do we care about glue operands?
2306 
2307       SDValue Ops[] = {CmpVal, VAddr, SRsrc, SOffset, Offset, CPol,
2308                        Mem->getChain()};
2309 
2310       CmpSwap = CurDAG->getMachineNode(Opcode, SL, Mem->getVTList(), Ops);
2311     }
2312   }
2313 
2314   if (!CmpSwap) {
2315     SDValue SRsrc, SOffset, Offset;
2316     if (SelectMUBUFOffset(Mem->getBasePtr(), SRsrc, SOffset, Offset)) {
2317       unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN :
2318         AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN;
2319 
2320       SDValue CmpVal = Mem->getOperand(2);
2321       SDValue CPol = CurDAG->getTargetConstant(AMDGPU::CPol::GLC, SL, MVT::i32);
2322       SDValue Ops[] = {CmpVal, SRsrc, SOffset, Offset, CPol, Mem->getChain()};
2323 
2324       CmpSwap = CurDAG->getMachineNode(Opcode, SL, Mem->getVTList(), Ops);
2325     }
2326   }
2327 
2328   if (!CmpSwap) {
2329     SelectCode(N);
2330     return;
2331   }
2332 
2333   MachineMemOperand *MMO = Mem->getMemOperand();
2334   CurDAG->setNodeMemRefs(CmpSwap, {MMO});
2335 
2336   unsigned SubReg = Is32 ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
2337   SDValue Extract
2338     = CurDAG->getTargetExtractSubreg(SubReg, SL, VT, SDValue(CmpSwap, 0));
2339 
2340   ReplaceUses(SDValue(N, 0), Extract);
2341   ReplaceUses(SDValue(N, 1), SDValue(CmpSwap, 1));
2342   CurDAG->RemoveDeadNode(N);
2343 }
2344 
2345 void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
2346   // The address is assumed to be uniform, so if it ends up in a VGPR, it will
2347   // be copied to an SGPR with readfirstlane.
2348   unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
2349     AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
2350 
2351   SDValue Chain = N->getOperand(0);
2352   SDValue Ptr = N->getOperand(2);
2353   MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2354   MachineMemOperand *MMO = M->getMemOperand();
2355   bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
2356 
2357   SDValue Offset;
2358   if (CurDAG->isBaseWithConstantOffset(Ptr)) {
2359     SDValue PtrBase = Ptr.getOperand(0);
2360     SDValue PtrOffset = Ptr.getOperand(1);
2361 
2362     const APInt &OffsetVal = cast<ConstantSDNode>(PtrOffset)->getAPIntValue();
2363     if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue())) {
2364       N = glueCopyToM0(N, PtrBase);
2365       Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
2366     }
2367   }
2368 
2369   if (!Offset) {
2370     N = glueCopyToM0(N, Ptr);
2371     Offset = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
2372   }
2373 
2374   SDValue Ops[] = {
2375     Offset,
2376     CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32),
2377     Chain,
2378     N->getOperand(N->getNumOperands() - 1) // New glue
2379   };
2380 
2381   SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2382   CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2383 }
2384 
2385 static unsigned gwsIntrinToOpcode(unsigned IntrID) {
2386   switch (IntrID) {
2387   case Intrinsic::amdgcn_ds_gws_init:
2388     return AMDGPU::DS_GWS_INIT;
2389   case Intrinsic::amdgcn_ds_gws_barrier:
2390     return AMDGPU::DS_GWS_BARRIER;
2391   case Intrinsic::amdgcn_ds_gws_sema_v:
2392     return AMDGPU::DS_GWS_SEMA_V;
2393   case Intrinsic::amdgcn_ds_gws_sema_br:
2394     return AMDGPU::DS_GWS_SEMA_BR;
2395   case Intrinsic::amdgcn_ds_gws_sema_p:
2396     return AMDGPU::DS_GWS_SEMA_P;
2397   case Intrinsic::amdgcn_ds_gws_sema_release_all:
2398     return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
2399   default:
2400     llvm_unreachable("not a gws intrinsic");
2401   }
2402 }
2403 
2404 void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
2405   if (IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
2406       !Subtarget->hasGWSSemaReleaseAll()) {
2407     // Let this error.
2408     SelectCode(N);
2409     return;
2410   }
2411 
2412   // Chain, intrinsic ID, vsrc, offset
2413   const bool HasVSrc = N->getNumOperands() == 4;
2414   assert(HasVSrc || N->getNumOperands() == 3);
2415 
2416   SDLoc SL(N);
2417   SDValue BaseOffset = N->getOperand(HasVSrc ? 3 : 2);
2418   int ImmOffset = 0;
2419   MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2420   MachineMemOperand *MMO = M->getMemOperand();
2421 
2422   // Don't worry if the offset ends up in a VGPR. Only one lane will have
2423   // effect, so SIFixSGPRCopies will validly insert readfirstlane.
2424 
2425   // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
2426   // offset field) % 64. Some versions of the programming guide omit the m0
2427   // part, or claim it's from offset 0.
2428   if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(BaseOffset)) {
2429     // If we have a constant offset, try to use the 0 in m0 as the base.
2430     // TODO: Look into changing the default m0 initialization value. If the
2431     // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
2432     // the immediate offset.
2433     glueCopyToM0(N, CurDAG->getTargetConstant(0, SL, MVT::i32));
2434     ImmOffset = ConstOffset->getZExtValue();
2435   } else {
2436     if (CurDAG->isBaseWithConstantOffset(BaseOffset)) {
2437       ImmOffset = BaseOffset.getConstantOperandVal(1);
2438       BaseOffset = BaseOffset.getOperand(0);
2439     }
2440 
2441     // Prefer to do the shift in an SGPR since it should be possible to use m0
2442     // as the result directly. If it's already an SGPR, it will be eliminated
2443     // later.
2444     SDNode *SGPROffset
2445       = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32,
2446                                BaseOffset);
2447     // Shift to offset in m0
2448     SDNode *M0Base
2449       = CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
2450                                SDValue(SGPROffset, 0),
2451                                CurDAG->getTargetConstant(16, SL, MVT::i32));
2452     glueCopyToM0(N, SDValue(M0Base, 0));
2453   }
2454 
2455   SDValue Chain = N->getOperand(0);
2456   SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32);
2457 
2458   const unsigned Opc = gwsIntrinToOpcode(IntrID);
2459   SmallVector<SDValue, 5> Ops;
2460   if (HasVSrc)
2461     Ops.push_back(N->getOperand(2));
2462   Ops.push_back(OffsetField);
2463   Ops.push_back(Chain);
2464 
2465   SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2466   CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2467 }
2468 
2469 void AMDGPUDAGToDAGISel::SelectInterpP1F16(SDNode *N) {
2470   if (Subtarget->getLDSBankCount() != 16) {
2471     // This is a single instruction with a pattern.
2472     SelectCode(N);
2473     return;
2474   }
2475 
2476   SDLoc DL(N);
2477 
2478   // This requires 2 instructions. It is possible to write a pattern to support
2479   // this, but the generated isel emitter doesn't correctly deal with multiple
2480   // output instructions using the same physical register input. The copy to m0
2481   // is incorrectly placed before the second instruction.
2482   //
2483   // TODO: Match source modifiers.
2484   //
2485   // def : Pat <
2486   //   (int_amdgcn_interp_p1_f16
2487   //    (VOP3Mods f32:$src0, i32:$src0_modifiers),
2488   //                             (i32 timm:$attrchan), (i32 timm:$attr),
2489   //                             (i1 timm:$high), M0),
2490   //   (V_INTERP_P1LV_F16 $src0_modifiers, VGPR_32:$src0, timm:$attr,
2491   //       timm:$attrchan, 0,
2492   //       (V_INTERP_MOV_F32 2, timm:$attr, timm:$attrchan), timm:$high)> {
2493   //   let Predicates = [has16BankLDS];
2494   // }
2495 
2496   // 16 bank LDS
2497   SDValue ToM0 = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, AMDGPU::M0,
2498                                       N->getOperand(5), SDValue());
2499 
2500   SDVTList VTs = CurDAG->getVTList(MVT::f32, MVT::Other);
2501 
2502   SDNode *InterpMov =
2503     CurDAG->getMachineNode(AMDGPU::V_INTERP_MOV_F32, DL, VTs, {
2504         CurDAG->getTargetConstant(2, DL, MVT::i32), // P0
2505         N->getOperand(3),  // Attr
2506         N->getOperand(2),  // Attrchan
2507         ToM0.getValue(1) // In glue
2508   });
2509 
2510   SDNode *InterpP1LV =
2511     CurDAG->getMachineNode(AMDGPU::V_INTERP_P1LV_F16, DL, MVT::f32, {
2512         CurDAG->getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
2513         N->getOperand(1), // Src0
2514         N->getOperand(3), // Attr
2515         N->getOperand(2), // Attrchan
2516         CurDAG->getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
2517         SDValue(InterpMov, 0), // Src2 - holds two f16 values selected by high
2518         N->getOperand(4), // high
2519         CurDAG->getTargetConstant(0, DL, MVT::i1), // $clamp
2520         CurDAG->getTargetConstant(0, DL, MVT::i32), // $omod
2521         SDValue(InterpMov, 1)
2522   });
2523 
2524   CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), SDValue(InterpP1LV, 0));
2525 }
2526 
2527 void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
2528   unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
2529   switch (IntrID) {
2530   case Intrinsic::amdgcn_ds_append:
2531   case Intrinsic::amdgcn_ds_consume: {
2532     if (N->getValueType(0) != MVT::i32)
2533       break;
2534     SelectDSAppendConsume(N, IntrID);
2535     return;
2536   }
2537   }
2538 
2539   SelectCode(N);
2540 }
2541 
2542 void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
2543   unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
2544   unsigned Opcode;
2545   switch (IntrID) {
2546   case Intrinsic::amdgcn_wqm:
2547     Opcode = AMDGPU::WQM;
2548     break;
2549   case Intrinsic::amdgcn_softwqm:
2550     Opcode = AMDGPU::SOFT_WQM;
2551     break;
2552   case Intrinsic::amdgcn_wwm:
2553   case Intrinsic::amdgcn_strict_wwm:
2554     Opcode = AMDGPU::STRICT_WWM;
2555     break;
2556   case Intrinsic::amdgcn_strict_wqm:
2557     Opcode = AMDGPU::STRICT_WQM;
2558     break;
2559   case Intrinsic::amdgcn_interp_p1_f16:
2560     SelectInterpP1F16(N);
2561     return;
2562   default:
2563     SelectCode(N);
2564     return;
2565   }
2566 
2567   SDValue Src = N->getOperand(1);
2568   CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), {Src});
2569 }
2570 
2571 void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
2572   unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
2573   switch (IntrID) {
2574   case Intrinsic::amdgcn_ds_gws_init:
2575   case Intrinsic::amdgcn_ds_gws_barrier:
2576   case Intrinsic::amdgcn_ds_gws_sema_v:
2577   case Intrinsic::amdgcn_ds_gws_sema_br:
2578   case Intrinsic::amdgcn_ds_gws_sema_p:
2579   case Intrinsic::amdgcn_ds_gws_sema_release_all:
2580     SelectDS_GWS(N, IntrID);
2581     return;
2582   default:
2583     break;
2584   }
2585 
2586   SelectCode(N);
2587 }
2588 
2589 bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
2590                                             unsigned &Mods,
2591                                             bool AllowAbs) const {
2592   Mods = 0;
2593   Src = In;
2594 
2595   if (Src.getOpcode() == ISD::FNEG) {
2596     Mods |= SISrcMods::NEG;
2597     Src = Src.getOperand(0);
2598   }
2599 
2600   if (AllowAbs && Src.getOpcode() == ISD::FABS) {
2601     Mods |= SISrcMods::ABS;
2602     Src = Src.getOperand(0);
2603   }
2604 
2605   return true;
2606 }
2607 
2608 bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
2609                                         SDValue &SrcMods) const {
2610   unsigned Mods;
2611   if (SelectVOP3ModsImpl(In, Src, Mods)) {
2612     SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2613     return true;
2614   }
2615 
2616   return false;
2617 }
2618 
2619 bool AMDGPUDAGToDAGISel::SelectVOP3BMods(SDValue In, SDValue &Src,
2620                                          SDValue &SrcMods) const {
2621   unsigned Mods;
2622   if (SelectVOP3ModsImpl(In, Src, Mods, /* AllowAbs */ false)) {
2623     SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2624     return true;
2625   }
2626 
2627   return false;
2628 }
2629 
2630 bool AMDGPUDAGToDAGISel::SelectVOP3Mods_NNaN(SDValue In, SDValue &Src,
2631                                              SDValue &SrcMods) const {
2632   SelectVOP3Mods(In, Src, SrcMods);
2633   return isNoNanSrc(Src);
2634 }
2635 
2636 bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
2637   if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG)
2638     return false;
2639 
2640   Src = In;
2641   return true;
2642 }
2643 
2644 bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
2645                                          SDValue &SrcMods, SDValue &Clamp,
2646                                          SDValue &Omod) const {
2647   SDLoc DL(In);
2648   Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
2649   Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
2650 
2651   return SelectVOP3Mods(In, Src, SrcMods);
2652 }
2653 
2654 bool AMDGPUDAGToDAGISel::SelectVOP3BMods0(SDValue In, SDValue &Src,
2655                                           SDValue &SrcMods, SDValue &Clamp,
2656                                           SDValue &Omod) const {
2657   SDLoc DL(In);
2658   Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
2659   Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
2660 
2661   return SelectVOP3BMods(In, Src, SrcMods);
2662 }
2663 
2664 bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
2665                                          SDValue &Clamp, SDValue &Omod) const {
2666   Src = In;
2667 
2668   SDLoc DL(In);
2669   Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
2670   Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
2671 
2672   return true;
2673 }
2674 
2675 bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
2676                                          SDValue &SrcMods, bool IsDOT) const {
2677   unsigned Mods = 0;
2678   Src = In;
2679 
2680   if (Src.getOpcode() == ISD::FNEG) {
2681     Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
2682     Src = Src.getOperand(0);
2683   }
2684 
2685   if (Src.getOpcode() == ISD::BUILD_VECTOR &&
2686       (!IsDOT || !Subtarget->hasDOTOpSelHazard())) {
2687     unsigned VecMods = Mods;
2688 
2689     SDValue Lo = stripBitcast(Src.getOperand(0));
2690     SDValue Hi = stripBitcast(Src.getOperand(1));
2691 
2692     if (Lo.getOpcode() == ISD::FNEG) {
2693       Lo = stripBitcast(Lo.getOperand(0));
2694       Mods ^= SISrcMods::NEG;
2695     }
2696 
2697     if (Hi.getOpcode() == ISD::FNEG) {
2698       Hi = stripBitcast(Hi.getOperand(0));
2699       Mods ^= SISrcMods::NEG_HI;
2700     }
2701 
2702     if (isExtractHiElt(Lo, Lo))
2703       Mods |= SISrcMods::OP_SEL_0;
2704 
2705     if (isExtractHiElt(Hi, Hi))
2706       Mods |= SISrcMods::OP_SEL_1;
2707 
2708     unsigned VecSize = Src.getValueSizeInBits();
2709     Lo = stripExtractLoElt(Lo);
2710     Hi = stripExtractLoElt(Hi);
2711 
2712     if (Lo.getValueSizeInBits() > VecSize) {
2713       Lo = CurDAG->getTargetExtractSubreg(
2714         (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
2715         MVT::getIntegerVT(VecSize), Lo);
2716     }
2717 
2718     if (Hi.getValueSizeInBits() > VecSize) {
2719       Hi = CurDAG->getTargetExtractSubreg(
2720         (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
2721         MVT::getIntegerVT(VecSize), Hi);
2722     }
2723 
2724     assert(Lo.getValueSizeInBits() <= VecSize &&
2725            Hi.getValueSizeInBits() <= VecSize);
2726 
2727     if (Lo == Hi && !isInlineImmediate(Lo.getNode())) {
2728       // Really a scalar input. Just select from the low half of the register to
2729       // avoid packing.
2730 
2731       if (VecSize == 32 || VecSize == Lo.getValueSizeInBits()) {
2732         Src = Lo;
2733       } else {
2734         assert(Lo.getValueSizeInBits() == 32 && VecSize == 64);
2735 
2736         SDLoc SL(In);
2737         SDValue Undef = SDValue(
2738           CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL,
2739                                  Lo.getValueType()), 0);
2740         auto RC = Lo->isDivergent() ? AMDGPU::VReg_64RegClassID
2741                                     : AMDGPU::SReg_64RegClassID;
2742         const SDValue Ops[] = {
2743           CurDAG->getTargetConstant(RC, SL, MVT::i32),
2744           Lo, CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
2745           Undef, CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32) };
2746 
2747         Src = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
2748                                              Src.getValueType(), Ops), 0);
2749       }
2750       SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2751       return true;
2752     }
2753 
2754     if (VecSize == 64 && Lo == Hi && isa<ConstantFPSDNode>(Lo)) {
2755       uint64_t Lit = cast<ConstantFPSDNode>(Lo)->getValueAPF()
2756                       .bitcastToAPInt().getZExtValue();
2757       if (AMDGPU::isInlinableLiteral32(Lit, Subtarget->hasInv2PiInlineImm())) {
2758         Src = CurDAG->getTargetConstant(Lit, SDLoc(In), MVT::i64);;
2759         SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2760         return true;
2761       }
2762     }
2763 
2764     Mods = VecMods;
2765   }
2766 
2767   // Packed instructions do not have abs modifiers.
2768   Mods |= SISrcMods::OP_SEL_1;
2769 
2770   SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2771   return true;
2772 }
2773 
2774 bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src,
2775                                             SDValue &SrcMods) const {
2776   return SelectVOP3PMods(In, Src, SrcMods, true);
2777 }
2778 
2779 bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
2780                                          SDValue &SrcMods) const {
2781   Src = In;
2782   // FIXME: Handle op_sel
2783   SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
2784   return true;
2785 }
2786 
2787 bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
2788                                              SDValue &SrcMods) const {
2789   // FIXME: Handle op_sel
2790   return SelectVOP3Mods(In, Src, SrcMods);
2791 }
2792 
2793 // The return value is not whether the match is possible (which it always is),
2794 // but whether or not it a conversion is really used.
2795 bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
2796                                                    unsigned &Mods) const {
2797   Mods = 0;
2798   SelectVOP3ModsImpl(In, Src, Mods);
2799 
2800   if (Src.getOpcode() == ISD::FP_EXTEND) {
2801     Src = Src.getOperand(0);
2802     assert(Src.getValueType() == MVT::f16);
2803     Src = stripBitcast(Src);
2804 
2805     // Be careful about folding modifiers if we already have an abs. fneg is
2806     // applied last, so we don't want to apply an earlier fneg.
2807     if ((Mods & SISrcMods::ABS) == 0) {
2808       unsigned ModsTmp;
2809       SelectVOP3ModsImpl(Src, Src, ModsTmp);
2810 
2811       if ((ModsTmp & SISrcMods::NEG) != 0)
2812         Mods ^= SISrcMods::NEG;
2813 
2814       if ((ModsTmp & SISrcMods::ABS) != 0)
2815         Mods |= SISrcMods::ABS;
2816     }
2817 
2818     // op_sel/op_sel_hi decide the source type and source.
2819     // If the source's op_sel_hi is set, it indicates to do a conversion from fp16.
2820     // If the sources's op_sel is set, it picks the high half of the source
2821     // register.
2822 
2823     Mods |= SISrcMods::OP_SEL_1;
2824     if (isExtractHiElt(Src, Src)) {
2825       Mods |= SISrcMods::OP_SEL_0;
2826 
2827       // TODO: Should we try to look for neg/abs here?
2828     }
2829 
2830     return true;
2831   }
2832 
2833   return false;
2834 }
2835 
2836 bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
2837                                                SDValue &SrcMods) const {
2838   unsigned Mods = 0;
2839   SelectVOP3PMadMixModsImpl(In, Src, Mods);
2840   SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2841   return true;
2842 }
2843 
2844 SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
2845   if (In.isUndef())
2846     return CurDAG->getUNDEF(MVT::i32);
2847 
2848   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) {
2849     SDLoc SL(In);
2850     return CurDAG->getConstant(C->getZExtValue() << 16, SL, MVT::i32);
2851   }
2852 
2853   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) {
2854     SDLoc SL(In);
2855     return CurDAG->getConstant(
2856       C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
2857   }
2858 
2859   SDValue Src;
2860   if (isExtractHiElt(In, Src))
2861     return Src;
2862 
2863   return SDValue();
2864 }
2865 
2866 bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
2867   assert(CurDAG->getTarget().getTargetTriple().getArch() == Triple::amdgcn);
2868 
2869   const SIRegisterInfo *SIRI =
2870     static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
2871   const SIInstrInfo * SII =
2872     static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
2873 
2874   unsigned Limit = 0;
2875   bool AllUsesAcceptSReg = true;
2876   for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
2877     Limit < 10 && U != E; ++U, ++Limit) {
2878     const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo());
2879 
2880     // If the register class is unknown, it could be an unknown
2881     // register class that needs to be an SGPR, e.g. an inline asm
2882     // constraint
2883     if (!RC || SIRI->isSGPRClass(RC))
2884       return false;
2885 
2886     if (RC != &AMDGPU::VS_32RegClass) {
2887       AllUsesAcceptSReg = false;
2888       SDNode * User = *U;
2889       if (User->isMachineOpcode()) {
2890         unsigned Opc = User->getMachineOpcode();
2891         MCInstrDesc Desc = SII->get(Opc);
2892         if (Desc.isCommutable()) {
2893           unsigned OpIdx = Desc.getNumDefs() + U.getOperandNo();
2894           unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
2895           if (SII->findCommutedOpIndices(Desc, OpIdx, CommuteIdx1)) {
2896             unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs();
2897             const TargetRegisterClass *CommutedRC = getOperandRegClass(*U, CommutedOpNo);
2898             if (CommutedRC == &AMDGPU::VS_32RegClass)
2899               AllUsesAcceptSReg = true;
2900           }
2901         }
2902       }
2903       // If "AllUsesAcceptSReg == false" so far we haven't succeeded
2904       // commuting current user. This means have at least one use
2905       // that strictly require VGPR. Thus, we will not attempt to commute
2906       // other user instructions.
2907       if (!AllUsesAcceptSReg)
2908         break;
2909     }
2910   }
2911   return !AllUsesAcceptSReg && (Limit < 10);
2912 }
2913 
2914 bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode * N) const {
2915   auto Ld = cast<LoadSDNode>(N);
2916 
2917   return Ld->getAlignment() >= 4 &&
2918         (
2919           (
2920             (
2921               Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS       ||
2922               Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT
2923             )
2924             &&
2925             !N->isDivergent()
2926           )
2927           ||
2928           (
2929             Subtarget->getScalarizeGlobalBehavior() &&
2930             Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
2931             Ld->isSimple() &&
2932             !N->isDivergent() &&
2933             static_cast<const SITargetLowering *>(
2934               getTargetLowering())->isMemOpHasNoClobberedMemOperand(N)
2935           )
2936         );
2937 }
2938 
2939 void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
2940   const AMDGPUTargetLowering& Lowering =
2941     *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
2942   bool IsModified = false;
2943   do {
2944     IsModified = false;
2945 
2946     // Go over all selected nodes and try to fold them a bit more
2947     SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_begin();
2948     while (Position != CurDAG->allnodes_end()) {
2949       SDNode *Node = &*Position++;
2950       MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(Node);
2951       if (!MachineNode)
2952         continue;
2953 
2954       SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
2955       if (ResNode != Node) {
2956         if (ResNode)
2957           ReplaceUses(Node, ResNode);
2958         IsModified = true;
2959       }
2960     }
2961     CurDAG->RemoveDeadNodes();
2962   } while (IsModified);
2963 }
2964