1 //===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //==-----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Defines an instruction selector for the AMDGPU target.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPU.h"
15 #include "AMDGPUTargetMachine.h"
16 #include "SIMachineFunctionInfo.h"
17 #include "llvm/Analysis/LegacyDivergenceAnalysis.h"
18 #include "llvm/Analysis/ValueTracking.h"
19 #include "llvm/CodeGen/FunctionLoweringInfo.h"
20 #include "llvm/CodeGen/SelectionDAG.h"
21 #include "llvm/CodeGen/SelectionDAGISel.h"
22 #include "llvm/CodeGen/SelectionDAGNodes.h"
23 #include "llvm/IR/IntrinsicsAMDGPU.h"
24 #include "llvm/InitializePasses.h"
25 
26 #ifdef EXPENSIVE_CHECKS
27 #include "llvm/Analysis/LoopInfo.h"
28 #include "llvm/IR/Dominators.h"
29 #endif
30 
31 #define DEBUG_TYPE "isel"
32 
33 using namespace llvm;
34 
35 namespace llvm {
36 
37 class R600InstrInfo;
38 
39 } // end namespace llvm
40 
41 //===----------------------------------------------------------------------===//
42 // Instruction Selector Implementation
43 //===----------------------------------------------------------------------===//
44 
45 namespace {
46 
47 static bool isNullConstantOrUndef(SDValue V) {
48   if (V.isUndef())
49     return true;
50 
51   ConstantSDNode *Const = dyn_cast<ConstantSDNode>(V);
52   return Const != nullptr && Const->isNullValue();
53 }
54 
/// Extract a 32-bit constant from \p N into \p Out, treating undef as 0.
/// Handles both integer constants and FP constants (the FP value is bitcast
/// to its integer representation). Returns false if \p N is not a constant.
static bool getConstantValue(SDValue N, uint32_t &Out) {
  // This is only used for packed vectors, where using 0 for undef should
  // always be good.
  if (N.isUndef()) {
    Out = 0;
    return true;
  }

  if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) {
    Out = C->getAPIntValue().getSExtValue();
    return true;
  }

  if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N)) {
    // Bitcast the FP value so e.g. an f16 constant packs its raw bits.
    Out = C->getValueAPF().bitcastToAPInt().getSExtValue();
    return true;
  }

  return false;
}
75 
76 // TODO: Handle undef as zero
77 static SDNode *packConstantV2I16(const SDNode *N, SelectionDAG &DAG,
78                                  bool Negate = false) {
79   assert(N->getOpcode() == ISD::BUILD_VECTOR && N->getNumOperands() == 2);
80   uint32_t LHSVal, RHSVal;
81   if (getConstantValue(N->getOperand(0), LHSVal) &&
82       getConstantValue(N->getOperand(1), RHSVal)) {
83     SDLoc SL(N);
84     uint32_t K = Negate ?
85       (-LHSVal & 0xffff) | (-RHSVal << 16) :
86       (LHSVal & 0xffff) | (RHSVal << 16);
87     return DAG.getMachineNode(AMDGPU::S_MOV_B32, SL, N->getValueType(0),
88                               DAG.getTargetConstant(K, SL, MVT::i32));
89   }
90 
91   return nullptr;
92 }
93 
/// Variant of packConstantV2I16 that negates both elements before packing.
static SDNode *packNegConstantV2I16(const SDNode *N, SelectionDAG &DAG) {
  return packConstantV2I16(N, DAG, true);
}
97 
/// AMDGPU specific code to select AMDGPU machine instructions for
/// SelectionDAG operations.
class AMDGPUDAGToDAGISel : public SelectionDAGISel {
  // Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can
  // make the right decision when generating code for different targets.
  const GCNSubtarget *Subtarget;

  // Default FP mode for the current function.
  AMDGPU::SIModeRegisterDefaults Mode;

  bool EnableLateStructurizeCFG;

public:
  // NOTE(review): TM is dereferenced below, so the nullptr default is only
  // safe when a TargetMachine is always supplied; the default argument
  // presumably exists for pass-registration machinery — confirm.
  explicit AMDGPUDAGToDAGISel(TargetMachine *TM = nullptr,
                              CodeGenOpt::Level OptLevel = CodeGenOpt::Default)
    : SelectionDAGISel(*TM, OptLevel) {
    EnableLateStructurizeCFG = AMDGPUTargetMachine::EnableLateStructurizeCFG;
  }
  ~AMDGPUDAGToDAGISel() override = default;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AMDGPUArgumentUsageInfo>();
    AU.addRequired<LegacyDivergenceAnalysis>();
#ifdef EXPENSIVE_CHECKS
    // Needed for the LCSSA verification done in runOnMachineFunction.
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
#endif
    SelectionDAGISel::getAnalysisUsage(AU);
  }

  bool matchLoadD16FromBuildVector(SDNode *N) const;

  bool runOnMachineFunction(MachineFunction &MF) override;
  void PreprocessISelDAG() override;
  void Select(SDNode *N) override;
  StringRef getPassName() const override;
  void PostprocessISelDAG() override;

protected:
  void SelectBuildVector(SDNode *N, unsigned RegClassID);

private:
  std::pair<SDValue, SDValue> foldFrameIndex(SDValue N) const;
  bool isNoNanSrc(SDValue N) const;
  bool isInlineImmediate(const SDNode *N, bool Negated = false) const;
  bool isNegInlineImmediate(const SDNode *N) const {
    return isInlineImmediate(N, true);
  }

  // Inline-immediate checks by operand width; all honor the subtarget's
  // 1/(2*pi) inline-constant support.
  bool isInlineImmediate16(int64_t Imm) const {
    return AMDGPU::isInlinableLiteral16(Imm, Subtarget->hasInv2PiInlineImm());
  }

  bool isInlineImmediate32(int64_t Imm) const {
    return AMDGPU::isInlinableLiteral32(Imm, Subtarget->hasInv2PiInlineImm());
  }

  bool isInlineImmediate64(int64_t Imm) const {
    return AMDGPU::isInlinableLiteral64(Imm, Subtarget->hasInv2PiInlineImm());
  }

  bool isInlineImmediate(const APFloat &Imm) const {
    return Subtarget->getInstrInfo()->isInlineConstant(Imm);
  }

  bool isVGPRImm(const SDNode *N) const;
  bool isUniformLoad(const SDNode *N) const;
  bool isUniformBr(const SDNode *N) const;

  bool isBaseWithConstantOffset64(SDValue Addr, SDValue &LHS,
                                  SDValue &RHS) const;

  MachineSDNode *buildSMovImm64(SDLoc &DL, uint64_t Val, EVT VT) const;

  // Helpers that splice a CopyToReg of M0 into a node's chain/glue.
  SDNode *glueCopyToOp(SDNode *N, SDValue NewChain, SDValue Glue) const;
  SDNode *glueCopyToM0(SDNode *N, SDValue Val) const;
  SDNode *glueCopyToM0LDSInit(SDNode *N) const;

  const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const;
  virtual bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
  virtual bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset);
  // DS (LDS/GDS) addressing-mode selection.
  bool isDSOffsetLegal(SDValue Base, unsigned Offset) const;
  bool isDSOffset2Legal(SDValue Base, unsigned Offset0, unsigned Offset1,
                        unsigned Size) const;
  bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
  bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
                                 SDValue &Offset1) const;
  bool SelectDS128Bit8ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
                                  SDValue &Offset1) const;
  bool SelectDSReadWrite2(SDValue Ptr, SDValue &Base, SDValue &Offset0,
                          SDValue &Offset1, unsigned Size) const;
  // MUBUF (buffer) addressing-mode selection.
  bool SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
                   SDValue &SOffset, SDValue &Offset, SDValue &Offen,
                   SDValue &Idxen, SDValue &Addr64, SDValue &CPol, SDValue &TFE,
                   SDValue &SWZ) const;
  bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
                         SDValue &SOffset, SDValue &Offset, SDValue &CPol,
                         SDValue &TFE, SDValue &SWZ) const;
  bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
                         SDValue &SOffset, SDValue &Offset) const;
  bool SelectMUBUFScratchOffen(SDNode *Parent,
                               SDValue Addr, SDValue &RSrc, SDValue &VAddr,
                               SDValue &SOffset, SDValue &ImmOffset) const;
  bool SelectMUBUFScratchOffset(SDNode *Parent,
                                SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
                                SDValue &Offset) const;

  bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset,
                         SDValue &Offset, SDValue &CPol, SDValue &TFE,
                         SDValue &SWZ) const;
  bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
                         SDValue &Offset) const;

  // FLAT / global / scratch addressing-mode selection.
  template <bool IsSigned>
  bool SelectFlatOffset(SDNode *N, SDValue Addr, SDValue &VAddr,
                        SDValue &Offset) const;
  bool SelectGlobalSAddr(SDNode *N, SDValue Addr, SDValue &SAddr,
                         SDValue &VOffset, SDValue &Offset) const;
  bool SelectScratchSAddr(SDNode *N, SDValue Addr, SDValue &SAddr,
                          SDValue &Offset) const;

  // SMRD (scalar memory read) addressing-mode selection.
  bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset,
                        bool &Imm) const;
  SDValue Expand32BitAddress(SDValue Addr) const;
  bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue &Offset,
                  bool &Imm) const;
  bool SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
  bool SelectSMRDImm32(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
  bool SelectSMRDSgpr(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
  bool SelectSMRDBufferImm(SDValue Addr, SDValue &Offset) const;
  bool SelectSMRDBufferImm32(SDValue Addr, SDValue &Offset) const;
  bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const;

  // VOP3 source-modifier (neg/abs/clamp/omod/opsel) selection.
  bool SelectVOP3Mods_NNaN(SDValue In, SDValue &Src, SDValue &SrcMods) const;
  bool SelectVOP3ModsImpl(SDValue In, SDValue &Src, unsigned &SrcMods,
                          bool AllowAbs = true) const;
  bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
  bool SelectVOP3BMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
  bool SelectVOP3NoMods(SDValue In, SDValue &Src) const;
  bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods,
                       SDValue &Clamp, SDValue &Omod) const;
  bool SelectVOP3BMods0(SDValue In, SDValue &Src, SDValue &SrcMods,
                        SDValue &Clamp, SDValue &Omod) const;
  bool SelectVOP3NoMods0(SDValue In, SDValue &Src, SDValue &SrcMods,
                         SDValue &Clamp, SDValue &Omod) const;

  bool SelectVOP3OMods(SDValue In, SDValue &Src,
                       SDValue &Clamp, SDValue &Omod) const;

  bool SelectVOP3PMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;

  bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const;

  bool SelectVOP3OpSelMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
  bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, unsigned &Mods) const;
  bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;

  SDValue getHi16Elt(SDValue In) const;

  SDValue getMaterializedScalarImm32(int64_t Val, const SDLoc &DL) const;

  // Custom selection for nodes with no (or awkward) tablegen patterns.
  void SelectADD_SUB_I64(SDNode *N);
  void SelectAddcSubb(SDNode *N);
  void SelectUADDO_USUBO(SDNode *N);
  void SelectDIV_SCALE(SDNode *N);
  void SelectMAD_64_32(SDNode *N);
  void SelectFMA_W_CHAIN(SDNode *N);
  void SelectFMUL_W_CHAIN(SDNode *N);

  SDNode *getS_BFE(unsigned Opcode, const SDLoc &DL, SDValue Val,
                   uint32_t Offset, uint32_t Width);
  void SelectS_BFEFromShifts(SDNode *N);
  void SelectS_BFE(SDNode *N);
  bool isCBranchSCC(const SDNode *N) const;
  void SelectBRCOND(SDNode *N);
  void SelectFMAD_FMA(SDNode *N);
  void SelectATOMIC_CMP_SWAP(SDNode *N);
  void SelectDSAppendConsume(SDNode *N, unsigned IntrID);
  void SelectDS_GWS(SDNode *N, unsigned IntrID);
  void SelectInterpP1F16(SDNode *N);
  void SelectINTRINSIC_W_CHAIN(SDNode *N);
  void SelectINTRINSIC_WO_CHAIN(SDNode *N);
  void SelectINTRINSIC_VOID(SDNode *N);

protected:
  // Include the pieces autogenerated from the target description.
#include "AMDGPUGenDAGISel.inc"
};
286 
/// R600-specific instruction selector. Reuses the AMDGPU selector
/// infrastructure but overrides addressing-mode selection and the top-level
/// Select entry point, and includes the R600 tablegen matcher.
class R600DAGToDAGISel : public AMDGPUDAGToDAGISel {
  // Shadows the base-class member with the R600 subtarget.
  const R600Subtarget *Subtarget;

  bool isConstantLoad(const MemSDNode *N, int cbID) const;
  bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr);
  bool SelectGlobalValueVariableOffset(SDValue Addr, SDValue &BaseReg,
                                       SDValue& Offset);
public:
  explicit R600DAGToDAGISel(TargetMachine *TM, CodeGenOpt::Level OptLevel) :
      AMDGPUDAGToDAGISel(TM, OptLevel) {}

  void Select(SDNode *N) override;

  bool SelectADDRIndirect(SDValue Addr, SDValue &Base,
                          SDValue &Offset) override;
  bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
                          SDValue &Offset) override;

  bool runOnMachineFunction(MachineFunction &MF) override;

  // R600 does not use the d16-load preprocessing done by the GCN selector.
  void PreprocessISelDAG() override {}

protected:
  // Include the pieces autogenerated from the target description.
#include "R600GenDAGISel.inc"
};
313 
314 static SDValue stripBitcast(SDValue Val) {
315   return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
316 }
317 
318 // Figure out if this is really an extract of the high 16-bits of a dword.
319 static bool isExtractHiElt(SDValue In, SDValue &Out) {
320   In = stripBitcast(In);
321 
322   if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
323     if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
324       if (!Idx->isOne())
325         return false;
326       Out = In.getOperand(0);
327       return true;
328     }
329   }
330 
331   if (In.getOpcode() != ISD::TRUNCATE)
332     return false;
333 
334   SDValue Srl = In.getOperand(0);
335   if (Srl.getOpcode() == ISD::SRL) {
336     if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
337       if (ShiftAmt->getZExtValue() == 16) {
338         Out = stripBitcast(Srl.getOperand(0));
339         return true;
340       }
341     }
342   }
343 
344   return false;
345 }
346 
347 // Look through operations that obscure just looking at the low 16-bits of the
348 // same register.
349 static SDValue stripExtractLoElt(SDValue In) {
350   if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
351     if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
352       if (Idx->isNullValue() && In.getValueSizeInBits() <= 32)
353         return In.getOperand(0);
354     }
355   }
356 
357   if (In.getOpcode() == ISD::TRUNCATE) {
358     SDValue Src = In.getOperand(0);
359     if (Src.getValueType().getSizeInBits() == 32)
360       return stripBitcast(Src);
361   }
362 
363   return In;
364 }
365 
366 }  // end anonymous namespace
367 
// Register the pass and its analysis dependencies with the legacy pass
// manager; the EXPENSIVE_CHECKS-only dependencies mirror getAnalysisUsage.
INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISel, "amdgpu-isel",
                      "AMDGPU DAG->DAG Pattern Instruction Selection", false, false)
INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo)
INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysis)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
#ifdef EXPENSIVE_CHECKS
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
#endif
INITIALIZE_PASS_END(AMDGPUDAGToDAGISel, "amdgpu-isel",
                    "AMDGPU DAG->DAG Pattern Instruction Selection", false, false)
379 
/// This pass converts a legalized DAG into an AMDGPU-specific
/// DAG, ready for instruction scheduling.
FunctionPass *llvm::createAMDGPUISelDag(TargetMachine *TM,
                                        CodeGenOpt::Level OptLevel) {
  return new AMDGPUDAGToDAGISel(TM, OptLevel);
}
386 
/// This pass converts a legalized DAG into an R600-specific
/// DAG, ready for instruction scheduling.
FunctionPass *llvm::createR600ISelDag(TargetMachine *TM,
                                      CodeGenOpt::Level OptLevel) {
  return new R600DAGToDAGISel(TM, OptLevel);
}
393 
394 bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
395 #ifdef EXPENSIVE_CHECKS
396   DominatorTree & DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
397   LoopInfo * LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
398   for (auto &L : LI->getLoopsInPreorder()) {
399     assert(L->isLCSSAForm(DT));
400   }
401 #endif
402   Subtarget = &MF.getSubtarget<GCNSubtarget>();
403   Mode = AMDGPU::SIModeRegisterDefaults(MF.getFunction());
404   return SelectionDAGISel::runOnMachineFunction(MF);
405 }
406 
/// Try to replace a v2i16/v2f16 BUILD_VECTOR whose hi or lo element comes
/// from a load with a single d16 load that writes only half of the 32-bit
/// register (LOAD_D16_HI*/LOAD_D16_LO*), tying the other half in.
/// Returns true and rewires all users if the combine fired.
bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
  assert(Subtarget->d16PreservesUnusedBits());
  MVT VT = N->getValueType(0).getSimpleVT();
  if (VT != MVT::v2i16 && VT != MVT::v2f16)
    return false;

  SDValue Lo = N->getOperand(0);
  SDValue Hi = N->getOperand(1);

  LoadSDNode *LdHi = dyn_cast<LoadSDNode>(stripBitcast(Hi));

  // build_vector lo, (load ptr) -> load_d16_hi ptr, lo
  // build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo
  // build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo

  // Need to check for possible indirect dependencies on the other half of the
  // vector to avoid introducing a cycle.
  if (LdHi && Hi.hasOneUse() && !LdHi->isPredecessorOf(Lo.getNode())) {
    SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);

    // The tied-in operand supplies the preserved (lo) half of the register.
    SDValue TiedIn = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Lo);
    SDValue Ops[] = {
      LdHi->getChain(), LdHi->getBasePtr(), TiedIn
    };

    // Pick the opcode matching the original load's memory width / extension.
    unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
    if (LdHi->getMemoryVT() == MVT::i8) {
      LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
        AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8;
    } else {
      assert(LdHi->getMemoryVT() == MVT::i16);
    }

    SDValue NewLoadHi =
      CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList,
                                  Ops, LdHi->getMemoryVT(),
                                  LdHi->getMemOperand());

    // Rewire both the vector value and the original load's chain users.
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadHi);
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdHi, 1), NewLoadHi.getValue(1));
    return true;
  }

  // build_vector (load ptr), hi -> load_d16_lo ptr, hi
  // build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi
  // build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi
  LoadSDNode *LdLo = dyn_cast<LoadSDNode>(stripBitcast(Lo));
  if (LdLo && Lo.hasOneUse()) {
    SDValue TiedIn = getHi16Elt(Hi);
    // Same cycle check as above, this time against the hi element's source.
    if (!TiedIn || LdLo->isPredecessorOf(TiedIn.getNode()))
      return false;

    SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
    unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
    if (LdLo->getMemoryVT() == MVT::i8) {
      LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
        AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8;
    } else {
      assert(LdLo->getMemoryVT() == MVT::i16);
    }

    TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, TiedIn);

    SDValue Ops[] = {
      LdLo->getChain(), LdLo->getBasePtr(), TiedIn
    };

    SDValue NewLoadLo =
      CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList,
                                  Ops, LdLo->getMemoryVT(),
                                  LdLo->getMemOperand());

    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadLo);
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdLo, 1), NewLoadLo.getValue(1));
    return true;
  }

  return false;
}
486 
487 void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
488   if (!Subtarget->d16PreservesUnusedBits())
489     return;
490 
491   SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
492 
493   bool MadeChange = false;
494   while (Position != CurDAG->allnodes_begin()) {
495     SDNode *N = &*--Position;
496     if (N->use_empty())
497       continue;
498 
499     switch (N->getOpcode()) {
500     case ISD::BUILD_VECTOR:
501       MadeChange |= matchLoadD16FromBuildVector(N);
502       break;
503     default:
504       break;
505     }
506   }
507 
508   if (MadeChange) {
509     CurDAG->RemoveDeadNodes();
510     LLVM_DEBUG(dbgs() << "After PreProcess:\n";
511                CurDAG->dump(););
512   }
513 }
514 
515 bool AMDGPUDAGToDAGISel::isNoNanSrc(SDValue N) const {
516   if (TM.Options.NoNaNsFPMath)
517     return true;
518 
519   // TODO: Move into isKnownNeverNaN
520   if (N->getFlags().hasNoNaNs())
521     return true;
522 
523   return CurDAG->isKnownNeverNaN(N);
524 }
525 
526 bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N,
527                                            bool Negated) const {
528   if (N->isUndef())
529     return true;
530 
531   const SIInstrInfo *TII = Subtarget->getInstrInfo();
532   if (Negated) {
533     if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
534       return TII->isInlineConstant(-C->getAPIntValue());
535 
536     if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
537       return TII->isInlineConstant(-C->getValueAPF().bitcastToAPInt());
538 
539   } else {
540     if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
541       return TII->isInlineConstant(C->getAPIntValue());
542 
543     if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
544       return TII->isInlineConstant(C->getValueAPF().bitcastToAPInt());
545   }
546 
547   return false;
548 }
549 
/// Determine the register class for \p OpNo
/// \returns The register class of the virtual register that will be used for
/// the given operand number \p OpNo or NULL if the register class cannot be
/// determined.
const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
                                                          unsigned OpNo) const {
  if (!N->isMachineOpcode()) {
    if (N->getOpcode() == ISD::CopyToReg) {
      // Operand 1 of CopyToReg is the destination register.
      Register Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
      if (Reg.isVirtual()) {
        MachineRegisterInfo &MRI = CurDAG->getMachineFunction().getRegInfo();
        return MRI.getRegClass(Reg);
      }

      // Physical register: derive the class from the register itself.
      const SIRegisterInfo *TRI
        = static_cast<const GCNSubtarget *>(Subtarget)->getRegisterInfo();
      return TRI->getPhysRegClass(Reg);
    }

    return nullptr;
  }

  switch (N->getMachineOpcode()) {
  default: {
    const MCInstrDesc &Desc =
        Subtarget->getInstrInfo()->get(N->getMachineOpcode());
    // SDNode operands do not include defs, but MCInstrDesc operands do;
    // skip past the defs to find the matching use operand.
    unsigned OpIdx = Desc.getNumDefs() + OpNo;
    if (OpIdx >= Desc.getNumOperands())
      return nullptr;
    int RegClass = Desc.OpInfo[OpIdx].RegClass;
    if (RegClass == -1)
      return nullptr;

    return Subtarget->getRegisterInfo()->getRegClass(RegClass);
  }
  case AMDGPU::REG_SEQUENCE: {
    // Operand 0 is the register class ID; value/subreg-index pairs follow,
    // so the subreg index paired with operand OpNo is at OpNo + 1.
    unsigned RCID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
    const TargetRegisterClass *SuperRC =
        Subtarget->getRegisterInfo()->getRegClass(RCID);

    SDValue SubRegOp = N->getOperand(OpNo + 1);
    unsigned SubRegIdx = cast<ConstantSDNode>(SubRegOp)->getZExtValue();
    return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
                                                              SubRegIdx);
  }
  }
}
597 
598 SDNode *AMDGPUDAGToDAGISel::glueCopyToOp(SDNode *N, SDValue NewChain,
599                                          SDValue Glue) const {
600   SmallVector <SDValue, 8> Ops;
601   Ops.push_back(NewChain); // Replace the chain.
602   for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
603     Ops.push_back(N->getOperand(i));
604 
605   Ops.push_back(Glue);
606   return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
607 }
608 
609 SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
610   const SITargetLowering& Lowering =
611     *static_cast<const SITargetLowering*>(getTargetLowering());
612 
613   assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain");
614 
615   SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N), Val);
616   return glueCopyToOp(N, M0, M0.getValue(1));
617 }
618 
619 SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
620   unsigned AS = cast<MemSDNode>(N)->getAddressSpace();
621   if (AS == AMDGPUAS::LOCAL_ADDRESS) {
622     if (Subtarget->ldsRequiresM0Init())
623       return glueCopyToM0(N, CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32));
624   } else if (AS == AMDGPUAS::REGION_ADDRESS) {
625     MachineFunction &MF = CurDAG->getMachineFunction();
626     unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize();
627     return
628         glueCopyToM0(N, CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i32));
629   }
630   return N;
631 }
632 
633 MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
634                                                   EVT VT) const {
635   SDNode *Lo = CurDAG->getMachineNode(
636       AMDGPU::S_MOV_B32, DL, MVT::i32,
637       CurDAG->getTargetConstant(Imm & 0xFFFFFFFF, DL, MVT::i32));
638   SDNode *Hi =
639       CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
640                              CurDAG->getTargetConstant(Imm >> 32, DL, MVT::i32));
641   const SDValue Ops[] = {
642       CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
643       SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
644       SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
645 
646   return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops);
647 }
648 
649 void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
650   EVT VT = N->getValueType(0);
651   unsigned NumVectorElts = VT.getVectorNumElements();
652   EVT EltVT = VT.getVectorElementType();
653   SDLoc DL(N);
654   SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
655 
656   if (NumVectorElts == 1) {
657     CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0),
658                          RegClass);
659     return;
660   }
661 
662   assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
663                                   "supported yet");
664   // 32 = Max Num Vector Elements
665   // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
666   // 1 = Vector Register Class
667   SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);
668 
669   bool IsGCN = CurDAG->getSubtarget().getTargetTriple().getArch() ==
670                Triple::amdgcn;
671   RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
672   bool IsRegSeq = true;
673   unsigned NOps = N->getNumOperands();
674   for (unsigned i = 0; i < NOps; i++) {
675     // XXX: Why is this here?
676     if (isa<RegisterSDNode>(N->getOperand(i))) {
677       IsRegSeq = false;
678       break;
679     }
680     unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
681                          : R600RegisterInfo::getSubRegFromChannel(i);
682     RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
683     RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32);
684   }
685   if (NOps != NumVectorElts) {
686     // Fill in the missing undef elements if this was a scalar_to_vector.
687     assert(N->getOpcode() == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
688     MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
689                                                    DL, EltVT);
690     for (unsigned i = NOps; i < NumVectorElts; ++i) {
691       unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
692                            : R600RegisterInfo::getSubRegFromChannel(i);
693       RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
694       RegSeqArgs[1 + (2 * i) + 1] =
695           CurDAG->getTargetConstant(Sub, DL, MVT::i32);
696     }
697   }
698 
699   if (!IsRegSeq)
700     SelectCode(N);
701   CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs);
702 }
703 
704 void AMDGPUDAGToDAGISel::Select(SDNode *N) {
705   unsigned int Opc = N->getOpcode();
706   if (N->isMachineOpcode()) {
707     N->setNodeId(-1);
708     return;   // Already selected.
709   }
710 
711   // isa<MemSDNode> almost works but is slightly too permissive for some DS
712   // intrinsics.
713   if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N) ||
714       (Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC ||
715        Opc == ISD::ATOMIC_LOAD_FADD ||
716        Opc == AMDGPUISD::ATOMIC_LOAD_FMIN ||
717        Opc == AMDGPUISD::ATOMIC_LOAD_FMAX)) {
718     N = glueCopyToM0LDSInit(N);
719     SelectCode(N);
720     return;
721   }
722 
723   switch (Opc) {
724   default:
725     break;
726   // We are selecting i64 ADD here instead of custom lower it during
727   // DAG legalization, so we can fold some i64 ADDs used for address
728   // calculation into the LOAD and STORE instructions.
729   case ISD::ADDC:
730   case ISD::ADDE:
731   case ISD::SUBC:
732   case ISD::SUBE: {
733     if (N->getValueType(0) != MVT::i64)
734       break;
735 
736     SelectADD_SUB_I64(N);
737     return;
738   }
739   case ISD::ADDCARRY:
740   case ISD::SUBCARRY:
741     if (N->getValueType(0) != MVT::i32)
742       break;
743 
744     SelectAddcSubb(N);
745     return;
746   case ISD::UADDO:
747   case ISD::USUBO: {
748     SelectUADDO_USUBO(N);
749     return;
750   }
751   case AMDGPUISD::FMUL_W_CHAIN: {
752     SelectFMUL_W_CHAIN(N);
753     return;
754   }
755   case AMDGPUISD::FMA_W_CHAIN: {
756     SelectFMA_W_CHAIN(N);
757     return;
758   }
759 
760   case ISD::SCALAR_TO_VECTOR:
761   case ISD::BUILD_VECTOR: {
762     EVT VT = N->getValueType(0);
763     unsigned NumVectorElts = VT.getVectorNumElements();
764     if (VT.getScalarSizeInBits() == 16) {
765       if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) {
766         if (SDNode *Packed = packConstantV2I16(N, *CurDAG)) {
767           ReplaceNode(N, Packed);
768           return;
769         }
770       }
771 
772       break;
773     }
774 
775     assert(VT.getVectorElementType().bitsEq(MVT::i32));
776     unsigned RegClassID =
777         SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32)->getID();
778     SelectBuildVector(N, RegClassID);
779     return;
780   }
781   case ISD::BUILD_PAIR: {
782     SDValue RC, SubReg0, SubReg1;
783     SDLoc DL(N);
784     if (N->getValueType(0) == MVT::i128) {
785       RC = CurDAG->getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32);
786       SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32);
787       SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32);
788     } else if (N->getValueType(0) == MVT::i64) {
789       RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32);
790       SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
791       SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
792     } else {
793       llvm_unreachable("Unhandled value type for BUILD_PAIR");
794     }
795     const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
796                             N->getOperand(1), SubReg1 };
797     ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
798                                           N->getValueType(0), Ops));
799     return;
800   }
801 
802   case ISD::Constant:
803   case ISD::ConstantFP: {
804     if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N))
805       break;
806 
807     uint64_t Imm;
808     if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N))
809       Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
810     else {
811       ConstantSDNode *C = cast<ConstantSDNode>(N);
812       Imm = C->getZExtValue();
813     }
814 
815     SDLoc DL(N);
816     ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));
817     return;
818   }
819   case AMDGPUISD::BFE_I32:
820   case AMDGPUISD::BFE_U32: {
821     // There is a scalar version available, but unlike the vector version which
822     // has a separate operand for the offset and width, the scalar version packs
823     // the width and offset into a single operand. Try to move to the scalar
824     // version if the offsets are constant, so that we can try to keep extended
825     // loads of kernel arguments in SGPRs.
826 
827     // TODO: Technically we could try to pattern match scalar bitshifts of
828     // dynamic values, but it's probably not useful.
829     ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
830     if (!Offset)
831       break;
832 
833     ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
834     if (!Width)
835       break;
836 
837     bool Signed = Opc == AMDGPUISD::BFE_I32;
838 
839     uint32_t OffsetVal = Offset->getZExtValue();
840     uint32_t WidthVal = Width->getZExtValue();
841 
842     ReplaceNode(N, getS_BFE(Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32,
843                             SDLoc(N), N->getOperand(0), OffsetVal, WidthVal));
844     return;
845   }
846   case AMDGPUISD::DIV_SCALE: {
847     SelectDIV_SCALE(N);
848     return;
849   }
850   case AMDGPUISD::MAD_I64_I32:
851   case AMDGPUISD::MAD_U64_U32: {
852     SelectMAD_64_32(N);
853     return;
854   }
855   case ISD::CopyToReg: {
856     const SITargetLowering& Lowering =
857       *static_cast<const SITargetLowering*>(getTargetLowering());
858     N = Lowering.legalizeTargetIndependentNode(N, *CurDAG);
859     break;
860   }
861   case ISD::AND:
862   case ISD::SRL:
863   case ISD::SRA:
864   case ISD::SIGN_EXTEND_INREG:
865     if (N->getValueType(0) != MVT::i32)
866       break;
867 
868     SelectS_BFE(N);
869     return;
870   case ISD::BRCOND:
871     SelectBRCOND(N);
872     return;
873   case ISD::FMAD:
874   case ISD::FMA:
875     SelectFMAD_FMA(N);
876     return;
877   case AMDGPUISD::ATOMIC_CMP_SWAP:
878     SelectATOMIC_CMP_SWAP(N);
879     return;
880   case AMDGPUISD::CVT_PKRTZ_F16_F32:
881   case AMDGPUISD::CVT_PKNORM_I16_F32:
882   case AMDGPUISD::CVT_PKNORM_U16_F32:
883   case AMDGPUISD::CVT_PK_U16_U32:
884   case AMDGPUISD::CVT_PK_I16_I32: {
885     // Hack around using a legal type if f16 is illegal.
886     if (N->getValueType(0) == MVT::i32) {
887       MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ? MVT::v2f16 : MVT::v2i16;
888       N = CurDAG->MorphNodeTo(N, N->getOpcode(), CurDAG->getVTList(NewVT),
889                               { N->getOperand(0), N->getOperand(1) });
890       SelectCode(N);
891       return;
892     }
893 
894     break;
895   }
896   case ISD::INTRINSIC_W_CHAIN: {
897     SelectINTRINSIC_W_CHAIN(N);
898     return;
899   }
900   case ISD::INTRINSIC_WO_CHAIN: {
901     SelectINTRINSIC_WO_CHAIN(N);
902     return;
903   }
904   case ISD::INTRINSIC_VOID: {
905     SelectINTRINSIC_VOID(N);
906     return;
907   }
908   }
909 
910   SelectCode(N);
911 }
912 
913 bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
914   const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
915   const Instruction *Term = BB->getTerminator();
916   return Term->getMetadata("amdgpu.uniform") ||
917          Term->getMetadata("structurizecfg.uniform");
918 }
919 
// Match an i64 address whose 64-bit `or` with a constant offset was split
// into 32-bit halves earlier, and recover the original base (N0) and the
// constant offset (N1). Returns false if the pattern does not match.
static bool getBaseWithOffsetUsingSplitOR(SelectionDAG &DAG, SDValue Addr,
                                          SDValue &N0, SDValue &N1) {
  if (Addr.getValueType() == MVT::i64 && Addr.getOpcode() == ISD::BITCAST &&
      Addr.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
    // As we split 64-bit `or` earlier, it's complicated pattern to match, i.e.
    // (i64 (bitcast (v2i32 (build_vector
    //                        (or (extract_vector_elt V, 0), OFFSET),
    //                        (extract_vector_elt V, 1)))))
    SDValue Lo = Addr.getOperand(0).getOperand(0);
    if (Lo.getOpcode() == ISD::OR && DAG.isBaseWithConstantOffset(Lo)) {
      SDValue BaseLo = Lo.getOperand(0);
      SDValue BaseHi = Addr.getOperand(0).getOperand(1);
      // Check that split base (Lo and Hi) are extracted from the same one.
      if (BaseLo.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
          BaseHi.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
          BaseLo.getOperand(0) == BaseHi.getOperand(0) &&
          // Lo is statically extracted from index 0.
          isa<ConstantSDNode>(BaseLo.getOperand(1)) &&
          BaseLo.getConstantOperandVal(1) == 0 &&
          // Hi is statically extracted from index 1.
          isa<ConstantSDNode>(BaseHi.getOperand(1)) &&
          BaseHi.getConstantOperandVal(1) == 1) {
        N0 = BaseLo.getOperand(0).getOperand(0);
        N1 = Lo.getOperand(1);
        return true;
      }
    }
  }
  return false;
}
950 
951 bool AMDGPUDAGToDAGISel::isBaseWithConstantOffset64(SDValue Addr, SDValue &LHS,
952                                                     SDValue &RHS) const {
953   if (CurDAG->isBaseWithConstantOffset(Addr)) {
954     LHS = Addr.getOperand(0);
955     RHS = Addr.getOperand(1);
956     return true;
957   }
958 
959   if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, LHS, RHS)) {
960     assert(LHS && RHS && isa<ConstantSDNode>(RHS));
961     return true;
962   }
963 
964   return false;
965 }
966 
// Human-readable pass name reported by the pass manager (e.g. in -debug-pass
// and timing output).
StringRef AMDGPUDAGToDAGISel::getPassName() const {
  return "AMDGPU DAG->DAG Pattern Instruction Selection";
}
970 
971 //===----------------------------------------------------------------------===//
972 // Complex Patterns
973 //===----------------------------------------------------------------------===//
974 
// Complex pattern stub: this selector never matches VTX_READ-style
// addressing, so any pattern using it falls through to other forms.
bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
                                            SDValue &Offset) {
  return false;
}
979 
980 bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
981                                             SDValue &Offset) {
982   ConstantSDNode *C;
983   SDLoc DL(Addr);
984 
985   if ((C = dyn_cast<ConstantSDNode>(Addr))) {
986     Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
987     Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
988   } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
989              (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
990     Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
991     Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
992   } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
993             (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
994     Base = Addr.getOperand(0);
995     Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
996   } else {
997     Base = Addr;
998     Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
999   }
1000 
1001   return true;
1002 }
1003 
1004 SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm32(int64_t Val,
1005                                                        const SDLoc &DL) const {
1006   SDNode *Mov = CurDAG->getMachineNode(
1007     AMDGPU::S_MOV_B32, DL, MVT::i32,
1008     CurDAG->getTargetConstant(Val, DL, MVT::i32));
1009   return SDValue(Mov, 0);
1010 }
1011 
// FIXME: Should only handle addcarry/subcarry
// Expand a 64-bit integer add/sub (optionally consuming and/or producing a
// carry) into two 32-bit halves chained through the carry bit, then rebuild
// the 64-bit result with a REG_SEQUENCE. Scalar or vector opcodes are picked
// from the node's divergence.
void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
  SDLoc DL(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  unsigned Opcode = N->getOpcode();
  // ADDE/SUBE read an incoming carry; those plus ADDC/SUBC publish one.
  bool ConsumeCarry = (Opcode == ISD::ADDE || Opcode == ISD::SUBE);
  bool ProduceCarry =
      ConsumeCarry || Opcode == ISD::ADDC || Opcode == ISD::SUBC;
  bool IsAdd = Opcode == ISD::ADD || Opcode == ISD::ADDC || Opcode == ISD::ADDE;

  SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
  SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);

  // Split each 64-bit operand into its low (sub0) and high (sub1) halves.
  SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, LHS, Sub0);
  SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, LHS, Sub1);

  SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, RHS, Sub0);
  SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                       DL, MVT::i32, RHS, Sub1);

  // Each half produces an i32 result plus a glue value carrying the carry.
  SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);

  // Opcode table indexed as [carry-consuming][divergent][is-add].
  static const unsigned OpcMap[2][2][2] = {
      {{AMDGPU::S_SUB_U32, AMDGPU::S_ADD_U32},
       {AMDGPU::V_SUB_CO_U32_e32, AMDGPU::V_ADD_CO_U32_e32}},
      {{AMDGPU::S_SUBB_U32, AMDGPU::S_ADDC_U32},
       {AMDGPU::V_SUBB_U32_e32, AMDGPU::V_ADDC_U32_e32}}};

  unsigned Opc = OpcMap[0][N->isDivergent()][IsAdd];
  unsigned CarryOpc = OpcMap[1][N->isDivergent()][IsAdd];

  // Low half: a plain op, or a carry-consuming op fed by N's carry-in.
  SDNode *AddLo;
  if (!ConsumeCarry) {
    SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) };
    AddLo = CurDAG->getMachineNode(Opc, DL, VTList, Args);
  } else {
    SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0), N->getOperand(2) };
    AddLo = CurDAG->getMachineNode(CarryOpc, DL, VTList, Args);
  }
  // High half always consumes the low half's carry (glue result 1).
  SDValue AddHiArgs[] = {
    SDValue(Hi0, 0),
    SDValue(Hi1, 0),
    SDValue(AddLo, 1)
  };
  SDNode *AddHi = CurDAG->getMachineNode(CarryOpc, DL, VTList, AddHiArgs);

  // Reassemble the two halves into a 64-bit scalar register pair.
  SDValue RegSequenceArgs[] = {
    CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
    SDValue(AddLo,0),
    Sub0,
    SDValue(AddHi,0),
    Sub1,
  };
  SDNode *RegSequence = CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
                                               MVT::i64, RegSequenceArgs);

  if (ProduceCarry) {
    // Replace the carry-use
    ReplaceUses(SDValue(N, 1), SDValue(AddHi, 1));
  }

  // Replace the remaining uses.
  ReplaceNode(N, RegSequence);
}
1081 
1082 void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) {
1083   SDLoc DL(N);
1084   SDValue LHS = N->getOperand(0);
1085   SDValue RHS = N->getOperand(1);
1086   SDValue CI = N->getOperand(2);
1087 
1088   if (N->isDivergent()) {
1089     unsigned Opc = N->getOpcode() == ISD::ADDCARRY ? AMDGPU::V_ADDC_U32_e64
1090                                                    : AMDGPU::V_SUBB_U32_e64;
1091     CurDAG->SelectNodeTo(
1092         N, Opc, N->getVTList(),
1093         {LHS, RHS, CI,
1094          CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
1095   } else {
1096     unsigned Opc = N->getOpcode() == ISD::ADDCARRY ? AMDGPU::S_ADD_CO_PSEUDO
1097                                                    : AMDGPU::S_SUB_CO_PSEUDO;
1098     CurDAG->SelectNodeTo(N, Opc, N->getVTList(), {LHS, RHS, CI});
1099   }
1100 }
1101 
1102 void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
1103   // The name of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned
1104   // carry out despite the _i32 name. These were renamed in VI to _U32.
1105   // FIXME: We should probably rename the opcodes here.
1106   bool IsAdd = N->getOpcode() == ISD::UADDO;
1107   bool IsVALU = N->isDivergent();
1108 
1109   for (SDNode::use_iterator UI = N->use_begin(), E = N->use_end(); UI != E;
1110        ++UI)
1111     if (UI.getUse().getResNo() == 1) {
1112       if ((IsAdd && (UI->getOpcode() != ISD::ADDCARRY)) ||
1113           (!IsAdd && (UI->getOpcode() != ISD::SUBCARRY))) {
1114         IsVALU = true;
1115         break;
1116       }
1117     }
1118 
1119   if (IsVALU) {
1120     unsigned Opc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
1121 
1122     CurDAG->SelectNodeTo(
1123         N, Opc, N->getVTList(),
1124         {N->getOperand(0), N->getOperand(1),
1125          CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
1126   } else {
1127     unsigned Opc = N->getOpcode() == ISD::UADDO ? AMDGPU::S_UADDO_PSEUDO
1128                                                 : AMDGPU::S_USUBO_PSEUDO;
1129 
1130     CurDAG->SelectNodeTo(N, Opc, N->getVTList(),
1131                          {N->getOperand(0), N->getOperand(1)});
1132   }
1133 }
1134 
// Select FMA_W_CHAIN to V_FMA_F32_e64, expanding source modifiers.
void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
  SDLoc SL(N);
  // Operand layout built below:
  //   [0]/[1] src0_modifiers/src0, [2]/[3] src1_modifiers/src1,
  //   [4]/[5] src2_modifiers/src2, [6] clamp, [7] omod
  //   (clamp/omod filled in by SelectVOP3Mods0),
  //   [8] = N's operand 0, [9] = N's operand 4
  //   (NOTE(review): presumably the chain and glue inputs — confirm).
  SDValue Ops[10];

  SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]);
  SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
  SelectVOP3Mods(N->getOperand(3), Ops[5], Ops[4]);
  Ops[8] = N->getOperand(0);
  Ops[9] = N->getOperand(4);

  CurDAG->SelectNodeTo(N, AMDGPU::V_FMA_F32_e64, N->getVTList(), Ops);
}
1148 
// Select FMUL_W_CHAIN to V_MUL_F32_e64, expanding source modifiers.
void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
  SDLoc SL(N);
  // Operand layout built below:
  //   [0]/[1] src0_modifiers/src0, [2]/[3] src1_modifiers/src1,
  //   [4] clamp, [5] omod (filled in by SelectVOP3Mods0),
  //   [6] = N's operand 0, [7] = N's operand 3
  //   (NOTE(review): presumably the chain and glue inputs — confirm).
  SDValue Ops[8];

  SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]);
  SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
  Ops[6] = N->getOperand(0);
  Ops[7] = N->getOperand(3);

  CurDAG->SelectNodeTo(N, AMDGPU::V_MUL_F32_e64, N->getVTList(), Ops);
}
1161 
// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
// Selects DIV_SCALE (f32 or f64) to the corresponding V_DIV_SCALE_*_e64,
// expanding VOP3B source modifiers for all three sources.
void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
  SDLoc SL(N);
  EVT VT = N->getValueType(0);

  assert(VT == MVT::f32 || VT == MVT::f64);

  unsigned Opc
    = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64_e64 : AMDGPU::V_DIV_SCALE_F32_e64;

  // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
  // omod
  SDValue Ops[8];
  SelectVOP3BMods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
  SelectVOP3BMods(N->getOperand(1), Ops[3], Ops[2]);
  SelectVOP3BMods(N->getOperand(2), Ops[5], Ops[4]);
  CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}
1181 
1182 // We need to handle this here because tablegen doesn't support matching
1183 // instructions with multiple outputs.
1184 void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
1185   SDLoc SL(N);
1186   bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
1187   unsigned Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
1188 
1189   SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
1190   SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
1191                     Clamp };
1192   CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
1193 }
1194 
1195 bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset) const {
1196   if (!isUInt<16>(Offset))
1197     return false;
1198 
1199   if (!Base || Subtarget->hasUsableDSOffset() ||
1200       Subtarget->unsafeDSOffsetFoldingEnabled())
1201     return true;
1202 
1203   // On Southern Islands instruction with a negative base value and an offset
1204   // don't seem to work.
1205   return CurDAG->SignBitIsZero(Base);
1206 }
1207 
// Split a DS address into a VGPR base and a 16-bit immediate offset.
// Always succeeds; unmatched forms fall back to (Addr, 0).
bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
                                              SDValue &Offset) const {
  SDLoc DL(Addr);
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
    if (isDSOffsetLegal(N0, C1->getSExtValue())) {
      // (add n0, c0)
      Base = N0;
      Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
      return true;
    }
  } else if (Addr.getOpcode() == ISD::SUB) {
    // sub C, x -> add (sub 0, x), C
    if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
      int64_t ByteOffset = C->getSExtValue();
      if (isDSOffsetLegal(SDValue(), ByteOffset)) {
        SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);

        // XXX - This is kind of hacky. Create a dummy sub node so we can check
        // the known bits in isDSOffsetLegal. We need to emit the selected node
        // here, so this is thrown away.
        SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
                                      Zero, Addr.getOperand(1));

        if (isDSOffsetLegal(Sub, ByteOffset)) {
          // Emit the negation (0 - x) as the actual base.
          SmallVector<SDValue, 3> Opnds;
          Opnds.push_back(Zero);
          Opnds.push_back(Addr.getOperand(1));

          // FIXME: Select to VOP3 version for with-carry.
          unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
          if (Subtarget->hasAddNoCarry()) {
            // No-carry subtargets use the e64 form, which takes a clamp bit.
            SubOp = AMDGPU::V_SUB_U32_e64;
            Opnds.push_back(
                CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
          }

          MachineSDNode *MachineSub =
              CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds);

          Base = SDValue(MachineSub, 0);
          Offset = CurDAG->getTargetConstant(ByteOffset, DL, MVT::i16);
          return true;
        }
      }
    }
  } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    // If we have a constant address, prefer to put the constant into the
    // offset. This can save moves to load the constant address since multiple
    // operations can share the zero base address register, and enables merging
    // into read2 / write2 instructions.

    SDLoc DL(Addr);

    if (isDSOffsetLegal(SDValue(), CAddr->getZExtValue())) {
      SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
      MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
                                 DL, MVT::i32, Zero);
      Base = SDValue(MovZero, 0);
      Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
      return true;
    }
  }

  // default case
  Base = Addr;
  Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i16);
  return true;
}
1279 
1280 bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
1281                                           unsigned Offset1,
1282                                           unsigned Size) const {
1283   if (Offset0 % Size != 0 || Offset1 % Size != 0)
1284     return false;
1285   if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
1286     return false;
1287 
1288   if (!Base || Subtarget->hasUsableDSOffset() ||
1289       Subtarget->unsafeDSOffsetFoldingEnabled())
1290     return true;
1291 
1292   // On Southern Islands instruction with a negative base value and an offset
1293   // don't seem to work.
1294   return CurDAG->SignBitIsZero(Base);
1295 }
1296 
// TODO: If offset is too big, put low 16-bit into offset.
// 64-bit DS read2/write2 addressing: two 4-byte elements.
bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
                                                   SDValue &Offset0,
                                                   SDValue &Offset1) const {
  return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 4);
}
1303 
// 128-bit DS read2/write2 addressing: two 8-byte elements.
bool AMDGPUDAGToDAGISel::SelectDS128Bit8ByteAligned(SDValue Addr, SDValue &Base,
                                                    SDValue &Offset0,
                                                    SDValue &Offset1) const {
  return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 8);
}
1309 
// Split an address for DS read2/write2 into a base and two element-scaled
// 8-bit offsets (offset0, offset1 = offset0 + 1). Size is the element size
// in bytes (4 or 8 from the callers above). Always succeeds via the
// fallback (Addr, 0, 1).
bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
                                            SDValue &Offset0, SDValue &Offset1,
                                            unsigned Size) const {
  SDLoc DL(Addr);

  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
    unsigned OffsetValue0 = C1->getZExtValue();
    // The second element sits one Size stride past the first.
    unsigned OffsetValue1 = OffsetValue0 + Size;

    // (add n0, c0)
    if (isDSOffset2Legal(N0, OffsetValue0, OffsetValue1, Size)) {
      Base = N0;
      Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
      Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
      return true;
    }
  } else if (Addr.getOpcode() == ISD::SUB) {
    // sub C, x -> add (sub 0, x), C
    if (const ConstantSDNode *C =
            dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
      unsigned OffsetValue0 = C->getZExtValue();
      unsigned OffsetValue1 = OffsetValue0 + Size;

      if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
        SDLoc DL(Addr);
        SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);

        // XXX - This is kind of hacky. Create a dummy sub node so we can check
        // the known bits in isDSOffsetLegal. We need to emit the selected node
        // here, so this is thrown away.
        SDValue Sub =
            CurDAG->getNode(ISD::SUB, DL, MVT::i32, Zero, Addr.getOperand(1));

        if (isDSOffset2Legal(Sub, OffsetValue0, OffsetValue1, Size)) {
          SmallVector<SDValue, 3> Opnds;
          Opnds.push_back(Zero);
          Opnds.push_back(Addr.getOperand(1));
          unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
          if (Subtarget->hasAddNoCarry()) {
            // The e64 form takes an explicit clamp bit.
            SubOp = AMDGPU::V_SUB_U32_e64;
            Opnds.push_back(
                CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
          }

          // NOTE(review): the result type here is Size * 8 bits (i64 when
          // Size == 8), but the operands are 32-bit address halves and the
          // one-operand path above uses MVT::i32 — confirm this shouldn't
          // simply be MVT::i32.
          MachineSDNode *MachineSub = CurDAG->getMachineNode(
              SubOp, DL, MVT::getIntegerVT(Size * 8), Opnds);

          Base = SDValue(MachineSub, 0);
          Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
          Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
          return true;
        }
      }
    }
  } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    // Constant address: fold it entirely into the offsets over a zero base,
    // so multiple accesses can share the base register.
    unsigned OffsetValue0 = CAddr->getZExtValue();
    unsigned OffsetValue1 = OffsetValue0 + Size;

    if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
      SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
      MachineSDNode *MovZero =
          CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero);
      Base = SDValue(MovZero, 0);
      Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
      Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
      return true;
    }
  }

  // default case

  Base = Addr;
  Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i8);
  Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i8);
  return true;
}
1389 
// Decompose an address into the full set of MUBUF operands: resource
// pointer, VGPR address, scalar offset, immediate offset, and the
// offen/idxen/addr64 mode bits (plus cleared cpol/tfe/swz defaults).
// Divergent components go to VAddr, uniform ones to Ptr.
bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr,
                                     SDValue &SOffset, SDValue &Offset,
                                     SDValue &Offen, SDValue &Idxen,
                                     SDValue &Addr64, SDValue &CPol,
                                     SDValue &TFE, SDValue &SWZ) const {
  // Subtarget prefers to use flat instruction
  // FIXME: This should be a pattern predicate and not reach here
  if (Subtarget->useFlatForGlobal())
    return false;

  SDLoc DL(Addr);

  // Default all mode bits and auxiliary operands to zero.
  if (!CPol)
    CPol = CurDAG->getTargetConstant(0, DL, MVT::i32);
  TFE = CurDAG->getTargetConstant(0, DL, MVT::i1);
  SWZ = CurDAG->getTargetConstant(0, DL, MVT::i1);

  Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Offen = CurDAG->getTargetConstant(0, DL, MVT::i1);
  Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1);
  SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);

  // Peel off a constant offset (if it fits in 32 bits) before classifying
  // the remaining base N0.
  ConstantSDNode *C1 = nullptr;
  SDValue N0 = Addr;
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    C1 = cast<ConstantSDNode>(Addr.getOperand(1));
    if (isUInt<32>(C1->getZExtValue()))
      N0 = Addr.getOperand(0);
    else
      C1 = nullptr;
  }

  if (N0.getOpcode() == ISD::ADD) {
    // (add N2, N3) -> addr64, or
    // (add (add N2, N3), C1) -> addr64
    SDValue N2 = N0.getOperand(0);
    SDValue N3 = N0.getOperand(1);
    Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);

    if (N2->isDivergent()) {
      if (N3->isDivergent()) {
        // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
        // addr64, and construct the resource from a 0 address.
        Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
        VAddr = N0;
      } else {
        // N2 is divergent, N3 is not.
        Ptr = N3;
        VAddr = N2;
      }
    } else {
      // N2 is not divergent.
      Ptr = N2;
      VAddr = N3;
    }
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
  } else if (N0->isDivergent()) {
    // N0 is divergent. Use it as the addr64, and construct the resource from a
    // 0 address.
    Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
    VAddr = N0;
    Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
  } else {
    // N0 -> offset, or
    // (N0 + C1) -> offset
    VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
    Ptr = N0;
  }

  if (!C1) {
    // No offset.
    Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
    return true;
  }

  if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue())) {
    // Legal offset for instruction.
    Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
    return true;
  }

  // Illegal offset, store it in soffset.
  Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
  SOffset =
      SDValue(CurDAG->getMachineNode(
                  AMDGPU::S_MOV_B32, DL, MVT::i32,
                  CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
              0);
  return true;
}
1480 
// MUBUF addr64 addressing: decompose Addr via SelectMUBUF and, when the
// addr64 mode bit was set, wrap the pointer into a full 128-bit resource.
// Fails on subtargets without addr64 or when addr64 mode wasn't selected.
bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
                                           SDValue &VAddr, SDValue &SOffset,
                                           SDValue &Offset, SDValue &CPol,
                                           SDValue &TFE, SDValue &SWZ) const {
  SDValue Ptr, Offen, Idxen, Addr64;

  // addr64 bit was removed for volcanic islands.
  // FIXME: This should be a pattern predicate and not reach here
  if (!Subtarget->hasAddr64())
    return false;

  if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
                   CPol, TFE, SWZ))
    return false;

  // Only succeed if SelectMUBUF actually chose addr64 mode.
  ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
  if (C->getSExtValue()) {
    SDLoc DL(Addr);

    const SITargetLowering& Lowering =
      *static_cast<const SITargetLowering*>(getTargetLowering());

    SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0);
    return true;
  }

  return false;
}
1509 
1510 bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
1511                                            SDValue &VAddr, SDValue &SOffset,
1512                                            SDValue &Offset) const {
1513   SDValue CPol, TFE, SWZ;
1514 
1515   return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, CPol, TFE, SWZ);
1516 }
1517 
1518 static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) {
1519   auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>();
1520   return PSV && PSV->isStack();
1521 }
1522 
1523 std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
1524   SDLoc DL(N);
1525 
1526   auto *FI = dyn_cast<FrameIndexSDNode>(N);
1527   SDValue TFI =
1528       FI ? CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)) : N;
1529 
1530   // We rebase the base address into an absolute stack address and hence
1531   // use constant 0 for soffset. This value must be retained until
1532   // frame elimination and eliminateFrameIndex will choose the appropriate
1533   // frame register if need be.
1534   return std::make_pair(TFI, CurDAG->getTargetConstant(0, DL, MVT::i32));
1535 }
1536 
// MUBUF scratch (private) addressing with a VGPR offset: produce the
// scratch resource, a VGPR address, a scalar offset, and a 12-bit immediate
// offset. Always succeeds except through the constant-address null-pointer
// path, which falls through to the generic case at the end.
bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
                                                 SDValue Addr, SDValue &Rsrc,
                                                 SDValue &VAddr, SDValue &SOffset,
                                                 SDValue &ImmOffset) const {

  SDLoc DL(Addr);
  MachineFunction &MF = CurDAG->getMachineFunction();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);

  if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
    int64_t Imm = CAddr->getSExtValue();
    const int64_t NullPtr =
        AMDGPUTargetMachine::getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS);
    // Don't fold null pointer.
    if (Imm != NullPtr) {
      // Split the constant: bits above the 12-bit immediate field go into a
      // materialized VGPR, the low 12 bits into the immediate offset.
      SDValue HighBits = CurDAG->getTargetConstant(Imm & ~4095, DL, MVT::i32);
      MachineSDNode *MovHighBits = CurDAG->getMachineNode(
        AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits);
      VAddr = SDValue(MovHighBits, 0);

      // In a call sequence, stores to the argument stack area are relative to the
      // stack pointer.
      const MachinePointerInfo &PtrInfo
        = cast<MemSDNode>(Parent)->getPointerInfo();
      SOffset = isStackPtrRelative(PtrInfo)
        ? CurDAG->getRegister(Info->getStackPtrOffsetReg(), MVT::i32)
        : CurDAG->getTargetConstant(0, DL, MVT::i32);
      ImmOffset = CurDAG->getTargetConstant(Imm & 4095, DL, MVT::i16);
      return true;
    }
  }

  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    // (add n0, c1)

    SDValue N0 = Addr.getOperand(0);
    SDValue N1 = Addr.getOperand(1);

    // Offsets in vaddr must be positive if range checking is enabled.
    //
    // The total computation of vaddr + soffset + offset must not overflow.  If
    // vaddr is negative, even if offset is 0 the sgpr offset add will end up
    // overflowing.
    //
    // Prior to gfx9, MUBUF instructions with the vaddr offset enabled would
    // always perform a range check. If a negative vaddr base index was used,
    // this would fail the range check. The overall address computation would
    // compute a valid address, but this doesn't happen due to the range
    // check. For out-of-bounds MUBUF loads, a 0 is returned.
    //
    // Therefore it should be safe to fold any VGPR offset on gfx9 into the
    // MUBUF vaddr, but not on older subtargets which can only do this if the
    // sign bit is known 0.
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
    if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue()) &&
        (!Subtarget->privateMemoryResourceIsRangeChecked() ||
         CurDAG->SignBitIsZero(N0))) {
      std::tie(VAddr, SOffset) = foldFrameIndex(N0);
      ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
      return true;
    }
  }

  // (node)
  std::tie(VAddr, SOffset) = foldFrameIndex(Addr);
  ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i16);
  return true;
}
1607 
1608 bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
1609                                                   SDValue Addr,
1610                                                   SDValue &SRsrc,
1611                                                   SDValue &SOffset,
1612                                                   SDValue &Offset) const {
1613   ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr);
1614   if (!CAddr || !SIInstrInfo::isLegalMUBUFImmOffset(CAddr->getZExtValue()))
1615     return false;
1616 
1617   SDLoc DL(Addr);
1618   MachineFunction &MF = CurDAG->getMachineFunction();
1619   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1620 
1621   SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1622 
1623   const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Parent)->getPointerInfo();
1624 
1625   // FIXME: Get from MachinePointerInfo? We should only be using the frame
1626   // offset if we know this is in a call sequence.
1627   SOffset = isStackPtrRelative(PtrInfo)
1628                 ? CurDAG->getRegister(Info->getStackPtrOffsetReg(), MVT::i32)
1629                 : CurDAG->getTargetConstant(0, DL, MVT::i32);
1630 
1631   Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
1632   return true;
1633 }
1634 
1635 bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
1636                                            SDValue &SOffset, SDValue &Offset,
1637                                            SDValue &CPol, SDValue &TFE,
1638                                            SDValue &SWZ) const {
1639   SDValue Ptr, VAddr, Offen, Idxen, Addr64;
1640   const SIInstrInfo *TII =
1641     static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
1642 
1643   if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
1644                    CPol, TFE, SWZ))
1645     return false;
1646 
1647   if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
1648       !cast<ConstantSDNode>(Idxen)->getSExtValue() &&
1649       !cast<ConstantSDNode>(Addr64)->getSExtValue()) {
1650     uint64_t Rsrc = TII->getDefaultRsrcDataFormat() |
1651                     APInt::getAllOnesValue(32).getZExtValue(); // Size
1652     SDLoc DL(Addr);
1653 
1654     const SITargetLowering& Lowering =
1655       *static_cast<const SITargetLowering*>(getTargetLowering());
1656 
1657     SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0);
1658     return true;
1659   }
1660   return false;
1661 }
1662 
1663 bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
1664                                            SDValue &Soffset, SDValue &Offset
1665                                            ) const {
1666   SDValue CPol, TFE, SWZ;
1667 
1668   return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, CPol, TFE, SWZ);
1669 }
1670 
1671 // Find a load or store from corresponding pattern root.
1672 // Roots may be build_vector, bitconvert or their combinations.
1673 static MemSDNode* findMemSDNode(SDNode *N) {
1674   N = AMDGPUTargetLowering::stripBitcast(SDValue(N,0)).getNode();
1675   if (MemSDNode *MN = dyn_cast<MemSDNode>(N))
1676     return MN;
1677   assert(isa<BuildVectorSDNode>(N));
1678   for (SDValue V : N->op_values())
1679     if (MemSDNode *MN =
1680           dyn_cast<MemSDNode>(AMDGPUTargetLowering::stripBitcast(V)))
1681       return MN;
1682   llvm_unreachable("cannot find MemSDNode in the pattern!");
1683 }
1684 
// Match a FLAT-family address as (vaddr + immediate offset). This matcher
// always succeeds: when no constant offset can be folded, the whole address
// is placed in VAddr and Offset is set to 0.
template <bool IsSigned>
bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N,
                                          SDValue Addr,
                                          SDValue &VAddr,
                                          SDValue &Offset) const {
  int64_t OffsetVal = 0;

  unsigned AS = findMemSDNode(N)->getAddressSpace();

  // Offsets are only folded when the subtarget supports flat instruction
  // offsets, and, for generic FLAT addresses, only when it does not suffer
  // from the flat segment-offset hardware bug.
  if (Subtarget->hasFlatInstOffsets() &&
      (!Subtarget->hasFlatSegmentOffsetBug() ||
       AS != AMDGPUAS::FLAT_ADDRESS)) {
    SDValue N0, N1;
    if (isBaseWithConstantOffset64(Addr, N0, N1)) {
      uint64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();

      const SIInstrInfo *TII = Subtarget->getInstrInfo();
      if (TII->isLegalFLATOffset(COffsetVal, AS, IsSigned)) {
        // The whole constant fits into the instruction's offset field.
        Addr = N0;
        OffsetVal = COffsetVal;
      } else {
        // If the offset doesn't fit, put the low bits into the offset field and
        // add the rest.
        //
        // For a FLAT instruction the hardware decides whether to access
        // global/scratch/shared memory based on the high bits of vaddr,
        // ignoring the offset field, so we have to ensure that when we add
        // remainder to vaddr it still points into the same underlying object.
        // The easiest way to do that is to make sure that we split the offset
        // into two pieces that are both >= 0 or both <= 0.

        SDLoc DL(N);
        uint64_t RemainderOffset;

        std::tie(OffsetVal, RemainderOffset)
          = TII->splitFlatOffset(COffsetVal, AS, IsSigned);

        SDValue AddOffsetLo =
            getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
        SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);

        if (Addr.getValueType().getSizeInBits() == 32) {
          // 32-bit address: a single VALU add folds in the remainder.
          SmallVector<SDValue, 3> Opnds;
          Opnds.push_back(N0);
          Opnds.push_back(AddOffsetLo);
          unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
          if (Subtarget->hasAddNoCarry()) {
            // No-carry subtargets use the e64 add form with a clamp operand.
            AddOp = AMDGPU::V_ADD_U32_e64;
            Opnds.push_back(Clamp);
          }
          Addr = SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
        } else {
          // TODO: Should this try to use a scalar add pseudo if the base address
          // is uniform and saddr is usable?
          // 64-bit address: split the base into its two 32-bit halves, add the
          // remainder with an add/addc pair, and reassemble the result into a
          // 64-bit register with REG_SEQUENCE.
          SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
          SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);

          SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                                DL, MVT::i32, N0, Sub0);
          SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
                                                DL, MVT::i32, N0, Sub1);

          SDValue AddOffsetHi =
              getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);

          SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);

          SDNode *Add =
              CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
                                     {AddOffsetLo, SDValue(N0Lo, 0), Clamp});

          // The high add consumes the carry-out (result 1) of the low add.
          SDNode *Addc = CurDAG->getMachineNode(
              AMDGPU::V_ADDC_U32_e64, DL, VTs,
              {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});

          SDValue RegSequenceArgs[] = {
              CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
              SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};

          Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
                                                MVT::i64, RegSequenceArgs),
                         0);
        }
      }
    }
  }

  VAddr = Addr;
  Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i16);
  return true;
}
1776 
1777 // If this matches zero_extend i32:x, return x
1778 static SDValue matchZExtFromI32(SDValue Op) {
1779   if (Op.getOpcode() != ISD::ZERO_EXTEND)
1780     return SDValue();
1781 
1782   SDValue ExtSrc = Op.getOperand(0);
1783   return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue();
1784 }
1785 
// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
// On success SAddr holds the scalar base, VOffset a 32-bit VGPR offset, and
// Offset the immediate offset field.
bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
                                           SDValue Addr,
                                           SDValue &SAddr,
                                           SDValue &VOffset,
                                           SDValue &Offset) const {
  int64_t ImmOffset = 0;

  // Match the immediate offset first, which canonically is moved as low as
  // possible.

  SDValue LHS, RHS;
  if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
    int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
    const SIInstrInfo *TII = Subtarget->getInstrInfo();

    if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, true)) {
      // The constant fits in the immediate field; keep matching the rest of
      // the address below.
      Addr = LHS;
      ImmOffset = COffsetVal;
    } else if (!LHS->isDivergent() && COffsetVal > 0) {
      SDLoc SL(N);
      // saddr + large_offset -> saddr + (voffset = large_offset & ~MaxOffset) +
      //                         (large_offset & MaxOffset);
      int64_t SplitImmOffset, RemainderOffset;
      std::tie(SplitImmOffset, RemainderOffset)
        = TII->splitFlatOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, true);

      if (isUInt<32>(RemainderOffset)) {
        // Materialize the remainder into a VGPR so it can serve as voffset.
        SDNode *VMov = CurDAG->getMachineNode(
          AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
          CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
        VOffset = SDValue(VMov, 0);
        SAddr = LHS;
        Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i16);
        return true;
      }
    }
  }

  // Match the variable offset.
  if (Addr.getOpcode() != ISD::ADD) {
    // A uniform, non-undef, non-constant address can be used as saddr with a
    // zero voffset.
    if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF ||
        isa<ConstantSDNode>(Addr))
      return false;

    // It's cheaper to materialize a single 32-bit zero for vaddr than the two
    // moves required to copy a 64-bit SGPR to VGPR.
    SAddr = Addr;
    SDNode *VMov = CurDAG->getMachineNode(
      AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32,
      CurDAG->getTargetConstant(0, SDLoc(), MVT::i32));
    VOffset = SDValue(VMov, 0);
    Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
    return true;
  }

  // The address is an add; try both operand orders for the
  // (sgpr base, zext vgpr) pattern.
  LHS = Addr.getOperand(0);
  RHS = Addr.getOperand(1);

  if (!LHS->isDivergent()) {
    // add (i64 sgpr), (zero_extend (i32 vgpr))
    if (SDValue ZextRHS = matchZExtFromI32(RHS)) {
      SAddr = LHS;
      VOffset = ZextRHS;
    }
  }

  if (!SAddr && !RHS->isDivergent()) {
    // add (zero_extend (i32 vgpr)), (i64 sgpr)
    if (SDValue ZextLHS = matchZExtFromI32(LHS)) {
      SAddr = RHS;
      VOffset = ZextLHS;
    }
  }

  // Neither orientation matched.
  if (!SAddr)
    return false;

  Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
  return true;
}
1867 
// Match (32-bit SGPR base) + sext(imm offset)
// Fails for divergent addresses; otherwise always succeeds, materializing
// scalar adds as needed for frame indices and oversized offsets.
bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *N,
                                            SDValue Addr,
                                            SDValue &SAddr,
                                            SDValue &Offset) const {
  if (Addr->isDivergent())
    return false;

  SAddr = Addr;
  int64_t COffsetVal = 0;

  // Peel off a constant offset if present.
  if (CurDAG->isBaseWithConstantOffset(Addr)) {
    COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
    SAddr = Addr.getOperand(0);
  }

  if (auto FI = dyn_cast<FrameIndexSDNode>(SAddr)) {
    // A bare frame index becomes a target frame index operand.
    SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
  } else if (SAddr.getOpcode() == ISD::ADD &&
             isa<FrameIndexSDNode>(SAddr.getOperand(0))) {
    // Materialize this into a scalar move for scalar address to avoid
    // readfirstlane.
    auto FI = cast<FrameIndexSDNode>(SAddr.getOperand(0));
    SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
                                              FI->getValueType(0));
    SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_U32, SDLoc(SAddr),
                                           MVT::i32, TFI, SAddr.getOperand(1)),
                    0);
  }

  const SIInstrInfo *TII = Subtarget->getInstrInfo();

  if (!TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true)) {
    // The offset does not fit in the immediate field: keep only the low part
    // and fold the remainder into the base with a scalar add.
    const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(*Subtarget, true);
    // Use signed division by a power of two to truncate towards 0.
    int64_t D = 1LL << (NumBits - 1);
    int64_t RemainderOffset = (COffsetVal / D) * D;
    int64_t ImmField = COffsetVal - RemainderOffset;

    assert(TII->isLegalFLATOffset(ImmField, AMDGPUAS::PRIVATE_ADDRESS, true));
    assert(RemainderOffset + ImmField == COffsetVal);

    COffsetVal = ImmField;

    SDLoc DL(N);
    SDValue AddOffset =
        getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
    SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_U32, DL, MVT::i32,
                                           SAddr, AddOffset), 0);
  }

  Offset = CurDAG->getTargetConstant(COffsetVal, SDLoc(), MVT::i16);

  return true;
}
1923 
// Match an SMRD byte-offset operand. On success, Offset holds either an
// encoded immediate (Imm set to true) or a 32-bit register/literal value
// (Imm set to false, or left unchanged for the literal case — see below).
bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
                                          SDValue &Offset, bool &Imm) const {
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
  if (!C) {
    // Non-constant offset: accept a bare 32-bit scalar integer value...
    if (ByteOffsetNode.getValueType().isScalarInteger() &&
        ByteOffsetNode.getValueType().getSizeInBits() == 32) {
      Offset = ByteOffsetNode;
      Imm = false;
      return true;
    }
    // ...or a 32-bit value behind a zero_extend.
    if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) {
      if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) {
        Offset = ByteOffsetNode.getOperand(0);
        Imm = false;
        return true;
      }
    }
    return false;
  }

  SDLoc SL(ByteOffsetNode);
  // GFX9 and GFX10 have signed byte immediate offsets.
  int64_t ByteOffset = C->getSExtValue();
  Optional<int64_t> EncodedOffset =
      AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset, false);
  if (EncodedOffset) {
    Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
    Imm = true;
    return true;
  }

  // SGPR and literal offsets are unsigned.
  if (ByteOffset < 0)
    return false;

  EncodedOffset = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget, ByteOffset);
  if (EncodedOffset) {
    Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
    // Note: Imm is intentionally left unchanged here — a 32-bit literal
    // offset is not an encoded immediate. Callers (e.g. SelectSMRDImm32)
    // pre-initialize Imm to false and detect this case by checking for a
    // constant Offset with Imm == false.
    return true;
  }

  // Last resort: move the constant into an SGPR with S_MOV_B32.
  if (!isUInt<32>(ByteOffset) && !isInt<32>(ByteOffset))
    return false;

  SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
  Offset = SDValue(
      CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, C32Bit), 0);

  return true;
}
1974 
1975 SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
1976   if (Addr.getValueType() != MVT::i32)
1977     return Addr;
1978 
1979   // Zero-extend a 32-bit address.
1980   SDLoc SL(Addr);
1981 
1982   const MachineFunction &MF = CurDAG->getMachineFunction();
1983   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1984   unsigned AddrHiVal = Info->get32BitAddressHighBits();
1985   SDValue AddrHi = CurDAG->getTargetConstant(AddrHiVal, SL, MVT::i32);
1986 
1987   const SDValue Ops[] = {
1988     CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32),
1989     Addr,
1990     CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
1991     SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi),
1992             0),
1993     CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32),
1994   };
1995 
1996   return SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64,
1997                                         Ops), 0);
1998 }
1999 
// Match an SMRD address as (SBase + Offset). Always succeeds: when no offset
// can be split out, the whole address becomes SBase with a zero immediate
// offset.
bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
                                     SDValue &Offset, bool &Imm) const {
  SDLoc SL(Addr);

  // A 32-bit (address + offset) should not cause unsigned 32-bit integer
  // wraparound, because s_load instructions perform the addition in 64 bits.
  if ((Addr.getValueType() != MVT::i32 ||
       Addr->getFlags().hasNoUnsignedWrap())) {
    SDValue N0, N1;
    // Extract the base and offset if possible.
    if (CurDAG->isBaseWithConstantOffset(Addr) ||
        Addr.getOpcode() == ISD::ADD) {
      N0 = Addr.getOperand(0);
      N1 = Addr.getOperand(1);
    } else if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, N0, N1)) {
      assert(N0 && N1 && isa<ConstantSDNode>(N1));
    }
    // N0/N1 stay null unless one of the matchers above fired.
    if (N0 && N1) {
      if (SelectSMRDOffset(N1, Offset, Imm)) {
        // A 32-bit base is widened to 64 bits here if needed.
        SBase = Expand32BitAddress(N0);
        return true;
      }
    }
  }
  // Fallback: no foldable offset; use the whole address as the base.
  SBase = Expand32BitAddress(Addr);
  Offset = CurDAG->getTargetConstant(0, SL, MVT::i32);
  Imm = true;
  return true;
}
2029 
2030 bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
2031                                        SDValue &Offset) const {
2032   bool Imm = false;
2033   return SelectSMRD(Addr, SBase, Offset, Imm) && Imm;
2034 }
2035 
2036 bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
2037                                          SDValue &Offset) const {
2038 
2039   assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2040 
2041   bool Imm = false;
2042   if (!SelectSMRD(Addr, SBase, Offset, Imm))
2043     return false;
2044 
2045   return !Imm && isa<ConstantSDNode>(Offset);
2046 }
2047 
2048 bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase,
2049                                         SDValue &Offset) const {
2050   bool Imm = false;
2051   return SelectSMRD(Addr, SBase, Offset, Imm) && !Imm &&
2052          !isa<ConstantSDNode>(Offset);
2053 }
2054 
2055 bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue Addr,
2056                                              SDValue &Offset) const {
2057   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr)) {
2058     // The immediate offset for S_BUFFER instructions is unsigned.
2059     if (auto Imm =
2060             AMDGPU::getSMRDEncodedOffset(*Subtarget, C->getZExtValue(), true)) {
2061       Offset = CurDAG->getTargetConstant(*Imm, SDLoc(Addr), MVT::i32);
2062       return true;
2063     }
2064   }
2065 
2066   return false;
2067 }
2068 
2069 bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue Addr,
2070                                                SDValue &Offset) const {
2071   assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2072 
2073   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr)) {
2074     if (auto Imm = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget,
2075                                                          C->getZExtValue())) {
2076       Offset = CurDAG->getTargetConstant(*Imm, SDLoc(Addr), MVT::i32);
2077       return true;
2078     }
2079   }
2080 
2081   return false;
2082 }
2083 
2084 bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
2085                                             SDValue &Base,
2086                                             SDValue &Offset) const {
2087   SDLoc DL(Index);
2088 
2089   if (CurDAG->isBaseWithConstantOffset(Index)) {
2090     SDValue N0 = Index.getOperand(0);
2091     SDValue N1 = Index.getOperand(1);
2092     ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
2093 
2094     // (add n0, c0)
2095     // Don't peel off the offset (c0) if doing so could possibly lead
2096     // the base (n0) to be negative.
2097     // (or n0, |c0|) can never change a sign given isBaseWithConstantOffset.
2098     if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0) ||
2099         (Index->getOpcode() == ISD::OR && C1->getSExtValue() >= 0)) {
2100       Base = N0;
2101       Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
2102       return true;
2103     }
2104   }
2105 
2106   if (isa<ConstantSDNode>(Index))
2107     return false;
2108 
2109   Base = Index;
2110   Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
2111   return true;
2112 }
2113 
2114 SDNode *AMDGPUDAGToDAGISel::getS_BFE(unsigned Opcode, const SDLoc &DL,
2115                                      SDValue Val, uint32_t Offset,
2116                                      uint32_t Width) {
2117   // Transformation function, pack the offset and width of a BFE into
2118   // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
2119   // source, bits [5:0] contain the offset and bits [22:16] the width.
2120   uint32_t PackedVal = Offset | (Width << 16);
2121   SDValue PackedConst = CurDAG->getTargetConstant(PackedVal, DL, MVT::i32);
2122 
2123   return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst);
2124 }
2125 
2126 void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
2127   // "(a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c)
2128   // "(a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c)
2129   // Predicate: 0 < b <= c < 32
2130 
2131   const SDValue &Shl = N->getOperand(0);
2132   ConstantSDNode *B = dyn_cast<ConstantSDNode>(Shl->getOperand(1));
2133   ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
2134 
2135   if (B && C) {
2136     uint32_t BVal = B->getZExtValue();
2137     uint32_t CVal = C->getZExtValue();
2138 
2139     if (0 < BVal && BVal <= CVal && CVal < 32) {
2140       bool Signed = N->getOpcode() == ISD::SRA;
2141       unsigned Opcode = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2142 
2143       ReplaceNode(N, getS_BFE(Opcode, SDLoc(N), Shl.getOperand(0), CVal - BVal,
2144                               32 - CVal));
2145       return;
2146     }
2147   }
2148   SelectCode(N);
2149 }
2150 
// Try to select AND/SRL/SRA/SIGN_EXTEND_INREG nodes that implement a
// bitfield extract as a single S_BFE instruction; fall back to table-driven
// selection otherwise.
void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
  switch (N->getOpcode()) {
  case ISD::AND:
    if (N->getOperand(0).getOpcode() == ISD::SRL) {
      // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)"
      // Predicate: isMask(mask)
      const SDValue &Srl = N->getOperand(0);
      ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Srl.getOperand(1));
      ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1));

      if (Shift && Mask) {
        uint32_t ShiftVal = Shift->getZExtValue();
        uint32_t MaskVal = Mask->getZExtValue();

        if (isMask_32(MaskVal)) {
          // The field width is the number of set bits in the mask.
          uint32_t WidthVal = countPopulation(MaskVal);

          ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N),
                                  Srl.getOperand(0), ShiftVal, WidthVal));
          return;
        }
      }
    }
    break;
  case ISD::SRL:
    if (N->getOperand(0).getOpcode() == ISD::AND) {
      // "(a & mask) srl b)" ---> "BFE_U32 a, b, popcount(mask >> b)"
      // Predicate: isMask(mask >> b)
      const SDValue &And = N->getOperand(0);
      ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(N->getOperand(1));
      ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(And->getOperand(1));

      if (Shift && Mask) {
        uint32_t ShiftVal = Shift->getZExtValue();
        uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal;

        if (isMask_32(MaskVal)) {
          uint32_t WidthVal = countPopulation(MaskVal);

          ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N),
                                  And.getOperand(0), ShiftVal, WidthVal));
          return;
        }
      }
    } else if (N->getOperand(0).getOpcode() == ISD::SHL) {
      // Handled by the shl+srl/sra helper.
      SelectS_BFEFromShifts(N);
      return;
    }
    break;
  case ISD::SRA:
    if (N->getOperand(0).getOpcode() == ISD::SHL) {
      SelectS_BFEFromShifts(N);
      return;
    }
    break;

  case ISD::SIGN_EXTEND_INREG: {
    // sext_inreg (srl x, 16), i8 -> bfe_i32 x, 16, 8
    SDValue Src = N->getOperand(0);
    if (Src.getOpcode() != ISD::SRL)
      break;

    // The shift amount must be a constant to become the BFE offset.
    const ConstantSDNode *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1));
    if (!Amt)
      break;

    // The sign-extended type determines the field width.
    unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
    ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_I32, SDLoc(N), Src.getOperand(0),
                            Amt->getZExtValue(), Width));
    return;
  }
  }

  // No BFE pattern matched; use the default selector.
  SelectCode(N);
}
2226 
2227 bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const {
2228   assert(N->getOpcode() == ISD::BRCOND);
2229   if (!N->hasOneUse())
2230     return false;
2231 
2232   SDValue Cond = N->getOperand(1);
2233   if (Cond.getOpcode() == ISD::CopyToReg)
2234     Cond = Cond.getOperand(2);
2235 
2236   if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
2237     return false;
2238 
2239   MVT VT = Cond.getOperand(0).getSimpleValueType();
2240   if (VT == MVT::i32)
2241     return true;
2242 
2243   if (VT == MVT::i64) {
2244     auto ST = static_cast<const GCNSubtarget *>(Subtarget);
2245 
2246     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
2247     return (CC == ISD::SETEQ || CC == ISD::SETNE) && ST->hasScalarCompareEq64();
2248   }
2249 
2250   return false;
2251 }
2252 
// Select a BRCOND node into either an SCC-based or VCC-based conditional
// branch, copying the condition into the corresponding physical register.
void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
  SDValue Cond = N->getOperand(1);

  // An undef condition degenerates to an unconditional "undef branch"
  // pseudo.
  if (Cond.isUndef()) {
    CurDAG->SelectNodeTo(N, AMDGPU::SI_BR_UNDEF, MVT::Other,
                         N->getOperand(2), N->getOperand(0));
    return;
  }

  const GCNSubtarget *ST = static_cast<const GCNSubtarget *>(Subtarget);
  const SIRegisterInfo *TRI = ST->getRegisterInfo();

  // Use the scalar branch only for a uniform branch with an SCC-compatible
  // compare; otherwise branch on VCC.
  bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
  unsigned BrOp = UseSCCBr ? AMDGPU::S_CBRANCH_SCC1 : AMDGPU::S_CBRANCH_VCCNZ;
  Register CondReg = UseSCCBr ? AMDGPU::SCC : TRI->getVCC();
  SDLoc SL(N);

  if (!UseSCCBr) {
    // This is the case that we are selecting to S_CBRANCH_VCCNZ.  We have not
    // analyzed what generates the vcc value, so we do not know whether vcc
    // bits for disabled lanes are 0.  Thus we need to mask out bits for
    // disabled lanes.
    //
    // For the case that we select S_CBRANCH_SCC1 and it gets
    // changed to S_CBRANCH_VCCNZ in SIFixSGPRCopies, SIFixSGPRCopies calls
    // SIInstrInfo::moveToVALU which inserts the S_AND).
    //
    // We could add an analysis of what generates the vcc value here and omit
    // the S_AND when is unnecessary. But it would be better to add a separate
    // pass after SIFixSGPRCopies to do the unnecessary S_AND removal, so it
    // catches both cases.
    Cond = SDValue(CurDAG->getMachineNode(ST->isWave32() ? AMDGPU::S_AND_B32
                                                         : AMDGPU::S_AND_B64,
                     SL, MVT::i1,
                     CurDAG->getRegister(ST->isWave32() ? AMDGPU::EXEC_LO
                                                        : AMDGPU::EXEC,
                                         MVT::i1),
                    Cond),
                   0);
  }

  // Copy the (possibly masked) condition into SCC/VCC and branch on it.
  SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, CondReg, Cond);
  CurDAG->SelectNodeTo(N, BrOp, MVT::Other,
                       N->getOperand(2), // Basic Block
                       VCC.getValue(0));
}
2299 
// Select f32 FMAD/FMA nodes as mixed-precision V_MAD_MIX_F32/V_FMA_MIX_F32
// when at least one operand is converted from f16; otherwise defer to the
// default selector.
void AMDGPUDAGToDAGISel::SelectFMAD_FMA(SDNode *N) {
  MVT VT = N->getSimpleValueType(0);
  bool IsFMA = N->getOpcode() == ISD::FMA;
  // Bail to default selection unless the result is f32 and the node kind
  // matches the subtarget's available mix instruction: per the condition
  // below, FMA is only taken when FmaMix (not MadMix) is present, and FMAD
  // only when MadMix (not FmaMix) is present.
  if (VT != MVT::f32 || (!Subtarget->hasMadMixInsts() &&
                         !Subtarget->hasFmaMixInsts()) ||
      ((IsFMA && Subtarget->hasMadMixInsts()) ||
       (!IsFMA && Subtarget->hasFmaMixInsts()))) {
    SelectCode(N);
    return;
  }

  SDValue Src0 = N->getOperand(0);
  SDValue Src1 = N->getOperand(1);
  SDValue Src2 = N->getOperand(2);
  unsigned Src0Mods, Src1Mods, Src2Mods;

  // Avoid using v_mad_mix_f32/v_fma_mix_f32 unless there is actually an operand
  // using the conversion from f16.
  bool Sel0 = SelectVOP3PMadMixModsImpl(Src0, Src0, Src0Mods);
  bool Sel1 = SelectVOP3PMadMixModsImpl(Src1, Src1, Src1Mods);
  bool Sel2 = SelectVOP3PMadMixModsImpl(Src2, Src2, Src2Mods);

  assert((IsFMA || !Mode.allFP32Denormals()) &&
         "fmad selected with denormals enabled");
  // TODO: We can select this with f32 denormals enabled if all the sources are
  // converted from f16 (in which case fmad isn't legal).

  if (Sel0 || Sel1 || Sel2) {
    // For dummy operands.
    SDValue Zero = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
    // Operand layout: (mods, src) triples, then clamp and two dummies.
    SDValue Ops[] = {
      CurDAG->getTargetConstant(Src0Mods, SDLoc(), MVT::i32), Src0,
      CurDAG->getTargetConstant(Src1Mods, SDLoc(), MVT::i32), Src1,
      CurDAG->getTargetConstant(Src2Mods, SDLoc(), MVT::i32), Src2,
      CurDAG->getTargetConstant(0, SDLoc(), MVT::i1),
      Zero, Zero
    };

    CurDAG->SelectNodeTo(N,
                         IsFMA ? AMDGPU::V_FMA_MIX_F32 : AMDGPU::V_MAD_MIX_F32,
                         MVT::f32, Ops);
  } else {
    // No f16-converted operand found; use the normal instruction.
    SelectCode(N);
  }
}
2345 
// This is here because there isn't a way to use the generated sub0_sub1 as the
// subreg index to EXTRACT_SUBREG in tablegen.
// Selects an atomic compare-and-swap against a MUBUF buffer instruction
// (addr64 form if available, otherwise the plain-offset form), then extracts
// the low half of the returned data pair as the result value.
void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) {
  MemSDNode *Mem = cast<MemSDNode>(N);
  unsigned AS = Mem->getAddressSpace();
  // Flat-address cmpswap is handled by the generated matcher.
  if (AS == AMDGPUAS::FLAT_ADDRESS) {
    SelectCode(N);
    return;
  }

  MVT VT = N->getSimpleValueType(0);
  bool Is32 = (VT == MVT::i32);
  SDLoc SL(N);

  MachineSDNode *CmpSwap = nullptr;
  // First attempt: the ADDR64 form, on subtargets that have it.
  if (Subtarget->hasAddr64()) {
    SDValue SRsrc, VAddr, SOffset, Offset;

    if (SelectMUBUFAddr64(Mem->getBasePtr(), SRsrc, VAddr, SOffset, Offset)) {
      unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN :
        AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN;
      SDValue CmpVal = Mem->getOperand(2);
      SDValue CPol = CurDAG->getTargetConstant(AMDGPU::CPol::GLC, SL, MVT::i32);

      // XXX - Do we care about glue operands?

      SDValue Ops[] = {CmpVal, VAddr, SRsrc, SOffset, Offset, CPol,
                       Mem->getChain()};

      CmpSwap = CurDAG->getMachineNode(Opcode, SL, Mem->getVTList(), Ops);
    }
  }

  // Second attempt: the plain-offset form.
  if (!CmpSwap) {
    SDValue SRsrc, SOffset, Offset;
    if (SelectMUBUFOffset(Mem->getBasePtr(), SRsrc, SOffset, Offset)) {
      unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN :
        AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN;

      SDValue CmpVal = Mem->getOperand(2);
      SDValue CPol = CurDAG->getTargetConstant(AMDGPU::CPol::GLC, SL, MVT::i32);
      SDValue Ops[] = {CmpVal, SRsrc, SOffset, Offset, CPol, Mem->getChain()};

      CmpSwap = CurDAG->getMachineNode(Opcode, SL, Mem->getVTList(), Ops);
    }
  }

  // Neither address form matched; fall back to default selection.
  if (!CmpSwap) {
    SelectCode(N);
    return;
  }

  // Transfer the memory operand to the new machine node.
  MachineMemOperand *MMO = Mem->getMemOperand();
  CurDAG->setNodeMemRefs(CmpSwap, {MMO});

  // The instruction returns {data, cmp} as a pair; the original value lives
  // in the low subregister(s).
  unsigned SubReg = Is32 ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
  SDValue Extract
    = CurDAG->getTargetExtractSubreg(SubReg, SL, VT, SDValue(CmpSwap, 0));

  ReplaceUses(SDValue(N, 0), Extract);
  ReplaceUses(SDValue(N, 1), SDValue(CmpSwap, 1));
  CurDAG->RemoveDeadNode(N);
}
2409 
// Select ds_append / ds_consume: place the pointer (or its base) in M0 via a
// glued copy, and fold any legal constant offset into the immediate field.
void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
  // The address is assumed to be uniform, so if it ends up in a VGPR, it will
  // be copied to an SGPR with readfirstlane.
  unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
    AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;

  SDValue Chain = N->getOperand(0);
  SDValue Ptr = N->getOperand(2);
  MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
  MachineMemOperand *MMO = M->getMemOperand();
  // REGION_ADDRESS selects the GDS form of the instruction.
  bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;

  // If the pointer is (base + constant) and the constant is a legal DS
  // offset, copy only the base into M0 and use the constant as the offset
  // operand.
  SDValue Offset;
  if (CurDAG->isBaseWithConstantOffset(Ptr)) {
    SDValue PtrBase = Ptr.getOperand(0);
    SDValue PtrOffset = Ptr.getOperand(1);

    const APInt &OffsetVal = cast<ConstantSDNode>(PtrOffset)->getAPIntValue();
    if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue())) {
      N = glueCopyToM0(N, PtrBase);
      Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
    }
  }

  // Otherwise the whole pointer goes in M0 with a zero immediate offset.
  if (!Offset) {
    N = glueCopyToM0(N, Ptr);
    Offset = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
  }

  SDValue Ops[] = {
    Offset,
    CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32),
    Chain,
    N->getOperand(N->getNumOperands() - 1) // New glue
  };

  SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
}
2449 
2450 static unsigned gwsIntrinToOpcode(unsigned IntrID) {
2451   switch (IntrID) {
2452   case Intrinsic::amdgcn_ds_gws_init:
2453     return AMDGPU::DS_GWS_INIT;
2454   case Intrinsic::amdgcn_ds_gws_barrier:
2455     return AMDGPU::DS_GWS_BARRIER;
2456   case Intrinsic::amdgcn_ds_gws_sema_v:
2457     return AMDGPU::DS_GWS_SEMA_V;
2458   case Intrinsic::amdgcn_ds_gws_sema_br:
2459     return AMDGPU::DS_GWS_SEMA_BR;
2460   case Intrinsic::amdgcn_ds_gws_sema_p:
2461     return AMDGPU::DS_GWS_SEMA_P;
2462   case Intrinsic::amdgcn_ds_gws_sema_release_all:
2463     return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
2464   default:
2465     llvm_unreachable("not a gws intrinsic");
2466   }
2467 }
2468 
// Select the DS_GWS family of intrinsics: compute the 6-bit resource offset
// (split between M0[21:16] and the immediate field) and emit the machine node.
void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
  if (IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
      !Subtarget->hasGWSSemaReleaseAll()) {
    // Let this error.
    SelectCode(N);
    return;
  }

  // Chain, intrinsic ID, vsrc, offset
  const bool HasVSrc = N->getNumOperands() == 4;
  assert(HasVSrc || N->getNumOperands() == 3);

  SDLoc SL(N);
  SDValue BaseOffset = N->getOperand(HasVSrc ? 3 : 2);
  int ImmOffset = 0;
  MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
  MachineMemOperand *MMO = M->getMemOperand();

  // Don't worry if the offset ends up in a VGPR. Only one lane will have
  // effect, so SIFixSGPRCopies will validly insert readfirstlane.

  // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
  // offset field) % 64. Some versions of the programming guide omit the m0
  // part, or claim it's from offset 0.
  if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(BaseOffset)) {
    // If we have a constant offset, try to use the 0 in m0 as the base.
    // TODO: Look into changing the default m0 initialization value. If the
    // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
    // the immediate offset.
    glueCopyToM0(N, CurDAG->getTargetConstant(0, SL, MVT::i32));
    ImmOffset = ConstOffset->getZExtValue();
  } else {
    // Split a (base + constant) offset so the constant lands in the
    // instruction's immediate offset field.
    if (CurDAG->isBaseWithConstantOffset(BaseOffset)) {
      ImmOffset = BaseOffset.getConstantOperandVal(1);
      BaseOffset = BaseOffset.getOperand(0);
    }

    // Prefer to do the shift in an SGPR since it should be possible to use m0
    // as the result directly. If it's already an SGPR, it will be eliminated
    // later.
    SDNode *SGPROffset
      = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32,
                               BaseOffset);
    // Shift to offset in m0
    SDNode *M0Base
      = CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
                               SDValue(SGPROffset, 0),
                               CurDAG->getTargetConstant(16, SL, MVT::i32));
    glueCopyToM0(N, SDValue(M0Base, 0));
  }

  SDValue Chain = N->getOperand(0);
  SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32);

  const unsigned Opc = gwsIntrinToOpcode(IntrID);
  SmallVector<SDValue, 5> Ops;
  if (HasVSrc)
    Ops.push_back(N->getOperand(2));
  Ops.push_back(OffsetField);
  Ops.push_back(Chain);

  SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
}
2533 
// Custom selection for llvm.amdgcn.interp.p1.f16. On subtargets with 16 LDS
// banks it must expand to two instructions (see comment below); otherwise the
// generated single-instruction pattern handles it.
void AMDGPUDAGToDAGISel::SelectInterpP1F16(SDNode *N) {
  if (Subtarget->getLDSBankCount() != 16) {
    // This is a single instruction with a pattern.
    SelectCode(N);
    return;
  }

  SDLoc DL(N);

  // This requires 2 instructions. It is possible to write a pattern to support
  // this, but the generated isel emitter doesn't correctly deal with multiple
  // output instructions using the same physical register input. The copy to m0
  // is incorrectly placed before the second instruction.
  //
  // TODO: Match source modifiers.
  //
  // def : Pat <
  //   (int_amdgcn_interp_p1_f16
  //    (VOP3Mods f32:$src0, i32:$src0_modifiers),
  //                             (i32 timm:$attrchan), (i32 timm:$attr),
  //                             (i1 timm:$high), M0),
  //   (V_INTERP_P1LV_F16 $src0_modifiers, VGPR_32:$src0, timm:$attr,
  //       timm:$attrchan, 0,
  //       (V_INTERP_MOV_F32 2, timm:$attr, timm:$attrchan), timm:$high)> {
  //   let Predicates = [has16BankLDS];
  // }

  // 16 bank LDS
  // Glue the M0 copy to V_INTERP_MOV_F32, and that to V_INTERP_P1LV_F16, so
  // the copy is scheduled immediately before both uses.
  SDValue ToM0 = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, AMDGPU::M0,
                                      N->getOperand(5), SDValue());

  SDVTList VTs = CurDAG->getVTList(MVT::f32, MVT::Other);

  SDNode *InterpMov =
    CurDAG->getMachineNode(AMDGPU::V_INTERP_MOV_F32, DL, VTs, {
        CurDAG->getTargetConstant(2, DL, MVT::i32), // P0
        N->getOperand(3),  // Attr
        N->getOperand(2),  // Attrchan
        ToM0.getValue(1) // In glue
  });

  SDNode *InterpP1LV =
    CurDAG->getMachineNode(AMDGPU::V_INTERP_P1LV_F16, DL, MVT::f32, {
        CurDAG->getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
        N->getOperand(1), // Src0
        N->getOperand(3), // Attr
        N->getOperand(2), // Attrchan
        CurDAG->getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
        SDValue(InterpMov, 0), // Src2 - holds two f16 values selected by high
        N->getOperand(4), // high
        CurDAG->getTargetConstant(0, DL, MVT::i1), // $clamp
        CurDAG->getTargetConstant(0, DL, MVT::i32), // $omod
        SDValue(InterpMov, 1)
  });

  CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), SDValue(InterpP1LV, 0));
}
2591 
2592 void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
2593   unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
2594   switch (IntrID) {
2595   case Intrinsic::amdgcn_ds_append:
2596   case Intrinsic::amdgcn_ds_consume: {
2597     if (N->getValueType(0) != MVT::i32)
2598       break;
2599     SelectDSAppendConsume(N, IntrID);
2600     return;
2601   }
2602   }
2603 
2604   SelectCode(N);
2605 }
2606 
2607 void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
2608   unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
2609   unsigned Opcode;
2610   switch (IntrID) {
2611   case Intrinsic::amdgcn_wqm:
2612     Opcode = AMDGPU::WQM;
2613     break;
2614   case Intrinsic::amdgcn_softwqm:
2615     Opcode = AMDGPU::SOFT_WQM;
2616     break;
2617   case Intrinsic::amdgcn_wwm:
2618   case Intrinsic::amdgcn_strict_wwm:
2619     Opcode = AMDGPU::STRICT_WWM;
2620     break;
2621   case Intrinsic::amdgcn_strict_wqm:
2622     Opcode = AMDGPU::STRICT_WQM;
2623     break;
2624   case Intrinsic::amdgcn_interp_p1_f16:
2625     SelectInterpP1F16(N);
2626     return;
2627   default:
2628     SelectCode(N);
2629     return;
2630   }
2631 
2632   SDValue Src = N->getOperand(1);
2633   CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), {Src});
2634 }
2635 
2636 void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
2637   unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
2638   switch (IntrID) {
2639   case Intrinsic::amdgcn_ds_gws_init:
2640   case Intrinsic::amdgcn_ds_gws_barrier:
2641   case Intrinsic::amdgcn_ds_gws_sema_v:
2642   case Intrinsic::amdgcn_ds_gws_sema_br:
2643   case Intrinsic::amdgcn_ds_gws_sema_p:
2644   case Intrinsic::amdgcn_ds_gws_sema_release_all:
2645     SelectDS_GWS(N, IntrID);
2646     return;
2647   default:
2648     break;
2649   }
2650 
2651   SelectCode(N);
2652 }
2653 
2654 bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
2655                                             unsigned &Mods,
2656                                             bool AllowAbs) const {
2657   Mods = 0;
2658   Src = In;
2659 
2660   if (Src.getOpcode() == ISD::FNEG) {
2661     Mods |= SISrcMods::NEG;
2662     Src = Src.getOperand(0);
2663   }
2664 
2665   if (AllowAbs && Src.getOpcode() == ISD::FABS) {
2666     Mods |= SISrcMods::ABS;
2667     Src = Src.getOperand(0);
2668   }
2669 
2670   return true;
2671 }
2672 
2673 bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
2674                                         SDValue &SrcMods) const {
2675   unsigned Mods;
2676   if (SelectVOP3ModsImpl(In, Src, Mods)) {
2677     SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2678     return true;
2679   }
2680 
2681   return false;
2682 }
2683 
2684 bool AMDGPUDAGToDAGISel::SelectVOP3BMods(SDValue In, SDValue &Src,
2685                                          SDValue &SrcMods) const {
2686   unsigned Mods;
2687   if (SelectVOP3ModsImpl(In, Src, Mods, /* AllowAbs */ false)) {
2688     SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2689     return true;
2690   }
2691 
2692   return false;
2693 }
2694 
2695 bool AMDGPUDAGToDAGISel::SelectVOP3Mods_NNaN(SDValue In, SDValue &Src,
2696                                              SDValue &SrcMods) const {
2697   SelectVOP3Mods(In, Src, SrcMods);
2698   return isNoNanSrc(Src);
2699 }
2700 
2701 bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
2702   if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG)
2703     return false;
2704 
2705   Src = In;
2706   return true;
2707 }
2708 
2709 bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
2710                                          SDValue &SrcMods, SDValue &Clamp,
2711                                          SDValue &Omod) const {
2712   SDLoc DL(In);
2713   Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
2714   Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
2715 
2716   return SelectVOP3Mods(In, Src, SrcMods);
2717 }
2718 
2719 bool AMDGPUDAGToDAGISel::SelectVOP3BMods0(SDValue In, SDValue &Src,
2720                                           SDValue &SrcMods, SDValue &Clamp,
2721                                           SDValue &Omod) const {
2722   SDLoc DL(In);
2723   Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
2724   Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
2725 
2726   return SelectVOP3BMods(In, Src, SrcMods);
2727 }
2728 
2729 bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
2730                                          SDValue &Clamp, SDValue &Omod) const {
2731   Src = In;
2732 
2733   SDLoc DL(In);
2734   Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
2735   Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
2736 
2737   return true;
2738 }
2739 
2740 bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
2741                                          SDValue &SrcMods) const {
2742   unsigned Mods = 0;
2743   Src = In;
2744 
2745   if (Src.getOpcode() == ISD::FNEG) {
2746     Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
2747     Src = Src.getOperand(0);
2748   }
2749 
2750   if (Src.getOpcode() == ISD::BUILD_VECTOR) {
2751     unsigned VecMods = Mods;
2752 
2753     SDValue Lo = stripBitcast(Src.getOperand(0));
2754     SDValue Hi = stripBitcast(Src.getOperand(1));
2755 
2756     if (Lo.getOpcode() == ISD::FNEG) {
2757       Lo = stripBitcast(Lo.getOperand(0));
2758       Mods ^= SISrcMods::NEG;
2759     }
2760 
2761     if (Hi.getOpcode() == ISD::FNEG) {
2762       Hi = stripBitcast(Hi.getOperand(0));
2763       Mods ^= SISrcMods::NEG_HI;
2764     }
2765 
2766     if (isExtractHiElt(Lo, Lo))
2767       Mods |= SISrcMods::OP_SEL_0;
2768 
2769     if (isExtractHiElt(Hi, Hi))
2770       Mods |= SISrcMods::OP_SEL_1;
2771 
2772     unsigned VecSize = Src.getValueSizeInBits();
2773     Lo = stripExtractLoElt(Lo);
2774     Hi = stripExtractLoElt(Hi);
2775 
2776     if (Lo.getValueSizeInBits() > VecSize) {
2777       Lo = CurDAG->getTargetExtractSubreg(
2778         (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
2779         MVT::getIntegerVT(VecSize), Lo);
2780     }
2781 
2782     if (Hi.getValueSizeInBits() > VecSize) {
2783       Hi = CurDAG->getTargetExtractSubreg(
2784         (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
2785         MVT::getIntegerVT(VecSize), Hi);
2786     }
2787 
2788     assert(Lo.getValueSizeInBits() <= VecSize &&
2789            Hi.getValueSizeInBits() <= VecSize);
2790 
2791     if (Lo == Hi && !isInlineImmediate(Lo.getNode())) {
2792       // Really a scalar input. Just select from the low half of the register to
2793       // avoid packing.
2794 
2795       if (VecSize == 32 || VecSize == Lo.getValueSizeInBits()) {
2796         Src = Lo;
2797       } else {
2798         assert(Lo.getValueSizeInBits() == 32 && VecSize == 64);
2799 
2800         SDLoc SL(In);
2801         SDValue Undef = SDValue(
2802           CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL,
2803                                  Lo.getValueType()), 0);
2804         auto RC = Lo->isDivergent() ? AMDGPU::VReg_64RegClassID
2805                                     : AMDGPU::SReg_64RegClassID;
2806         const SDValue Ops[] = {
2807           CurDAG->getTargetConstant(RC, SL, MVT::i32),
2808           Lo, CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
2809           Undef, CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32) };
2810 
2811         Src = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
2812                                              Src.getValueType(), Ops), 0);
2813       }
2814       SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2815       return true;
2816     }
2817 
2818     if (VecSize == 64 && Lo == Hi && isa<ConstantFPSDNode>(Lo)) {
2819       uint64_t Lit = cast<ConstantFPSDNode>(Lo)->getValueAPF()
2820                       .bitcastToAPInt().getZExtValue();
2821       if (AMDGPU::isInlinableLiteral32(Lit, Subtarget->hasInv2PiInlineImm())) {
2822         Src = CurDAG->getTargetConstant(Lit, SDLoc(In), MVT::i64);;
2823         SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2824         return true;
2825       }
2826     }
2827 
2828     Mods = VecMods;
2829   }
2830 
2831   // Packed instructions do not have abs modifiers.
2832   Mods |= SISrcMods::OP_SEL_1;
2833 
2834   SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2835   return true;
2836 }
2837 
2838 bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
2839                                          SDValue &SrcMods) const {
2840   Src = In;
2841   // FIXME: Handle op_sel
2842   SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
2843   return true;
2844 }
2845 
2846 bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
2847                                              SDValue &SrcMods) const {
2848   // FIXME: Handle op_sel
2849   return SelectVOP3Mods(In, Src, SrcMods);
2850 }
2851 
// The return value is not whether the match is possible (which it always is),
// but whether or not a conversion is really used (i.e. the source was an
// FP_EXTEND from f16, so selecting the mixed-precision form saves the
// conversion).
bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
                                                   unsigned &Mods) const {
  Mods = 0;
  // First fold any outer fneg/fabs into the modifier bits.
  SelectVOP3ModsImpl(In, Src, Mods);

  if (Src.getOpcode() == ISD::FP_EXTEND) {
    Src = Src.getOperand(0);
    assert(Src.getValueType() == MVT::f16);
    Src = stripBitcast(Src);

    // Be careful about folding modifiers if we already have an abs. fneg is
    // applied last, so we don't want to apply an earlier fneg.
    if ((Mods & SISrcMods::ABS) == 0) {
      unsigned ModsTmp;
      SelectVOP3ModsImpl(Src, Src, ModsTmp);

      // An inner fneg composes with any outer fneg (hence xor); an inner
      // fabs just forces the abs bit on.
      if ((ModsTmp & SISrcMods::NEG) != 0)
        Mods ^= SISrcMods::NEG;

      if ((ModsTmp & SISrcMods::ABS) != 0)
        Mods |= SISrcMods::ABS;
    }

    // op_sel/op_sel_hi decide the source type and source.
    // If the source's op_sel_hi is set, it indicates to do a conversion from
    // f16. If the source's op_sel is set, it picks the high half of the source
    // register.

    Mods |= SISrcMods::OP_SEL_1;
    if (isExtractHiElt(Src, Src)) {
      Mods |= SISrcMods::OP_SEL_0;

      // TODO: Should we try to look for neg/abs here?
    }

    return true;
  }

  return false;
}
2894 
2895 bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
2896                                                SDValue &SrcMods) const {
2897   unsigned Mods = 0;
2898   SelectVOP3PMadMixModsImpl(In, Src, Mods);
2899   SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2900   return true;
2901 }
2902 
2903 SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
2904   if (In.isUndef())
2905     return CurDAG->getUNDEF(MVT::i32);
2906 
2907   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) {
2908     SDLoc SL(In);
2909     return CurDAG->getConstant(C->getZExtValue() << 16, SL, MVT::i32);
2910   }
2911 
2912   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) {
2913     SDLoc SL(In);
2914     return CurDAG->getConstant(
2915       C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
2916   }
2917 
2918   SDValue Src;
2919   if (isExtractHiElt(In, Src))
2920     return Src;
2921 
2922   return SDValue();
2923 }
2924 
// Decide whether the immediate-like node N should be materialized in a VGPR:
// true when at least one (non-commutable-away) use strictly requires a VGPR
// operand, checked over at most the first 10 uses.
bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
  assert(CurDAG->getTarget().getTargetTriple().getArch() == Triple::amdgcn);

  const SIRegisterInfo *SIRI =
    static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
  const SIInstrInfo * SII =
    static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());

  unsigned Limit = 0;
  bool AllUsesAcceptSReg = true;
  // Cap the scan at 10 uses; past that we conservatively answer false (see
  // the Limit < 10 term in the return below).
  for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
    Limit < 10 && U != E; ++U, ++Limit) {
    const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo());

    // If the register class is unknown, it could be an unknown
    // register class that needs to be an SGPR, e.g. an inline asm
    // constraint
    if (!RC || SIRI->isSGPRClass(RC))
      return false;

    if (RC != &AMDGPU::VS_32RegClass) {
      AllUsesAcceptSReg = false;
      SDNode * User = *U;
      if (User->isMachineOpcode()) {
        unsigned Opc = User->getMachineOpcode();
        MCInstrDesc Desc = SII->get(Opc);
        if (Desc.isCommutable()) {
          // See whether commuting this operand of the user would move it into
          // a slot that does accept VS_32 (either register bank).
          unsigned OpIdx = Desc.getNumDefs() + U.getOperandNo();
          unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
          if (SII->findCommutedOpIndices(Desc, OpIdx, CommuteIdx1)) {
            unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs();
            const TargetRegisterClass *CommutedRC = getOperandRegClass(*U, CommutedOpNo);
            if (CommutedRC == &AMDGPU::VS_32RegClass)
              AllUsesAcceptSReg = true;
          }
        }
      }
      // If "AllUsesAcceptSReg == false" so far we haven't succeeded
      // commuting the current user. This means we have at least one use
      // that strictly requires a VGPR. Thus, we will not attempt to commute
      // other user instructions.
      if (!AllUsesAcceptSReg)
        break;
    }
  }
  return !AllUsesAcceptSReg && (Limit < 10);
}
2972 
2973 bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode * N) const {
2974   auto Ld = cast<LoadSDNode>(N);
2975 
2976   return Ld->getAlignment() >= 4 &&
2977         (
2978           (
2979             (
2980               Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS       ||
2981               Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT
2982             )
2983             &&
2984             !N->isDivergent()
2985           )
2986           ||
2987           (
2988             Subtarget->getScalarizeGlobalBehavior() &&
2989             Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
2990             Ld->isSimple() &&
2991             !N->isDivergent() &&
2992             static_cast<const SITargetLowering *>(
2993               getTargetLowering())->isMemOpHasNoClobberedMemOperand(N)
2994           )
2995         );
2996 }
2997 
2998 void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
2999   const AMDGPUTargetLowering& Lowering =
3000     *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
3001   bool IsModified = false;
3002   do {
3003     IsModified = false;
3004 
3005     // Go over all selected nodes and try to fold them a bit more
3006     SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_begin();
3007     while (Position != CurDAG->allnodes_end()) {
3008       SDNode *Node = &*Position++;
3009       MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(Node);
3010       if (!MachineNode)
3011         continue;
3012 
3013       SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
3014       if (ResNode != Node) {
3015         if (ResNode)
3016           ReplaceUses(Node, ResNode);
3017         IsModified = true;
3018       }
3019     }
3020     CurDAG->RemoveDeadNodes();
3021   } while (IsModified);
3022 }
3023 
bool R600DAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
  // Cache the R600 subtarget for this function, then run the common
  // SelectionDAG instruction-selection driver.
  Subtarget = &MF.getSubtarget<R600Subtarget>();
  return SelectionDAGISel::runOnMachineFunction(MF);
}
3028 
3029 bool R600DAGToDAGISel::isConstantLoad(const MemSDNode *N, int CbId) const {
3030   if (!N->readMem())
3031     return false;
3032   if (CbId == -1)
3033     return N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
3034            N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
3035 
3036   return N->getAddressSpace() == AMDGPUAS::CONSTANT_BUFFER_0 + CbId;
3037 }
3038 
3039 bool R600DAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr,
3040                                                          SDValue& IntPtr) {
3041   if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Addr)) {
3042     IntPtr = CurDAG->getIntPtrConstant(Cst->getZExtValue() / 4, SDLoc(Addr),
3043                                        true);
3044     return true;
3045   }
3046   return false;
3047 }
3048 
3049 bool R600DAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr,
3050     SDValue& BaseReg, SDValue &Offset) {
3051   if (!isa<ConstantSDNode>(Addr)) {
3052     BaseReg = Addr;
3053     Offset = CurDAG->getIntPtrConstant(0, SDLoc(Addr), true);
3054     return true;
3055   }
3056   return false;
3057 }
3058 
3059 void R600DAGToDAGISel::Select(SDNode *N) {
3060   unsigned int Opc = N->getOpcode();
3061   if (N->isMachineOpcode()) {
3062     N->setNodeId(-1);
3063     return;   // Already selected.
3064   }
3065 
3066   switch (Opc) {
3067   default: break;
3068   case AMDGPUISD::BUILD_VERTICAL_VECTOR:
3069   case ISD::SCALAR_TO_VECTOR:
3070   case ISD::BUILD_VECTOR: {
3071     EVT VT = N->getValueType(0);
3072     unsigned NumVectorElts = VT.getVectorNumElements();
3073     unsigned RegClassID;
3074     // BUILD_VECTOR was lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG
3075     // that adds a 128 bits reg copy when going through TwoAddressInstructions
3076     // pass. We want to avoid 128 bits copies as much as possible because they
3077     // can't be bundled by our scheduler.
3078     switch(NumVectorElts) {
3079     case 2: RegClassID = R600::R600_Reg64RegClassID; break;
3080     case 4:
3081       if (Opc == AMDGPUISD::BUILD_VERTICAL_VECTOR)
3082         RegClassID = R600::R600_Reg128VerticalRegClassID;
3083       else
3084         RegClassID = R600::R600_Reg128RegClassID;
3085       break;
3086     default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
3087     }
3088     SelectBuildVector(N, RegClassID);
3089     return;
3090   }
3091   }
3092 
3093   SelectCode(N);
3094 }
3095 
3096 bool R600DAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
3097                                           SDValue &Offset) {
3098   ConstantSDNode *C;
3099   SDLoc DL(Addr);
3100 
3101   if ((C = dyn_cast<ConstantSDNode>(Addr))) {
3102     Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
3103     Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
3104   } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
3105              (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
3106     Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
3107     Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
3108   } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
3109             (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
3110     Base = Addr.getOperand(0);
3111     Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
3112   } else {
3113     Base = Addr;
3114     Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
3115   }
3116 
3117   return true;
3118 }
3119 
3120 bool R600DAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
3121                                           SDValue &Offset) {
3122   ConstantSDNode *IMMOffset;
3123 
3124   if (Addr.getOpcode() == ISD::ADD
3125       && (IMMOffset = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))
3126       && isInt<16>(IMMOffset->getZExtValue())) {
3127 
3128       Base = Addr.getOperand(0);
3129       Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr),
3130                                          MVT::i32);
3131       return true;
3132   // If the pointer address is constant, we can move it to the offset field.
3133   } else if ((IMMOffset = dyn_cast<ConstantSDNode>(Addr))
3134              && isInt<16>(IMMOffset->getZExtValue())) {
3135     Base = CurDAG->getCopyFromReg(CurDAG->getEntryNode(),
3136                                   SDLoc(CurDAG->getEntryNode()),
3137                                   R600::ZERO, MVT::i32);
3138     Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr),
3139                                        MVT::i32);
3140     return true;
3141   }
3142 
3143   // Default case, no offset
3144   Base = Addr;
3145   Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
3146   return true;
3147 }
3148