1 //===-- AMDGPUISelDAGToDAG.cpp - A dag to dag inst selector for AMDGPU ----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //==-----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Defines an instruction selector for the AMDGPU target.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPU.h"
15 #include "AMDGPUTargetMachine.h"
16 #include "SIMachineFunctionInfo.h"
17 #include "llvm/Analysis/LegacyDivergenceAnalysis.h"
18 #include "llvm/Analysis/ValueTracking.h"
19 #include "llvm/CodeGen/FunctionLoweringInfo.h"
20 #include "llvm/CodeGen/SelectionDAG.h"
21 #include "llvm/CodeGen/SelectionDAGISel.h"
22 #include "llvm/CodeGen/SelectionDAGNodes.h"
23 #include "llvm/IR/IntrinsicsAMDGPU.h"
24 #include "llvm/InitializePasses.h"
25 
26 #ifdef EXPENSIVE_CHECKS
27 #include "llvm/Analysis/LoopInfo.h"
28 #include "llvm/IR/Dominators.h"
29 #endif
30 
31 #define DEBUG_TYPE "isel"
32 
33 using namespace llvm;
34 
35 namespace llvm {
36 
37 class R600InstrInfo;
38 
39 } // end namespace llvm
40 
41 //===----------------------------------------------------------------------===//
42 // Instruction Selector Implementation
43 //===----------------------------------------------------------------------===//
44 
45 namespace {
46 
47 static bool isNullConstantOrUndef(SDValue V) {
48   if (V.isUndef())
49     return true;
50 
51   ConstantSDNode *Const = dyn_cast<ConstantSDNode>(V);
52   return Const != nullptr && Const->isNullValue();
53 }
54 
55 static bool getConstantValue(SDValue N, uint32_t &Out) {
56   // This is only used for packed vectors, where ussing 0 for undef should
57   // always be good.
58   if (N.isUndef()) {
59     Out = 0;
60     return true;
61   }
62 
63   if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) {
64     Out = C->getAPIntValue().getSExtValue();
65     return true;
66   }
67 
68   if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N)) {
69     Out = C->getValueAPF().bitcastToAPInt().getSExtValue();
70     return true;
71   }
72 
73   return false;
74 }
75 
76 // TODO: Handle undef as zero
77 static SDNode *packConstantV2I16(const SDNode *N, SelectionDAG &DAG,
78                                  bool Negate = false) {
79   assert(N->getOpcode() == ISD::BUILD_VECTOR && N->getNumOperands() == 2);
80   uint32_t LHSVal, RHSVal;
81   if (getConstantValue(N->getOperand(0), LHSVal) &&
82       getConstantValue(N->getOperand(1), RHSVal)) {
83     SDLoc SL(N);
84     uint32_t K = Negate ?
85       (-LHSVal & 0xffff) | (-RHSVal << 16) :
86       (LHSVal & 0xffff) | (RHSVal << 16);
87     return DAG.getMachineNode(AMDGPU::S_MOV_B32, SL, N->getValueType(0),
88                               DAG.getTargetConstant(K, SL, MVT::i32));
89   }
90 
91   return nullptr;
92 }
93 
94 static SDNode *packNegConstantV2I16(const SDNode *N, SelectionDAG &DAG) {
95   return packConstantV2I16(N, DAG, true);
96 }
97 
98 /// AMDGPU specific code to select AMDGPU machine instructions for
99 /// SelectionDAG operations.
100 class AMDGPUDAGToDAGISel : public SelectionDAGISel {
101   // Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can
102   // make the right decision when generating code for different targets.
103   const GCNSubtarget *Subtarget;
104 
105   // Default FP mode for the current function.
106   AMDGPU::SIModeRegisterDefaults Mode;
107 
108   bool EnableLateStructurizeCFG;
109 
110 public:
111   explicit AMDGPUDAGToDAGISel(TargetMachine *TM = nullptr,
112                               CodeGenOpt::Level OptLevel = CodeGenOpt::Default)
113     : SelectionDAGISel(*TM, OptLevel) {
114     EnableLateStructurizeCFG = AMDGPUTargetMachine::EnableLateStructurizeCFG;
115   }
116   ~AMDGPUDAGToDAGISel() override = default;
117 
118   void getAnalysisUsage(AnalysisUsage &AU) const override {
119     AU.addRequired<AMDGPUArgumentUsageInfo>();
120     AU.addRequired<LegacyDivergenceAnalysis>();
121 #ifdef EXPENSIVE_CHECKS
122     AU.addRequired<DominatorTreeWrapperPass>();
123     AU.addRequired<LoopInfoWrapperPass>();
124 #endif
125     SelectionDAGISel::getAnalysisUsage(AU);
126   }
127 
128   bool matchLoadD16FromBuildVector(SDNode *N) const;
129 
130   bool runOnMachineFunction(MachineFunction &MF) override;
131   void PreprocessISelDAG() override;
132   void Select(SDNode *N) override;
133   StringRef getPassName() const override;
134   void PostprocessISelDAG() override;
135 
136 protected:
137   void SelectBuildVector(SDNode *N, unsigned RegClassID);
138 
139 private:
140   std::pair<SDValue, SDValue> foldFrameIndex(SDValue N) const;
141   bool isNoNanSrc(SDValue N) const;
142   bool isInlineImmediate(const SDNode *N, bool Negated = false) const;
143   bool isNegInlineImmediate(const SDNode *N) const {
144     return isInlineImmediate(N, true);
145   }
146 
147   bool isInlineImmediate16(int64_t Imm) const {
148     return AMDGPU::isInlinableLiteral16(Imm, Subtarget->hasInv2PiInlineImm());
149   }
150 
151   bool isInlineImmediate32(int64_t Imm) const {
152     return AMDGPU::isInlinableLiteral32(Imm, Subtarget->hasInv2PiInlineImm());
153   }
154 
155   bool isInlineImmediate64(int64_t Imm) const {
156     return AMDGPU::isInlinableLiteral64(Imm, Subtarget->hasInv2PiInlineImm());
157   }
158 
159   bool isInlineImmediate(const APFloat &Imm) const {
160     return Subtarget->getInstrInfo()->isInlineConstant(Imm);
161   }
162 
163   bool isVGPRImm(const SDNode *N) const;
164   bool isUniformLoad(const SDNode *N) const;
165   bool isUniformBr(const SDNode *N) const;
166 
167   bool isBaseWithConstantOffset64(SDValue Addr, SDValue &LHS,
168                                   SDValue &RHS) const;
169 
170   MachineSDNode *buildSMovImm64(SDLoc &DL, uint64_t Val, EVT VT) const;
171 
172   SDNode *glueCopyToOp(SDNode *N, SDValue NewChain, SDValue Glue) const;
173   SDNode *glueCopyToM0(SDNode *N, SDValue Val) const;
174   SDNode *glueCopyToM0LDSInit(SDNode *N) const;
175 
176   const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const;
177   virtual bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
178   virtual bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset);
179   bool isDSOffsetLegal(SDValue Base, unsigned Offset) const;
180   bool isDSOffset2Legal(SDValue Base, unsigned Offset0, unsigned Offset1,
181                         unsigned Size) const;
182   bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
183   bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
184                                  SDValue &Offset1) const;
185   bool SelectDS128Bit8ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
186                                   SDValue &Offset1) const;
187   bool SelectDSReadWrite2(SDValue Ptr, SDValue &Base, SDValue &Offset0,
188                           SDValue &Offset1, unsigned Size) const;
189   bool SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
190                    SDValue &SOffset, SDValue &Offset, SDValue &Offen,
191                    SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC,
192                    SDValue &TFE, SDValue &DLC, SDValue &SWZ,
193                    SDValue &SCCB) const;
194   bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
195                          SDValue &SOffset, SDValue &Offset, SDValue &GLC,
196                          SDValue &SLC, SDValue &TFE, SDValue &DLC,
197                          SDValue &SWZ, SDValue &SCCB) const;
198   bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
199                          SDValue &VAddr, SDValue &SOffset, SDValue &Offset,
200                          SDValue &SLC) const;
201   bool SelectMUBUFScratchOffen(SDNode *Parent,
202                                SDValue Addr, SDValue &RSrc, SDValue &VAddr,
203                                SDValue &SOffset, SDValue &ImmOffset) const;
204   bool SelectMUBUFScratchOffset(SDNode *Parent,
205                                 SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
206                                 SDValue &Offset) const;
207 
208   bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset,
209                          SDValue &Offset, SDValue &GLC, SDValue &SLC,
210                          SDValue &TFE, SDValue &DLC, SDValue &SWZ,
211                          SDValue &SCCB) const;
212   bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
213                          SDValue &Offset, SDValue &SLC) const;
214   bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
215                          SDValue &Offset) const;
216 
217   template <bool IsSigned>
218   bool SelectFlatOffset(SDNode *N, SDValue Addr, SDValue &VAddr,
219                         SDValue &Offset) const;
220   bool SelectGlobalSAddr(SDNode *N, SDValue Addr, SDValue &SAddr,
221                          SDValue &VOffset, SDValue &Offset) const;
222   bool SelectScratchSAddr(SDNode *N, SDValue Addr, SDValue &SAddr,
223                           SDValue &Offset) const;
224 
225   bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset,
226                         bool &Imm) const;
227   SDValue Expand32BitAddress(SDValue Addr) const;
228   bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue &Offset,
229                   bool &Imm) const;
230   bool SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
231   bool SelectSMRDImm32(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
232   bool SelectSMRDSgpr(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
233   bool SelectSMRDBufferImm(SDValue Addr, SDValue &Offset) const;
234   bool SelectSMRDBufferImm32(SDValue Addr, SDValue &Offset) const;
235   bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const;
236 
237   bool SelectVOP3Mods_NNaN(SDValue In, SDValue &Src, SDValue &SrcMods) const;
238   bool SelectVOP3ModsImpl(SDValue In, SDValue &Src, unsigned &SrcMods,
239                           bool AllowAbs = true) const;
240   bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
241   bool SelectVOP3BMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
242   bool SelectVOP3NoMods(SDValue In, SDValue &Src) const;
243   bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods,
244                        SDValue &Clamp, SDValue &Omod) const;
245   bool SelectVOP3BMods0(SDValue In, SDValue &Src, SDValue &SrcMods,
246                         SDValue &Clamp, SDValue &Omod) const;
247   bool SelectVOP3NoMods0(SDValue In, SDValue &Src, SDValue &SrcMods,
248                          SDValue &Clamp, SDValue &Omod) const;
249 
250   bool SelectVOP3OMods(SDValue In, SDValue &Src,
251                        SDValue &Clamp, SDValue &Omod) const;
252 
253   bool SelectVOP3PMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
254 
255   bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const;
256 
257   bool SelectVOP3OpSelMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
258   bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, unsigned &Mods) const;
259   bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
260 
261   SDValue getHi16Elt(SDValue In) const;
262 
263   SDValue getMaterializedScalarImm32(int64_t Val, const SDLoc &DL) const;
264 
265   void SelectADD_SUB_I64(SDNode *N);
266   void SelectAddcSubb(SDNode *N);
267   void SelectUADDO_USUBO(SDNode *N);
268   void SelectDIV_SCALE(SDNode *N);
269   void SelectMAD_64_32(SDNode *N);
270   void SelectFMA_W_CHAIN(SDNode *N);
271   void SelectFMUL_W_CHAIN(SDNode *N);
272 
273   SDNode *getS_BFE(unsigned Opcode, const SDLoc &DL, SDValue Val,
274                    uint32_t Offset, uint32_t Width);
275   void SelectS_BFEFromShifts(SDNode *N);
276   void SelectS_BFE(SDNode *N);
277   bool isCBranchSCC(const SDNode *N) const;
278   void SelectBRCOND(SDNode *N);
279   void SelectFMAD_FMA(SDNode *N);
280   void SelectATOMIC_CMP_SWAP(SDNode *N);
281   void SelectDSAppendConsume(SDNode *N, unsigned IntrID);
282   void SelectDS_GWS(SDNode *N, unsigned IntrID);
283   void SelectInterpP1F16(SDNode *N);
284   void SelectINTRINSIC_W_CHAIN(SDNode *N);
285   void SelectINTRINSIC_WO_CHAIN(SDNode *N);
286   void SelectINTRINSIC_VOID(SDNode *N);
287 
288 protected:
289   // Include the pieces autogenerated from the target description.
290 #include "AMDGPUGenDAGISel.inc"
291 };
292 
293 class R600DAGToDAGISel : public AMDGPUDAGToDAGISel {
294   const R600Subtarget *Subtarget;
295 
296   bool isConstantLoad(const MemSDNode *N, int cbID) const;
297   bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr);
298   bool SelectGlobalValueVariableOffset(SDValue Addr, SDValue &BaseReg,
299                                        SDValue& Offset);
300 public:
301   explicit R600DAGToDAGISel(TargetMachine *TM, CodeGenOpt::Level OptLevel) :
302       AMDGPUDAGToDAGISel(TM, OptLevel) {}
303 
304   void Select(SDNode *N) override;
305 
306   bool SelectADDRIndirect(SDValue Addr, SDValue &Base,
307                           SDValue &Offset) override;
308   bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
309                           SDValue &Offset) override;
310 
311   bool runOnMachineFunction(MachineFunction &MF) override;
312 
313   void PreprocessISelDAG() override {}
314 
315 protected:
316   // Include the pieces autogenerated from the target description.
317 #include "R600GenDAGISel.inc"
318 };
319 
320 static SDValue stripBitcast(SDValue Val) {
321   return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
322 }
323 
324 // Figure out if this is really an extract of the high 16-bits of a dword.
325 static bool isExtractHiElt(SDValue In, SDValue &Out) {
326   In = stripBitcast(In);
327 
328   if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
329     if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
330       if (!Idx->isOne())
331         return false;
332       Out = In.getOperand(0);
333       return true;
334     }
335   }
336 
337   if (In.getOpcode() != ISD::TRUNCATE)
338     return false;
339 
340   SDValue Srl = In.getOperand(0);
341   if (Srl.getOpcode() == ISD::SRL) {
342     if (ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
343       if (ShiftAmt->getZExtValue() == 16) {
344         Out = stripBitcast(Srl.getOperand(0));
345         return true;
346       }
347     }
348   }
349 
350   return false;
351 }
352 
353 // Look through operations that obscure just looking at the low 16-bits of the
354 // same register.
355 static SDValue stripExtractLoElt(SDValue In) {
356   if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
357     if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
358       if (Idx->isNullValue() && In.getValueSizeInBits() <= 32)
359         return In.getOperand(0);
360     }
361   }
362 
363   if (In.getOpcode() == ISD::TRUNCATE) {
364     SDValue Src = In.getOperand(0);
365     if (Src.getValueType().getSizeInBits() == 32)
366       return stripBitcast(Src);
367   }
368 
369   return In;
370 }
371 
372 }  // end anonymous namespace
373 
374 INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISel, "amdgpu-isel",
375                       "AMDGPU DAG->DAG Pattern Instruction Selection", false, false)
376 INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo)
377 INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysis)
378 INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
379 #ifdef EXPENSIVE_CHECKS
380 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
381 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
382 #endif
383 INITIALIZE_PASS_END(AMDGPUDAGToDAGISel, "amdgpu-isel",
384                     "AMDGPU DAG->DAG Pattern Instruction Selection", false, false)
385 
386 /// This pass converts a legalized DAG into a AMDGPU-specific
387 // DAG, ready for instruction scheduling.
388 FunctionPass *llvm::createAMDGPUISelDag(TargetMachine *TM,
389                                         CodeGenOpt::Level OptLevel) {
390   return new AMDGPUDAGToDAGISel(TM, OptLevel);
391 }
392 
393 /// This pass converts a legalized DAG into a R600-specific
394 // DAG, ready for instruction scheduling.
395 FunctionPass *llvm::createR600ISelDag(TargetMachine *TM,
396                                       CodeGenOpt::Level OptLevel) {
397   return new R600DAGToDAGISel(TM, OptLevel);
398 }
399 
400 bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
401 #ifdef EXPENSIVE_CHECKS
402   DominatorTree & DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
403   LoopInfo * LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
404   for (auto &L : LI->getLoopsInPreorder()) {
405     assert(L->isLCSSAForm(DT));
406   }
407 #endif
408   Subtarget = &MF.getSubtarget<GCNSubtarget>();
409   Mode = AMDGPU::SIModeRegisterDefaults(MF.getFunction());
410   return SelectionDAGISel::runOnMachineFunction(MF);
411 }
412 
413 bool AMDGPUDAGToDAGISel::matchLoadD16FromBuildVector(SDNode *N) const {
414   assert(Subtarget->d16PreservesUnusedBits());
415   MVT VT = N->getValueType(0).getSimpleVT();
416   if (VT != MVT::v2i16 && VT != MVT::v2f16)
417     return false;
418 
419   SDValue Lo = N->getOperand(0);
420   SDValue Hi = N->getOperand(1);
421 
422   LoadSDNode *LdHi = dyn_cast<LoadSDNode>(stripBitcast(Hi));
423 
424   // build_vector lo, (load ptr) -> load_d16_hi ptr, lo
425   // build_vector lo, (zextload ptr from i8) -> load_d16_hi_u8 ptr, lo
426   // build_vector lo, (sextload ptr from i8) -> load_d16_hi_i8 ptr, lo
427 
428   // Need to check for possible indirect dependencies on the other half of the
429   // vector to avoid introducing a cycle.
430   if (LdHi && Hi.hasOneUse() && !LdHi->isPredecessorOf(Lo.getNode())) {
431     SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
432 
433     SDValue TiedIn = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Lo);
434     SDValue Ops[] = {
435       LdHi->getChain(), LdHi->getBasePtr(), TiedIn
436     };
437 
438     unsigned LoadOp = AMDGPUISD::LOAD_D16_HI;
439     if (LdHi->getMemoryVT() == MVT::i8) {
440       LoadOp = LdHi->getExtensionType() == ISD::SEXTLOAD ?
441         AMDGPUISD::LOAD_D16_HI_I8 : AMDGPUISD::LOAD_D16_HI_U8;
442     } else {
443       assert(LdHi->getMemoryVT() == MVT::i16);
444     }
445 
446     SDValue NewLoadHi =
447       CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdHi), VTList,
448                                   Ops, LdHi->getMemoryVT(),
449                                   LdHi->getMemOperand());
450 
451     CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadHi);
452     CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdHi, 1), NewLoadHi.getValue(1));
453     return true;
454   }
455 
456   // build_vector (load ptr), hi -> load_d16_lo ptr, hi
457   // build_vector (zextload ptr from i8), hi -> load_d16_lo_u8 ptr, hi
458   // build_vector (sextload ptr from i8), hi -> load_d16_lo_i8 ptr, hi
459   LoadSDNode *LdLo = dyn_cast<LoadSDNode>(stripBitcast(Lo));
460   if (LdLo && Lo.hasOneUse()) {
461     SDValue TiedIn = getHi16Elt(Hi);
462     if (!TiedIn || LdLo->isPredecessorOf(TiedIn.getNode()))
463       return false;
464 
465     SDVTList VTList = CurDAG->getVTList(VT, MVT::Other);
466     unsigned LoadOp = AMDGPUISD::LOAD_D16_LO;
467     if (LdLo->getMemoryVT() == MVT::i8) {
468       LoadOp = LdLo->getExtensionType() == ISD::SEXTLOAD ?
469         AMDGPUISD::LOAD_D16_LO_I8 : AMDGPUISD::LOAD_D16_LO_U8;
470     } else {
471       assert(LdLo->getMemoryVT() == MVT::i16);
472     }
473 
474     TiedIn = CurDAG->getNode(ISD::BITCAST, SDLoc(N), VT, TiedIn);
475 
476     SDValue Ops[] = {
477       LdLo->getChain(), LdLo->getBasePtr(), TiedIn
478     };
479 
480     SDValue NewLoadLo =
481       CurDAG->getMemIntrinsicNode(LoadOp, SDLoc(LdLo), VTList,
482                                   Ops, LdLo->getMemoryVT(),
483                                   LdLo->getMemOperand());
484 
485     CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), NewLoadLo);
486     CurDAG->ReplaceAllUsesOfValueWith(SDValue(LdLo, 1), NewLoadLo.getValue(1));
487     return true;
488   }
489 
490   return false;
491 }
492 
493 void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
494   if (!Subtarget->d16PreservesUnusedBits())
495     return;
496 
497   SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
498 
499   bool MadeChange = false;
500   while (Position != CurDAG->allnodes_begin()) {
501     SDNode *N = &*--Position;
502     if (N->use_empty())
503       continue;
504 
505     switch (N->getOpcode()) {
506     case ISD::BUILD_VECTOR:
507       MadeChange |= matchLoadD16FromBuildVector(N);
508       break;
509     default:
510       break;
511     }
512   }
513 
514   if (MadeChange) {
515     CurDAG->RemoveDeadNodes();
516     LLVM_DEBUG(dbgs() << "After PreProcess:\n";
517                CurDAG->dump(););
518   }
519 }
520 
521 bool AMDGPUDAGToDAGISel::isNoNanSrc(SDValue N) const {
522   if (TM.Options.NoNaNsFPMath)
523     return true;
524 
525   // TODO: Move into isKnownNeverNaN
526   if (N->getFlags().hasNoNaNs())
527     return true;
528 
529   return CurDAG->isKnownNeverNaN(N);
530 }
531 
532 bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N,
533                                            bool Negated) const {
534   if (N->isUndef())
535     return true;
536 
537   const SIInstrInfo *TII = Subtarget->getInstrInfo();
538   if (Negated) {
539     if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
540       return TII->isInlineConstant(-C->getAPIntValue());
541 
542     if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
543       return TII->isInlineConstant(-C->getValueAPF().bitcastToAPInt());
544 
545   } else {
546     if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N))
547       return TII->isInlineConstant(C->getAPIntValue());
548 
549     if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N))
550       return TII->isInlineConstant(C->getValueAPF().bitcastToAPInt());
551   }
552 
553   return false;
554 }
555 
556 /// Determine the register class for \p OpNo
557 /// \returns The register class of the virtual register that will be used for
558 /// the given operand number \OpNo or NULL if the register class cannot be
559 /// determined.
560 const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
561                                                           unsigned OpNo) const {
562   if (!N->isMachineOpcode()) {
563     if (N->getOpcode() == ISD::CopyToReg) {
564       Register Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
565       if (Reg.isVirtual()) {
566         MachineRegisterInfo &MRI = CurDAG->getMachineFunction().getRegInfo();
567         return MRI.getRegClass(Reg);
568       }
569 
570       const SIRegisterInfo *TRI
571         = static_cast<const GCNSubtarget *>(Subtarget)->getRegisterInfo();
572       return TRI->getPhysRegClass(Reg);
573     }
574 
575     return nullptr;
576   }
577 
578   switch (N->getMachineOpcode()) {
579   default: {
580     const MCInstrDesc &Desc =
581         Subtarget->getInstrInfo()->get(N->getMachineOpcode());
582     unsigned OpIdx = Desc.getNumDefs() + OpNo;
583     if (OpIdx >= Desc.getNumOperands())
584       return nullptr;
585     int RegClass = Desc.OpInfo[OpIdx].RegClass;
586     if (RegClass == -1)
587       return nullptr;
588 
589     return Subtarget->getRegisterInfo()->getRegClass(RegClass);
590   }
591   case AMDGPU::REG_SEQUENCE: {
592     unsigned RCID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
593     const TargetRegisterClass *SuperRC =
594         Subtarget->getRegisterInfo()->getRegClass(RCID);
595 
596     SDValue SubRegOp = N->getOperand(OpNo + 1);
597     unsigned SubRegIdx = cast<ConstantSDNode>(SubRegOp)->getZExtValue();
598     return Subtarget->getRegisterInfo()->getSubClassWithSubReg(SuperRC,
599                                                               SubRegIdx);
600   }
601   }
602 }
603 
604 SDNode *AMDGPUDAGToDAGISel::glueCopyToOp(SDNode *N, SDValue NewChain,
605                                          SDValue Glue) const {
606   SmallVector <SDValue, 8> Ops;
607   Ops.push_back(NewChain); // Replace the chain.
608   for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
609     Ops.push_back(N->getOperand(i));
610 
611   Ops.push_back(Glue);
612   return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
613 }
614 
615 SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
616   const SITargetLowering& Lowering =
617     *static_cast<const SITargetLowering*>(getTargetLowering());
618 
619   assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain");
620 
621   SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N), Val);
622   return glueCopyToOp(N, M0, M0.getValue(1));
623 }
624 
625 SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
626   unsigned AS = cast<MemSDNode>(N)->getAddressSpace();
627   if (AS == AMDGPUAS::LOCAL_ADDRESS) {
628     if (Subtarget->ldsRequiresM0Init())
629       return glueCopyToM0(N, CurDAG->getTargetConstant(-1, SDLoc(N), MVT::i32));
630   } else if (AS == AMDGPUAS::REGION_ADDRESS) {
631     MachineFunction &MF = CurDAG->getMachineFunction();
632     unsigned Value = MF.getInfo<SIMachineFunctionInfo>()->getGDSSize();
633     return
634         glueCopyToM0(N, CurDAG->getTargetConstant(Value, SDLoc(N), MVT::i32));
635   }
636   return N;
637 }
638 
639 MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
640                                                   EVT VT) const {
641   SDNode *Lo = CurDAG->getMachineNode(
642       AMDGPU::S_MOV_B32, DL, MVT::i32,
643       CurDAG->getTargetConstant(Imm & 0xFFFFFFFF, DL, MVT::i32));
644   SDNode *Hi =
645       CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32,
646                              CurDAG->getTargetConstant(Imm >> 32, DL, MVT::i32));
647   const SDValue Ops[] = {
648       CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
649       SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
650       SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
651 
652   return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops);
653 }
654 
655 void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
656   EVT VT = N->getValueType(0);
657   unsigned NumVectorElts = VT.getVectorNumElements();
658   EVT EltVT = VT.getVectorElementType();
659   SDLoc DL(N);
660   SDValue RegClass = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
661 
662   if (NumVectorElts == 1) {
663     CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS, EltVT, N->getOperand(0),
664                          RegClass);
665     return;
666   }
667 
668   assert(NumVectorElts <= 32 && "Vectors with more than 32 elements not "
669                                   "supported yet");
670   // 32 = Max Num Vector Elements
671   // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
672   // 1 = Vector Register Class
673   SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);
674 
675   bool IsGCN = CurDAG->getSubtarget().getTargetTriple().getArch() ==
676                Triple::amdgcn;
677   RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
678   bool IsRegSeq = true;
679   unsigned NOps = N->getNumOperands();
680   for (unsigned i = 0; i < NOps; i++) {
681     // XXX: Why is this here?
682     if (isa<RegisterSDNode>(N->getOperand(i))) {
683       IsRegSeq = false;
684       break;
685     }
686     unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
687                          : R600RegisterInfo::getSubRegFromChannel(i);
688     RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
689     RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32);
690   }
691   if (NOps != NumVectorElts) {
692     // Fill in the missing undef elements if this was a scalar_to_vector.
693     assert(N->getOpcode() == ISD::SCALAR_TO_VECTOR && NOps < NumVectorElts);
694     MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
695                                                    DL, EltVT);
696     for (unsigned i = NOps; i < NumVectorElts; ++i) {
697       unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
698                            : R600RegisterInfo::getSubRegFromChannel(i);
699       RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
700       RegSeqArgs[1 + (2 * i) + 1] =
701           CurDAG->getTargetConstant(Sub, DL, MVT::i32);
702     }
703   }
704 
705   if (!IsRegSeq)
706     SelectCode(N);
707   CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs);
708 }
709 
710 void AMDGPUDAGToDAGISel::Select(SDNode *N) {
711   unsigned int Opc = N->getOpcode();
712   if (N->isMachineOpcode()) {
713     N->setNodeId(-1);
714     return;   // Already selected.
715   }
716 
717   // isa<MemSDNode> almost works but is slightly too permissive for some DS
718   // intrinsics.
719   if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N) ||
720       (Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC ||
721        Opc == ISD::ATOMIC_LOAD_FADD ||
722        Opc == AMDGPUISD::ATOMIC_LOAD_FMIN ||
723        Opc == AMDGPUISD::ATOMIC_LOAD_FMAX)) {
724     N = glueCopyToM0LDSInit(N);
725     SelectCode(N);
726     return;
727   }
728 
729   switch (Opc) {
730   default:
731     break;
732   // We are selecting i64 ADD here instead of custom lower it during
733   // DAG legalization, so we can fold some i64 ADDs used for address
734   // calculation into the LOAD and STORE instructions.
735   case ISD::ADDC:
736   case ISD::ADDE:
737   case ISD::SUBC:
738   case ISD::SUBE: {
739     if (N->getValueType(0) != MVT::i64)
740       break;
741 
742     SelectADD_SUB_I64(N);
743     return;
744   }
745   case ISD::ADDCARRY:
746   case ISD::SUBCARRY:
747     if (N->getValueType(0) != MVT::i32)
748       break;
749 
750     SelectAddcSubb(N);
751     return;
752   case ISD::UADDO:
753   case ISD::USUBO: {
754     SelectUADDO_USUBO(N);
755     return;
756   }
757   case AMDGPUISD::FMUL_W_CHAIN: {
758     SelectFMUL_W_CHAIN(N);
759     return;
760   }
761   case AMDGPUISD::FMA_W_CHAIN: {
762     SelectFMA_W_CHAIN(N);
763     return;
764   }
765 
766   case ISD::SCALAR_TO_VECTOR:
767   case ISD::BUILD_VECTOR: {
768     EVT VT = N->getValueType(0);
769     unsigned NumVectorElts = VT.getVectorNumElements();
770     if (VT.getScalarSizeInBits() == 16) {
771       if (Opc == ISD::BUILD_VECTOR && NumVectorElts == 2) {
772         if (SDNode *Packed = packConstantV2I16(N, *CurDAG)) {
773           ReplaceNode(N, Packed);
774           return;
775         }
776       }
777 
778       break;
779     }
780 
781     assert(VT.getVectorElementType().bitsEq(MVT::i32));
782     unsigned RegClassID =
783         SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32)->getID();
784     SelectBuildVector(N, RegClassID);
785     return;
786   }
787   case ISD::BUILD_PAIR: {
788     SDValue RC, SubReg0, SubReg1;
789     SDLoc DL(N);
790     if (N->getValueType(0) == MVT::i128) {
791       RC = CurDAG->getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32);
792       SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32);
793       SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32);
794     } else if (N->getValueType(0) == MVT::i64) {
795       RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32);
796       SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
797       SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
798     } else {
799       llvm_unreachable("Unhandled value type for BUILD_PAIR");
800     }
801     const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
802                             N->getOperand(1), SubReg1 };
803     ReplaceNode(N, CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
804                                           N->getValueType(0), Ops));
805     return;
806   }
807 
808   case ISD::Constant:
809   case ISD::ConstantFP: {
810     if (N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N))
811       break;
812 
813     uint64_t Imm;
814     if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N))
815       Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
816     else {
817       ConstantSDNode *C = cast<ConstantSDNode>(N);
818       Imm = C->getZExtValue();
819     }
820 
821     SDLoc DL(N);
822     ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));
823     return;
824   }
825   case AMDGPUISD::BFE_I32:
826   case AMDGPUISD::BFE_U32: {
827     // There is a scalar version available, but unlike the vector version which
828     // has a separate operand for the offset and width, the scalar version packs
829     // the width and offset into a single operand. Try to move to the scalar
830     // version if the offsets are constant, so that we can try to keep extended
831     // loads of kernel arguments in SGPRs.
832 
833     // TODO: Technically we could try to pattern match scalar bitshifts of
834     // dynamic values, but it's probably not useful.
835     ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
836     if (!Offset)
837       break;
838 
839     ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
840     if (!Width)
841       break;
842 
843     bool Signed = Opc == AMDGPUISD::BFE_I32;
844 
845     uint32_t OffsetVal = Offset->getZExtValue();
846     uint32_t WidthVal = Width->getZExtValue();
847 
848     ReplaceNode(N, getS_BFE(Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32,
849                             SDLoc(N), N->getOperand(0), OffsetVal, WidthVal));
850     return;
851   }
852   case AMDGPUISD::DIV_SCALE: {
853     SelectDIV_SCALE(N);
854     return;
855   }
856   case AMDGPUISD::MAD_I64_I32:
857   case AMDGPUISD::MAD_U64_U32: {
858     SelectMAD_64_32(N);
859     return;
860   }
861   case ISD::CopyToReg: {
862     const SITargetLowering& Lowering =
863       *static_cast<const SITargetLowering*>(getTargetLowering());
864     N = Lowering.legalizeTargetIndependentNode(N, *CurDAG);
865     break;
866   }
867   case ISD::AND:
868   case ISD::SRL:
869   case ISD::SRA:
870   case ISD::SIGN_EXTEND_INREG:
871     if (N->getValueType(0) != MVT::i32)
872       break;
873 
874     SelectS_BFE(N);
875     return;
876   case ISD::BRCOND:
877     SelectBRCOND(N);
878     return;
879   case ISD::FMAD:
880   case ISD::FMA:
881     SelectFMAD_FMA(N);
882     return;
883   case AMDGPUISD::ATOMIC_CMP_SWAP:
884     SelectATOMIC_CMP_SWAP(N);
885     return;
886   case AMDGPUISD::CVT_PKRTZ_F16_F32:
887   case AMDGPUISD::CVT_PKNORM_I16_F32:
888   case AMDGPUISD::CVT_PKNORM_U16_F32:
889   case AMDGPUISD::CVT_PK_U16_U32:
890   case AMDGPUISD::CVT_PK_I16_I32: {
891     // Hack around using a legal type if f16 is illegal.
892     if (N->getValueType(0) == MVT::i32) {
893       MVT NewVT = Opc == AMDGPUISD::CVT_PKRTZ_F16_F32 ? MVT::v2f16 : MVT::v2i16;
894       N = CurDAG->MorphNodeTo(N, N->getOpcode(), CurDAG->getVTList(NewVT),
895                               { N->getOperand(0), N->getOperand(1) });
896       SelectCode(N);
897       return;
898     }
899 
900     break;
901   }
902   case ISD::INTRINSIC_W_CHAIN: {
903     SelectINTRINSIC_W_CHAIN(N);
904     return;
905   }
906   case ISD::INTRINSIC_WO_CHAIN: {
907     SelectINTRINSIC_WO_CHAIN(N);
908     return;
909   }
910   case ISD::INTRINSIC_VOID: {
911     SelectINTRINSIC_VOID(N);
912     return;
913   }
914   }
915 
916   SelectCode(N);
917 }
918 
919 bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
920   const BasicBlock *BB = FuncInfo->MBB->getBasicBlock();
921   const Instruction *Term = BB->getTerminator();
922   return Term->getMetadata("amdgpu.uniform") ||
923          Term->getMetadata("structurizecfg.uniform");
924 }
925 
926 static bool getBaseWithOffsetUsingSplitOR(SelectionDAG &DAG, SDValue Addr,
927                                           SDValue &N0, SDValue &N1) {
928   if (Addr.getValueType() == MVT::i64 && Addr.getOpcode() == ISD::BITCAST &&
929       Addr.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
930     // As we split 64-bit `or` earlier, it's complicated pattern to match, i.e.
931     // (i64 (bitcast (v2i32 (build_vector
932     //                        (or (extract_vector_elt V, 0), OFFSET),
933     //                        (extract_vector_elt V, 1)))))
934     SDValue Lo = Addr.getOperand(0).getOperand(0);
935     if (Lo.getOpcode() == ISD::OR && DAG.isBaseWithConstantOffset(Lo)) {
936       SDValue BaseLo = Lo.getOperand(0);
937       SDValue BaseHi = Addr.getOperand(0).getOperand(1);
938       // Check that split base (Lo and Hi) are extracted from the same one.
939       if (BaseLo.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
940           BaseHi.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
941           BaseLo.getOperand(0) == BaseHi.getOperand(0) &&
942           // Lo is statically extracted from index 0.
943           isa<ConstantSDNode>(BaseLo.getOperand(1)) &&
944           BaseLo.getConstantOperandVal(1) == 0 &&
945           // Hi is statically extracted from index 0.
946           isa<ConstantSDNode>(BaseHi.getOperand(1)) &&
947           BaseHi.getConstantOperandVal(1) == 1) {
948         N0 = BaseLo.getOperand(0).getOperand(0);
949         N1 = Lo.getOperand(1);
950         return true;
951       }
952     }
953   }
954   return false;
955 }
956 
957 bool AMDGPUDAGToDAGISel::isBaseWithConstantOffset64(SDValue Addr, SDValue &LHS,
958                                                     SDValue &RHS) const {
959   if (CurDAG->isBaseWithConstantOffset(Addr)) {
960     LHS = Addr.getOperand(0);
961     RHS = Addr.getOperand(1);
962     return true;
963   }
964 
965   if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, LHS, RHS)) {
966     assert(LHS && RHS && isa<ConstantSDNode>(RHS));
967     return true;
968   }
969 
970   return false;
971 }
972 
973 StringRef AMDGPUDAGToDAGISel::getPassName() const {
974   return "AMDGPU DAG->DAG Pattern Instruction Selection";
975 }
976 
977 //===----------------------------------------------------------------------===//
978 // Complex Patterns
979 //===----------------------------------------------------------------------===//
980 
981 bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
982                                             SDValue &Offset) {
983   return false;
984 }
985 
986 bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
987                                             SDValue &Offset) {
988   ConstantSDNode *C;
989   SDLoc DL(Addr);
990 
991   if ((C = dyn_cast<ConstantSDNode>(Addr))) {
992     Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
993     Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
994   } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
995              (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
996     Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
997     Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
998   } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
999             (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
1000     Base = Addr.getOperand(0);
1001     Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
1002   } else {
1003     Base = Addr;
1004     Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1005   }
1006 
1007   return true;
1008 }
1009 
1010 SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm32(int64_t Val,
1011                                                        const SDLoc &DL) const {
1012   SDNode *Mov = CurDAG->getMachineNode(
1013     AMDGPU::S_MOV_B32, DL, MVT::i32,
1014     CurDAG->getTargetConstant(Val, DL, MVT::i32));
1015   return SDValue(Mov, 0);
1016 }
1017 
1018 // FIXME: Should only handle addcarry/subcarry
1019 void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
1020   SDLoc DL(N);
1021   SDValue LHS = N->getOperand(0);
1022   SDValue RHS = N->getOperand(1);
1023 
1024   unsigned Opcode = N->getOpcode();
1025   bool ConsumeCarry = (Opcode == ISD::ADDE || Opcode == ISD::SUBE);
1026   bool ProduceCarry =
1027       ConsumeCarry || Opcode == ISD::ADDC || Opcode == ISD::SUBC;
1028   bool IsAdd = Opcode == ISD::ADD || Opcode == ISD::ADDC || Opcode == ISD::ADDE;
1029 
1030   SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
1031   SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
1032 
1033   SDNode *Lo0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1034                                        DL, MVT::i32, LHS, Sub0);
1035   SDNode *Hi0 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1036                                        DL, MVT::i32, LHS, Sub1);
1037 
1038   SDNode *Lo1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1039                                        DL, MVT::i32, RHS, Sub0);
1040   SDNode *Hi1 = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1041                                        DL, MVT::i32, RHS, Sub1);
1042 
1043   SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);
1044 
1045   static const unsigned OpcMap[2][2][2] = {
1046       {{AMDGPU::S_SUB_U32, AMDGPU::S_ADD_U32},
1047        {AMDGPU::V_SUB_CO_U32_e32, AMDGPU::V_ADD_CO_U32_e32}},
1048       {{AMDGPU::S_SUBB_U32, AMDGPU::S_ADDC_U32},
1049        {AMDGPU::V_SUBB_U32_e32, AMDGPU::V_ADDC_U32_e32}}};
1050 
1051   unsigned Opc = OpcMap[0][N->isDivergent()][IsAdd];
1052   unsigned CarryOpc = OpcMap[1][N->isDivergent()][IsAdd];
1053 
1054   SDNode *AddLo;
1055   if (!ConsumeCarry) {
1056     SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0) };
1057     AddLo = CurDAG->getMachineNode(Opc, DL, VTList, Args);
1058   } else {
1059     SDValue Args[] = { SDValue(Lo0, 0), SDValue(Lo1, 0), N->getOperand(2) };
1060     AddLo = CurDAG->getMachineNode(CarryOpc, DL, VTList, Args);
1061   }
1062   SDValue AddHiArgs[] = {
1063     SDValue(Hi0, 0),
1064     SDValue(Hi1, 0),
1065     SDValue(AddLo, 1)
1066   };
1067   SDNode *AddHi = CurDAG->getMachineNode(CarryOpc, DL, VTList, AddHiArgs);
1068 
1069   SDValue RegSequenceArgs[] = {
1070     CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
1071     SDValue(AddLo,0),
1072     Sub0,
1073     SDValue(AddHi,0),
1074     Sub1,
1075   };
1076   SDNode *RegSequence = CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
1077                                                MVT::i64, RegSequenceArgs);
1078 
1079   if (ProduceCarry) {
1080     // Replace the carry-use
1081     ReplaceUses(SDValue(N, 1), SDValue(AddHi, 1));
1082   }
1083 
1084   // Replace the remaining uses.
1085   ReplaceNode(N, RegSequence);
1086 }
1087 
1088 void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) {
1089   SDLoc DL(N);
1090   SDValue LHS = N->getOperand(0);
1091   SDValue RHS = N->getOperand(1);
1092   SDValue CI = N->getOperand(2);
1093 
1094   if (N->isDivergent()) {
1095     unsigned Opc = N->getOpcode() == ISD::ADDCARRY ? AMDGPU::V_ADDC_U32_e64
1096                                                    : AMDGPU::V_SUBB_U32_e64;
1097     CurDAG->SelectNodeTo(
1098         N, Opc, N->getVTList(),
1099         {LHS, RHS, CI,
1100          CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
1101   } else {
1102     unsigned Opc = N->getOpcode() == ISD::ADDCARRY ? AMDGPU::S_ADD_CO_PSEUDO
1103                                                    : AMDGPU::S_SUB_CO_PSEUDO;
1104     CurDAG->SelectNodeTo(N, Opc, N->getVTList(), {LHS, RHS, CI});
1105   }
1106 }
1107 
1108 void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
1109   // The name of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned
1110   // carry out despite the _i32 name. These were renamed in VI to _U32.
1111   // FIXME: We should probably rename the opcodes here.
1112   bool IsAdd = N->getOpcode() == ISD::UADDO;
1113   bool IsVALU = N->isDivergent();
1114 
1115   for (SDNode::use_iterator UI = N->use_begin(), E = N->use_end(); UI != E;
1116        ++UI)
1117     if (UI.getUse().getResNo() == 1) {
1118       if ((IsAdd && (UI->getOpcode() != ISD::ADDCARRY)) ||
1119           (!IsAdd && (UI->getOpcode() != ISD::SUBCARRY))) {
1120         IsVALU = true;
1121         break;
1122       }
1123     }
1124 
1125   if (IsVALU) {
1126     unsigned Opc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
1127 
1128     CurDAG->SelectNodeTo(
1129         N, Opc, N->getVTList(),
1130         {N->getOperand(0), N->getOperand(1),
1131          CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
1132   } else {
1133     unsigned Opc = N->getOpcode() == ISD::UADDO ? AMDGPU::S_UADDO_PSEUDO
1134                                                 : AMDGPU::S_USUBO_PSEUDO;
1135 
1136     CurDAG->SelectNodeTo(N, Opc, N->getVTList(),
1137                          {N->getOperand(0), N->getOperand(1)});
1138   }
1139 }
1140 
1141 void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
1142   SDLoc SL(N);
1143   //  src0_modifiers, src0,  src1_modifiers, src1, src2_modifiers, src2, clamp, omod
1144   SDValue Ops[10];
1145 
1146   SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]);
1147   SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
1148   SelectVOP3Mods(N->getOperand(3), Ops[5], Ops[4]);
1149   Ops[8] = N->getOperand(0);
1150   Ops[9] = N->getOperand(4);
1151 
1152   CurDAG->SelectNodeTo(N, AMDGPU::V_FMA_F32_e64, N->getVTList(), Ops);
1153 }
1154 
1155 void AMDGPUDAGToDAGISel::SelectFMUL_W_CHAIN(SDNode *N) {
1156   SDLoc SL(N);
1157   //    src0_modifiers, src0,  src1_modifiers, src1, clamp, omod
1158   SDValue Ops[8];
1159 
1160   SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[4], Ops[5]);
1161   SelectVOP3Mods(N->getOperand(2), Ops[3], Ops[2]);
1162   Ops[6] = N->getOperand(0);
1163   Ops[7] = N->getOperand(3);
1164 
1165   CurDAG->SelectNodeTo(N, AMDGPU::V_MUL_F32_e64, N->getVTList(), Ops);
1166 }
1167 
1168 // We need to handle this here because tablegen doesn't support matching
1169 // instructions with multiple outputs.
1170 void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
1171   SDLoc SL(N);
1172   EVT VT = N->getValueType(0);
1173 
1174   assert(VT == MVT::f32 || VT == MVT::f64);
1175 
1176   unsigned Opc
1177     = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64_e64 : AMDGPU::V_DIV_SCALE_F32_e64;
1178 
1179   // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
1180   // omod
1181   SDValue Ops[8];
1182   SelectVOP3BMods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
1183   SelectVOP3BMods(N->getOperand(1), Ops[3], Ops[2]);
1184   SelectVOP3BMods(N->getOperand(2), Ops[5], Ops[4]);
1185   CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
1186 }
1187 
1188 // We need to handle this here because tablegen doesn't support matching
1189 // instructions with multiple outputs.
1190 void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
1191   SDLoc SL(N);
1192   bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32;
1193   unsigned Opc = Signed ? AMDGPU::V_MAD_I64_I32_e64 : AMDGPU::V_MAD_U64_U32_e64;
1194 
1195   SDValue Clamp = CurDAG->getTargetConstant(0, SL, MVT::i1);
1196   SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
1197                     Clamp };
1198   CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
1199 }
1200 
1201 bool AMDGPUDAGToDAGISel::isDSOffsetLegal(SDValue Base, unsigned Offset) const {
1202   if (!isUInt<16>(Offset))
1203     return false;
1204 
1205   if (!Base || Subtarget->hasUsableDSOffset() ||
1206       Subtarget->unsafeDSOffsetFoldingEnabled())
1207     return true;
1208 
1209   // On Southern Islands instruction with a negative base value and an offset
1210   // don't seem to work.
1211   return CurDAG->SignBitIsZero(Base);
1212 }
1213 
1214 bool AMDGPUDAGToDAGISel::SelectDS1Addr1Offset(SDValue Addr, SDValue &Base,
1215                                               SDValue &Offset) const {
1216   SDLoc DL(Addr);
1217   if (CurDAG->isBaseWithConstantOffset(Addr)) {
1218     SDValue N0 = Addr.getOperand(0);
1219     SDValue N1 = Addr.getOperand(1);
1220     ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
1221     if (isDSOffsetLegal(N0, C1->getSExtValue())) {
1222       // (add n0, c0)
1223       Base = N0;
1224       Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
1225       return true;
1226     }
1227   } else if (Addr.getOpcode() == ISD::SUB) {
1228     // sub C, x -> add (sub 0, x), C
1229     if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
1230       int64_t ByteOffset = C->getSExtValue();
1231       if (isDSOffsetLegal(SDValue(), ByteOffset)) {
1232         SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1233 
1234         // XXX - This is kind of hacky. Create a dummy sub node so we can check
1235         // the known bits in isDSOffsetLegal. We need to emit the selected node
1236         // here, so this is thrown away.
1237         SDValue Sub = CurDAG->getNode(ISD::SUB, DL, MVT::i32,
1238                                       Zero, Addr.getOperand(1));
1239 
1240         if (isDSOffsetLegal(Sub, ByteOffset)) {
1241           SmallVector<SDValue, 3> Opnds;
1242           Opnds.push_back(Zero);
1243           Opnds.push_back(Addr.getOperand(1));
1244 
1245           // FIXME: Select to VOP3 version for with-carry.
1246           unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
1247           if (Subtarget->hasAddNoCarry()) {
1248             SubOp = AMDGPU::V_SUB_U32_e64;
1249             Opnds.push_back(
1250                 CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
1251           }
1252 
1253           MachineSDNode *MachineSub =
1254               CurDAG->getMachineNode(SubOp, DL, MVT::i32, Opnds);
1255 
1256           Base = SDValue(MachineSub, 0);
1257           Offset = CurDAG->getTargetConstant(ByteOffset, DL, MVT::i16);
1258           return true;
1259         }
1260       }
1261     }
1262   } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1263     // If we have a constant address, prefer to put the constant into the
1264     // offset. This can save moves to load the constant address since multiple
1265     // operations can share the zero base address register, and enables merging
1266     // into read2 / write2 instructions.
1267 
1268     SDLoc DL(Addr);
1269 
1270     if (isDSOffsetLegal(SDValue(), CAddr->getZExtValue())) {
1271       SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1272       MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
1273                                  DL, MVT::i32, Zero);
1274       Base = SDValue(MovZero, 0);
1275       Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
1276       return true;
1277     }
1278   }
1279 
1280   // default case
1281   Base = Addr;
1282   Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i16);
1283   return true;
1284 }
1285 
1286 bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
1287                                           unsigned Offset1,
1288                                           unsigned Size) const {
1289   if (Offset0 % Size != 0 || Offset1 % Size != 0)
1290     return false;
1291   if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
1292     return false;
1293 
1294   if (!Base || Subtarget->hasUsableDSOffset() ||
1295       Subtarget->unsafeDSOffsetFoldingEnabled())
1296     return true;
1297 
1298   // On Southern Islands instruction with a negative base value and an offset
1299   // don't seem to work.
1300   return CurDAG->SignBitIsZero(Base);
1301 }
1302 
1303 // TODO: If offset is too big, put low 16-bit into offset.
1304 bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
1305                                                    SDValue &Offset0,
1306                                                    SDValue &Offset1) const {
1307   return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 4);
1308 }
1309 
1310 bool AMDGPUDAGToDAGISel::SelectDS128Bit8ByteAligned(SDValue Addr, SDValue &Base,
1311                                                     SDValue &Offset0,
1312                                                     SDValue &Offset1) const {
1313   return SelectDSReadWrite2(Addr, Base, Offset0, Offset1, 8);
1314 }
1315 
1316 bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base,
1317                                             SDValue &Offset0, SDValue &Offset1,
1318                                             unsigned Size) const {
1319   SDLoc DL(Addr);
1320 
1321   if (CurDAG->isBaseWithConstantOffset(Addr)) {
1322     SDValue N0 = Addr.getOperand(0);
1323     SDValue N1 = Addr.getOperand(1);
1324     ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
1325     unsigned OffsetValue0 = C1->getZExtValue();
1326     unsigned OffsetValue1 = OffsetValue0 + Size;
1327 
1328     // (add n0, c0)
1329     if (isDSOffset2Legal(N0, OffsetValue0, OffsetValue1, Size)) {
1330       Base = N0;
1331       Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
1332       Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
1333       return true;
1334     }
1335   } else if (Addr.getOpcode() == ISD::SUB) {
1336     // sub C, x -> add (sub 0, x), C
1337     if (const ConstantSDNode *C =
1338             dyn_cast<ConstantSDNode>(Addr.getOperand(0))) {
1339       unsigned OffsetValue0 = C->getZExtValue();
1340       unsigned OffsetValue1 = OffsetValue0 + Size;
1341 
1342       if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
1343         SDLoc DL(Addr);
1344         SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1345 
1346         // XXX - This is kind of hacky. Create a dummy sub node so we can check
1347         // the known bits in isDSOffsetLegal. We need to emit the selected node
1348         // here, so this is thrown away.
1349         SDValue Sub =
1350             CurDAG->getNode(ISD::SUB, DL, MVT::i32, Zero, Addr.getOperand(1));
1351 
1352         if (isDSOffset2Legal(Sub, OffsetValue0, OffsetValue1, Size)) {
1353           SmallVector<SDValue, 3> Opnds;
1354           Opnds.push_back(Zero);
1355           Opnds.push_back(Addr.getOperand(1));
1356           unsigned SubOp = AMDGPU::V_SUB_CO_U32_e32;
1357           if (Subtarget->hasAddNoCarry()) {
1358             SubOp = AMDGPU::V_SUB_U32_e64;
1359             Opnds.push_back(
1360                 CurDAG->getTargetConstant(0, {}, MVT::i1)); // clamp bit
1361           }
1362 
1363           MachineSDNode *MachineSub = CurDAG->getMachineNode(
1364               SubOp, DL, MVT::getIntegerVT(Size * 8), Opnds);
1365 
1366           Base = SDValue(MachineSub, 0);
1367           Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
1368           Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
1369           return true;
1370         }
1371       }
1372     }
1373   } else if (const ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1374     unsigned OffsetValue0 = CAddr->getZExtValue();
1375     unsigned OffsetValue1 = OffsetValue0 + Size;
1376 
1377     if (isDSOffset2Legal(SDValue(), OffsetValue0, OffsetValue1, Size)) {
1378       SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
1379       MachineSDNode *MovZero =
1380           CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero);
1381       Base = SDValue(MovZero, 0);
1382       Offset0 = CurDAG->getTargetConstant(OffsetValue0 / Size, DL, MVT::i8);
1383       Offset1 = CurDAG->getTargetConstant(OffsetValue1 / Size, DL, MVT::i8);
1384       return true;
1385     }
1386   }
1387 
1388   // default case
1389 
1390   Base = Addr;
1391   Offset0 = CurDAG->getTargetConstant(0, DL, MVT::i8);
1392   Offset1 = CurDAG->getTargetConstant(1, DL, MVT::i8);
1393   return true;
1394 }
1395 
1396 bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
1397                                      SDValue &VAddr, SDValue &SOffset,
1398                                      SDValue &Offset, SDValue &Offen,
1399                                      SDValue &Idxen, SDValue &Addr64,
1400                                      SDValue &GLC, SDValue &SLC,
1401                                      SDValue &TFE, SDValue &DLC,
1402                                      SDValue &SWZ, SDValue &SCCB) const {
1403   // Subtarget prefers to use flat instruction
1404   // FIXME: This should be a pattern predicate and not reach here
1405   if (Subtarget->useFlatForGlobal())
1406     return false;
1407 
1408   SDLoc DL(Addr);
1409 
1410   if (!GLC.getNode())
1411     GLC = CurDAG->getTargetConstant(0, DL, MVT::i1);
1412   if (!SLC.getNode())
1413     SLC = CurDAG->getTargetConstant(0, DL, MVT::i1);
1414   TFE = CurDAG->getTargetConstant(0, DL, MVT::i1);
1415   DLC = CurDAG->getTargetConstant(0, DL, MVT::i1);
1416   SWZ = CurDAG->getTargetConstant(0, DL, MVT::i1);
1417   SCCB = CurDAG->getTargetConstant(0, DL, MVT::i1);
1418 
1419   Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);
1420   Offen = CurDAG->getTargetConstant(0, DL, MVT::i1);
1421   Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1);
1422   SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
1423 
1424   ConstantSDNode *C1 = nullptr;
1425   SDValue N0 = Addr;
1426   if (CurDAG->isBaseWithConstantOffset(Addr)) {
1427     C1 = cast<ConstantSDNode>(Addr.getOperand(1));
1428     if (isUInt<32>(C1->getZExtValue()))
1429       N0 = Addr.getOperand(0);
1430     else
1431       C1 = nullptr;
1432   }
1433 
1434   if (N0.getOpcode() == ISD::ADD) {
1435     // (add N2, N3) -> addr64, or
1436     // (add (add N2, N3), C1) -> addr64
1437     SDValue N2 = N0.getOperand(0);
1438     SDValue N3 = N0.getOperand(1);
1439     Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
1440 
1441     if (N2->isDivergent()) {
1442       if (N3->isDivergent()) {
1443         // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
1444         // addr64, and construct the resource from a 0 address.
1445         Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
1446         VAddr = N0;
1447       } else {
1448         // N2 is divergent, N3 is not.
1449         Ptr = N3;
1450         VAddr = N2;
1451       }
1452     } else {
1453       // N2 is not divergent.
1454       Ptr = N2;
1455       VAddr = N3;
1456     }
1457     Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
1458   } else if (N0->isDivergent()) {
1459     // N0 is divergent. Use it as the addr64, and construct the resource from a
1460     // 0 address.
1461     Ptr = SDValue(buildSMovImm64(DL, 0, MVT::v2i32), 0);
1462     VAddr = N0;
1463     Addr64 = CurDAG->getTargetConstant(1, DL, MVT::i1);
1464   } else {
1465     // N0 -> offset, or
1466     // (N0 + C1) -> offset
1467     VAddr = CurDAG->getTargetConstant(0, DL, MVT::i32);
1468     Ptr = N0;
1469   }
1470 
1471   if (!C1) {
1472     // No offset.
1473     Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
1474     return true;
1475   }
1476 
1477   if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue())) {
1478     // Legal offset for instruction.
1479     Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
1480     return true;
1481   }
1482 
1483   // Illegal offset, store it in soffset.
1484   Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
1485   SOffset =
1486       SDValue(CurDAG->getMachineNode(
1487                   AMDGPU::S_MOV_B32, DL, MVT::i32,
1488                   CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32)),
1489               0);
1490   return true;
1491 }
1492 
1493 bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
1494                                            SDValue &VAddr, SDValue &SOffset,
1495                                            SDValue &Offset, SDValue &GLC,
1496                                            SDValue &SLC, SDValue &TFE,
1497                                            SDValue &DLC, SDValue &SWZ,
1498                                            SDValue &SCCB) const {
1499   SDValue Ptr, Offen, Idxen, Addr64;
1500 
1501   // addr64 bit was removed for volcanic islands.
1502   // FIXME: This should be a pattern predicate and not reach here
1503   if (!Subtarget->hasAddr64())
1504     return false;
1505 
1506   if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
1507               GLC, SLC, TFE, DLC, SWZ, SCCB))
1508     return false;
1509 
1510   ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
1511   if (C->getSExtValue()) {
1512     SDLoc DL(Addr);
1513 
1514     const SITargetLowering& Lowering =
1515       *static_cast<const SITargetLowering*>(getTargetLowering());
1516 
1517     SRsrc = SDValue(Lowering.wrapAddr64Rsrc(*CurDAG, DL, Ptr), 0);
1518     return true;
1519   }
1520 
1521   return false;
1522 }
1523 
1524 bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
1525                                            SDValue &VAddr, SDValue &SOffset,
1526                                            SDValue &Offset,
1527                                            SDValue &SLC) const {
1528   SLC = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i1);
1529   SDValue GLC, TFE, DLC, SWZ, SCCB;
1530 
1531   return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE, DLC, SWZ, SCCB);
1532 }
1533 
1534 static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) {
1535   auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>();
1536   return PSV && PSV->isStack();
1537 }
1538 
1539 std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
1540   SDLoc DL(N);
1541 
1542   auto *FI = dyn_cast<FrameIndexSDNode>(N);
1543   SDValue TFI =
1544       FI ? CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0)) : N;
1545 
1546   // We rebase the base address into an absolute stack address and hence
1547   // use constant 0 for soffset. This value must be retained until
1548   // frame elimination and eliminateFrameIndex will choose the appropriate
1549   // frame register if need be.
1550   return std::make_pair(TFI, CurDAG->getTargetConstant(0, DL, MVT::i32));
1551 }
1552 
1553 bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
1554                                                  SDValue Addr, SDValue &Rsrc,
1555                                                  SDValue &VAddr, SDValue &SOffset,
1556                                                  SDValue &ImmOffset) const {
1557 
1558   SDLoc DL(Addr);
1559   MachineFunction &MF = CurDAG->getMachineFunction();
1560   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1561 
1562   Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1563 
1564   if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
1565     int64_t Imm = CAddr->getSExtValue();
1566     const int64_t NullPtr =
1567         AMDGPUTargetMachine::getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS);
1568     // Don't fold null pointer.
1569     if (Imm != NullPtr) {
1570       SDValue HighBits = CurDAG->getTargetConstant(Imm & ~4095, DL, MVT::i32);
1571       MachineSDNode *MovHighBits = CurDAG->getMachineNode(
1572         AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits);
1573       VAddr = SDValue(MovHighBits, 0);
1574 
1575       // In a call sequence, stores to the argument stack area are relative to the
1576       // stack pointer.
1577       const MachinePointerInfo &PtrInfo
1578         = cast<MemSDNode>(Parent)->getPointerInfo();
1579       SOffset = isStackPtrRelative(PtrInfo)
1580         ? CurDAG->getRegister(Info->getStackPtrOffsetReg(), MVT::i32)
1581         : CurDAG->getTargetConstant(0, DL, MVT::i32);
1582       ImmOffset = CurDAG->getTargetConstant(Imm & 4095, DL, MVT::i16);
1583       return true;
1584     }
1585   }
1586 
1587   if (CurDAG->isBaseWithConstantOffset(Addr)) {
1588     // (add n0, c1)
1589 
1590     SDValue N0 = Addr.getOperand(0);
1591     SDValue N1 = Addr.getOperand(1);
1592 
1593     // Offsets in vaddr must be positive if range checking is enabled.
1594     //
1595     // The total computation of vaddr + soffset + offset must not overflow.  If
1596     // vaddr is negative, even if offset is 0 the sgpr offset add will end up
1597     // overflowing.
1598     //
1599     // Prior to gfx9, MUBUF instructions with the vaddr offset enabled would
1600     // always perform a range check. If a negative vaddr base index was used,
1601     // this would fail the range check. The overall address computation would
1602     // compute a valid address, but this doesn't happen due to the range
1603     // check. For out-of-bounds MUBUF loads, a 0 is returned.
1604     //
1605     // Therefore it should be safe to fold any VGPR offset on gfx9 into the
1606     // MUBUF vaddr, but not on older subtargets which can only do this if the
1607     // sign bit is known 0.
1608     ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
1609     if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue()) &&
1610         (!Subtarget->privateMemoryResourceIsRangeChecked() ||
1611          CurDAG->SignBitIsZero(N0))) {
1612       std::tie(VAddr, SOffset) = foldFrameIndex(N0);
1613       ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
1614       return true;
1615     }
1616   }
1617 
1618   // (node)
1619   std::tie(VAddr, SOffset) = foldFrameIndex(Addr);
1620   ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i16);
1621   return true;
1622 }
1623 
1624 bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
1625                                                   SDValue Addr,
1626                                                   SDValue &SRsrc,
1627                                                   SDValue &SOffset,
1628                                                   SDValue &Offset) const {
1629   ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr);
1630   if (!CAddr || !SIInstrInfo::isLegalMUBUFImmOffset(CAddr->getZExtValue()))
1631     return false;
1632 
1633   SDLoc DL(Addr);
1634   MachineFunction &MF = CurDAG->getMachineFunction();
1635   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1636 
1637   SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
1638 
1639   const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Parent)->getPointerInfo();
1640 
1641   // FIXME: Get from MachinePointerInfo? We should only be using the frame
1642   // offset if we know this is in a call sequence.
1643   SOffset = isStackPtrRelative(PtrInfo)
1644                 ? CurDAG->getRegister(Info->getStackPtrOffsetReg(), MVT::i32)
1645                 : CurDAG->getTargetConstant(0, DL, MVT::i32);
1646 
1647   Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
1648   return true;
1649 }
1650 
1651 bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
1652                                            SDValue &SOffset, SDValue &Offset,
1653                                            SDValue &GLC, SDValue &SLC,
1654                                            SDValue &TFE, SDValue &DLC,
1655                                            SDValue &SWZ, SDValue &SCCB) const {
1656   SDValue Ptr, VAddr, Offen, Idxen, Addr64;
1657   const SIInstrInfo *TII =
1658     static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
1659 
1660   if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
1661               GLC, SLC, TFE, DLC, SWZ, SCCB))
1662     return false;
1663 
1664   if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
1665       !cast<ConstantSDNode>(Idxen)->getSExtValue() &&
1666       !cast<ConstantSDNode>(Addr64)->getSExtValue()) {
1667     uint64_t Rsrc = TII->getDefaultRsrcDataFormat() |
1668                     APInt::getAllOnesValue(32).getZExtValue(); // Size
1669     SDLoc DL(Addr);
1670 
1671     const SITargetLowering& Lowering =
1672       *static_cast<const SITargetLowering*>(getTargetLowering());
1673 
1674     SRsrc = SDValue(Lowering.buildRSRC(*CurDAG, DL, Ptr, 0, Rsrc), 0);
1675     return true;
1676   }
1677   return false;
1678 }
1679 
1680 bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
1681                                            SDValue &Soffset, SDValue &Offset
1682                                            ) const {
1683   SDValue GLC, SLC, TFE, DLC, SWZ, SCCB;
1684 
1685   return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC, SWZ, SCCB);
1686 }
1687 bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
1688                                            SDValue &Soffset, SDValue &Offset,
1689                                            SDValue &SLC) const {
1690   SDValue GLC, TFE, DLC, SWZ, SCCB;
1691 
1692   return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC, SWZ, SCCB);
1693 }
1694 
1695 // Find a load or store from corresponding pattern root.
1696 // Roots may be build_vector, bitconvert or their combinations.
1697 static MemSDNode* findMemSDNode(SDNode *N) {
1698   N = AMDGPUTargetLowering::stripBitcast(SDValue(N,0)).getNode();
1699   if (MemSDNode *MN = dyn_cast<MemSDNode>(N))
1700     return MN;
1701   assert(isa<BuildVectorSDNode>(N));
1702   for (SDValue V : N->op_values())
1703     if (MemSDNode *MN =
1704           dyn_cast<MemSDNode>(AMDGPUTargetLowering::stripBitcast(V)))
1705       return MN;
1706   llvm_unreachable("cannot find MemSDNode in the pattern!");
1707 }
1708 
1709 template <bool IsSigned>
1710 bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N,
1711                                           SDValue Addr,
1712                                           SDValue &VAddr,
1713                                           SDValue &Offset) const {
1714   int64_t OffsetVal = 0;
1715 
1716   unsigned AS = findMemSDNode(N)->getAddressSpace();
1717 
1718   if (Subtarget->hasFlatInstOffsets() &&
1719       (!Subtarget->hasFlatSegmentOffsetBug() ||
1720        AS != AMDGPUAS::FLAT_ADDRESS)) {
1721     SDValue N0, N1;
1722     if (isBaseWithConstantOffset64(Addr, N0, N1)) {
1723       uint64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
1724 
1725       const SIInstrInfo *TII = Subtarget->getInstrInfo();
1726       if (TII->isLegalFLATOffset(COffsetVal, AS, IsSigned)) {
1727         Addr = N0;
1728         OffsetVal = COffsetVal;
1729       } else {
1730         // If the offset doesn't fit, put the low bits into the offset field and
1731         // add the rest.
1732         //
1733         // For a FLAT instruction the hardware decides whether to access
1734         // global/scratch/shared memory based on the high bits of vaddr,
1735         // ignoring the offset field, so we have to ensure that when we add
1736         // remainder to vaddr it still points into the same underlying object.
1737         // The easiest way to do that is to make sure that we split the offset
1738         // into two pieces that are both >= 0 or both <= 0.
1739 
1740         SDLoc DL(N);
1741         uint64_t RemainderOffset;
1742 
1743         std::tie(OffsetVal, RemainderOffset)
1744           = TII->splitFlatOffset(COffsetVal, AS, IsSigned);
1745 
1746         SDValue AddOffsetLo =
1747             getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
1748         SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
1749 
1750         if (Addr.getValueType().getSizeInBits() == 32) {
1751           SmallVector<SDValue, 3> Opnds;
1752           Opnds.push_back(N0);
1753           Opnds.push_back(AddOffsetLo);
1754           unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
1755           if (Subtarget->hasAddNoCarry()) {
1756             AddOp = AMDGPU::V_ADD_U32_e64;
1757             Opnds.push_back(Clamp);
1758           }
1759           Addr = SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
1760         } else {
1761           // TODO: Should this try to use a scalar add pseudo if the base address
1762           // is uniform and saddr is usable?
1763           SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
1764           SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
1765 
1766           SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1767                                                 DL, MVT::i32, N0, Sub0);
1768           SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
1769                                                 DL, MVT::i32, N0, Sub1);
1770 
1771           SDValue AddOffsetHi =
1772               getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
1773 
1774           SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
1775 
1776           SDNode *Add =
1777               CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
1778                                      {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
1779 
1780           SDNode *Addc = CurDAG->getMachineNode(
1781               AMDGPU::V_ADDC_U32_e64, DL, VTs,
1782               {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
1783 
1784           SDValue RegSequenceArgs[] = {
1785               CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
1786               SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};
1787 
1788           Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
1789                                                 MVT::i64, RegSequenceArgs),
1790                          0);
1791         }
1792       }
1793     }
1794   }
1795 
1796   VAddr = Addr;
1797   Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i16);
1798   return true;
1799 }
1800 
1801 // If this matches zero_extend i32:x, return x
1802 static SDValue matchZExtFromI32(SDValue Op) {
1803   if (Op.getOpcode() != ISD::ZERO_EXTEND)
1804     return SDValue();
1805 
1806   SDValue ExtSrc = Op.getOperand(0);
1807   return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue();
1808 }
1809 
1810 // Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
1811 bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
1812                                            SDValue Addr,
1813                                            SDValue &SAddr,
1814                                            SDValue &VOffset,
1815                                            SDValue &Offset) const {
1816   int64_t ImmOffset = 0;
1817 
1818   // Match the immediate offset first, which canonically is moved as low as
1819   // possible.
1820 
1821   SDValue LHS, RHS;
1822   if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
1823     int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
1824     const SIInstrInfo *TII = Subtarget->getInstrInfo();
1825 
1826     if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, true)) {
1827       Addr = LHS;
1828       ImmOffset = COffsetVal;
1829     } else if (!LHS->isDivergent() && COffsetVal > 0) {
1830       SDLoc SL(N);
1831       // saddr + large_offset -> saddr + (voffset = large_offset & ~MaxOffset) +
1832       //                         (large_offset & MaxOffset);
1833       int64_t SplitImmOffset, RemainderOffset;
1834       std::tie(SplitImmOffset, RemainderOffset)
1835         = TII->splitFlatOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, true);
1836 
1837       if (isUInt<32>(RemainderOffset)) {
1838         SDNode *VMov = CurDAG->getMachineNode(
1839           AMDGPU::V_MOV_B32_e32, SL, MVT::i32,
1840           CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
1841         VOffset = SDValue(VMov, 0);
1842         SAddr = LHS;
1843         Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i16);
1844         return true;
1845       }
1846     }
1847   }
1848 
1849   // Match the variable offset.
1850   if (Addr.getOpcode() != ISD::ADD) {
1851     if (Addr->isDivergent() || Addr.getOpcode() == ISD::UNDEF ||
1852         isa<ConstantSDNode>(Addr))
1853       return false;
1854 
1855     // It's cheaper to materialize a single 32-bit zero for vaddr than the two
1856     // moves required to copy a 64-bit SGPR to VGPR.
1857     SAddr = Addr;
1858     SDNode *VMov = CurDAG->getMachineNode(
1859       AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32,
1860       CurDAG->getTargetConstant(0, SDLoc(), MVT::i32));
1861     VOffset = SDValue(VMov, 0);
1862     Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
1863     return true;
1864   }
1865 
1866   LHS = Addr.getOperand(0);
1867   RHS = Addr.getOperand(1);
1868 
1869   if (!LHS->isDivergent()) {
1870     // add (i64 sgpr), (zero_extend (i32 vgpr))
1871     if (SDValue ZextRHS = matchZExtFromI32(RHS)) {
1872       SAddr = LHS;
1873       VOffset = ZextRHS;
1874     }
1875   }
1876 
1877   if (!SAddr && !RHS->isDivergent()) {
1878     // add (zero_extend (i32 vgpr)), (i64 sgpr)
1879     if (SDValue ZextLHS = matchZExtFromI32(LHS)) {
1880       SAddr = RHS;
1881       VOffset = ZextLHS;
1882     }
1883   }
1884 
1885   if (!SAddr)
1886     return false;
1887 
1888   Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
1889   return true;
1890 }
1891 
1892 // Match (32-bit SGPR base) + sext(imm offset)
1893 bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *N,
1894                                             SDValue Addr,
1895                                             SDValue &SAddr,
1896                                             SDValue &Offset) const {
1897   if (Addr->isDivergent())
1898     return false;
1899 
1900   SAddr = Addr;
1901   int64_t COffsetVal = 0;
1902 
1903   if (CurDAG->isBaseWithConstantOffset(Addr)) {
1904     COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
1905     SAddr = Addr.getOperand(0);
1906   }
1907 
1908   if (auto FI = dyn_cast<FrameIndexSDNode>(SAddr)) {
1909     SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
1910   } else if (SAddr.getOpcode() == ISD::ADD &&
1911              isa<FrameIndexSDNode>(SAddr.getOperand(0))) {
1912     // Materialize this into a scalar move for scalar address to avoid
1913     // readfirstlane.
1914     auto FI = cast<FrameIndexSDNode>(SAddr.getOperand(0));
1915     SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
1916                                               FI->getValueType(0));
1917     SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_U32, SDLoc(SAddr),
1918                                            MVT::i32, TFI, SAddr.getOperand(1)),
1919                     0);
1920   }
1921 
1922   const SIInstrInfo *TII = Subtarget->getInstrInfo();
1923 
1924   if (!TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true)) {
1925     int64_t RemainderOffset = COffsetVal;
1926     int64_t ImmField = 0;
1927     const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(*Subtarget, true);
1928     // Use signed division by a power of two to truncate towards 0.
1929     int64_t D = 1LL << (NumBits - 1);
1930     RemainderOffset = (COffsetVal / D) * D;
1931     ImmField = COffsetVal - RemainderOffset;
1932 
1933     assert(TII->isLegalFLATOffset(ImmField, AMDGPUAS::PRIVATE_ADDRESS, true));
1934     assert(RemainderOffset + ImmField == COffsetVal);
1935 
1936     COffsetVal = ImmField;
1937 
1938     SDLoc DL(N);
1939     SDValue AddOffset =
1940         getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
1941     SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_U32, DL, MVT::i32,
1942                                            SAddr, AddOffset), 0);
1943   }
1944 
1945   Offset = CurDAG->getTargetConstant(COffsetVal, SDLoc(), MVT::i16);
1946 
1947   return true;
1948 }
1949 
1950 bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
1951                                           SDValue &Offset, bool &Imm) const {
1952   ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
1953   if (!C) {
1954     if (ByteOffsetNode.getValueType().isScalarInteger() &&
1955         ByteOffsetNode.getValueType().getSizeInBits() == 32) {
1956       Offset = ByteOffsetNode;
1957       Imm = false;
1958       return true;
1959     }
1960     if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) {
1961       if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) {
1962         Offset = ByteOffsetNode.getOperand(0);
1963         Imm = false;
1964         return true;
1965       }
1966     }
1967     return false;
1968   }
1969 
1970   SDLoc SL(ByteOffsetNode);
1971   // GFX9 and GFX10 have signed byte immediate offsets.
1972   int64_t ByteOffset = C->getSExtValue();
1973   Optional<int64_t> EncodedOffset =
1974       AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset, false);
1975   if (EncodedOffset) {
1976     Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
1977     Imm = true;
1978     return true;
1979   }
1980 
1981   // SGPR and literal offsets are unsigned.
1982   if (ByteOffset < 0)
1983     return false;
1984 
1985   EncodedOffset = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget, ByteOffset);
1986   if (EncodedOffset) {
1987     Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
1988     return true;
1989   }
1990 
1991   if (!isUInt<32>(ByteOffset) && !isInt<32>(ByteOffset))
1992     return false;
1993 
1994   SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
1995   Offset = SDValue(
1996       CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, C32Bit), 0);
1997 
1998   return true;
1999 }
2000 
2001 SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
2002   if (Addr.getValueType() != MVT::i32)
2003     return Addr;
2004 
2005   // Zero-extend a 32-bit address.
2006   SDLoc SL(Addr);
2007 
2008   const MachineFunction &MF = CurDAG->getMachineFunction();
2009   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2010   unsigned AddrHiVal = Info->get32BitAddressHighBits();
2011   SDValue AddrHi = CurDAG->getTargetConstant(AddrHiVal, SL, MVT::i32);
2012 
2013   const SDValue Ops[] = {
2014     CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32),
2015     Addr,
2016     CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
2017     SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi),
2018             0),
2019     CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32),
2020   };
2021 
2022   return SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64,
2023                                         Ops), 0);
2024 }
2025 
2026 bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
2027                                      SDValue &Offset, bool &Imm) const {
2028   SDLoc SL(Addr);
2029 
2030   // A 32-bit (address + offset) should not cause unsigned 32-bit integer
2031   // wraparound, because s_load instructions perform the addition in 64 bits.
2032   if ((Addr.getValueType() != MVT::i32 ||
2033        Addr->getFlags().hasNoUnsignedWrap())) {
2034     SDValue N0, N1;
2035     // Extract the base and offset if possible.
2036     if (CurDAG->isBaseWithConstantOffset(Addr) ||
2037         Addr.getOpcode() == ISD::ADD) {
2038       N0 = Addr.getOperand(0);
2039       N1 = Addr.getOperand(1);
2040     } else if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, N0, N1)) {
2041       assert(N0 && N1 && isa<ConstantSDNode>(N1));
2042     }
2043     if (N0 && N1) {
2044       if (SelectSMRDOffset(N1, Offset, Imm)) {
2045         SBase = Expand32BitAddress(N0);
2046         return true;
2047       }
2048     }
2049   }
2050   SBase = Expand32BitAddress(Addr);
2051   Offset = CurDAG->getTargetConstant(0, SL, MVT::i32);
2052   Imm = true;
2053   return true;
2054 }
2055 
2056 bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
2057                                        SDValue &Offset) const {
2058   bool Imm = false;
2059   return SelectSMRD(Addr, SBase, Offset, Imm) && Imm;
2060 }
2061 
2062 bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
2063                                          SDValue &Offset) const {
2064 
2065   assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2066 
2067   bool Imm = false;
2068   if (!SelectSMRD(Addr, SBase, Offset, Imm))
2069     return false;
2070 
2071   return !Imm && isa<ConstantSDNode>(Offset);
2072 }
2073 
2074 bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase,
2075                                         SDValue &Offset) const {
2076   bool Imm = false;
2077   return SelectSMRD(Addr, SBase, Offset, Imm) && !Imm &&
2078          !isa<ConstantSDNode>(Offset);
2079 }
2080 
2081 bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue Addr,
2082                                              SDValue &Offset) const {
2083   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr)) {
2084     // The immediate offset for S_BUFFER instructions is unsigned.
2085     if (auto Imm =
2086             AMDGPU::getSMRDEncodedOffset(*Subtarget, C->getZExtValue(), true)) {
2087       Offset = CurDAG->getTargetConstant(*Imm, SDLoc(Addr), MVT::i32);
2088       return true;
2089     }
2090   }
2091 
2092   return false;
2093 }
2094 
2095 bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue Addr,
2096                                                SDValue &Offset) const {
2097   assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
2098 
2099   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr)) {
2100     if (auto Imm = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget,
2101                                                          C->getZExtValue())) {
2102       Offset = CurDAG->getTargetConstant(*Imm, SDLoc(Addr), MVT::i32);
2103       return true;
2104     }
2105   }
2106 
2107   return false;
2108 }
2109 
2110 bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
2111                                             SDValue &Base,
2112                                             SDValue &Offset) const {
2113   SDLoc DL(Index);
2114 
2115   if (CurDAG->isBaseWithConstantOffset(Index)) {
2116     SDValue N0 = Index.getOperand(0);
2117     SDValue N1 = Index.getOperand(1);
2118     ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
2119 
2120     // (add n0, c0)
2121     // Don't peel off the offset (c0) if doing so could possibly lead
2122     // the base (n0) to be negative.
2123     // (or n0, |c0|) can never change a sign given isBaseWithConstantOffset.
2124     if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0) ||
2125         (Index->getOpcode() == ISD::OR && C1->getSExtValue() >= 0)) {
2126       Base = N0;
2127       Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
2128       return true;
2129     }
2130   }
2131 
2132   if (isa<ConstantSDNode>(Index))
2133     return false;
2134 
2135   Base = Index;
2136   Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
2137   return true;
2138 }
2139 
2140 SDNode *AMDGPUDAGToDAGISel::getS_BFE(unsigned Opcode, const SDLoc &DL,
2141                                      SDValue Val, uint32_t Offset,
2142                                      uint32_t Width) {
2143   // Transformation function, pack the offset and width of a BFE into
2144   // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
2145   // source, bits [5:0] contain the offset and bits [22:16] the width.
2146   uint32_t PackedVal = Offset | (Width << 16);
2147   SDValue PackedConst = CurDAG->getTargetConstant(PackedVal, DL, MVT::i32);
2148 
2149   return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst);
2150 }
2151 
2152 void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
2153   // "(a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c)
2154   // "(a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c)
2155   // Predicate: 0 < b <= c < 32
2156 
2157   const SDValue &Shl = N->getOperand(0);
2158   ConstantSDNode *B = dyn_cast<ConstantSDNode>(Shl->getOperand(1));
2159   ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
2160 
2161   if (B && C) {
2162     uint32_t BVal = B->getZExtValue();
2163     uint32_t CVal = C->getZExtValue();
2164 
2165     if (0 < BVal && BVal <= CVal && CVal < 32) {
2166       bool Signed = N->getOpcode() == ISD::SRA;
2167       unsigned Opcode = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2168 
2169       ReplaceNode(N, getS_BFE(Opcode, SDLoc(N), Shl.getOperand(0), CVal - BVal,
2170                               32 - CVal));
2171       return;
2172     }
2173   }
2174   SelectCode(N);
2175 }
2176 
2177 void AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
2178   switch (N->getOpcode()) {
2179   case ISD::AND:
2180     if (N->getOperand(0).getOpcode() == ISD::SRL) {
2181       // "(a srl b) & mask" ---> "BFE_U32 a, b, popcount(mask)"
2182       // Predicate: isMask(mask)
2183       const SDValue &Srl = N->getOperand(0);
2184       ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(Srl.getOperand(1));
2185       ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1));
2186 
2187       if (Shift && Mask) {
2188         uint32_t ShiftVal = Shift->getZExtValue();
2189         uint32_t MaskVal = Mask->getZExtValue();
2190 
2191         if (isMask_32(MaskVal)) {
2192           uint32_t WidthVal = countPopulation(MaskVal);
2193 
2194           ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N),
2195                                   Srl.getOperand(0), ShiftVal, WidthVal));
2196           return;
2197         }
2198       }
2199     }
2200     break;
2201   case ISD::SRL:
2202     if (N->getOperand(0).getOpcode() == ISD::AND) {
2203       // "(a & mask) srl b)" ---> "BFE_U32 a, b, popcount(mask >> b)"
2204       // Predicate: isMask(mask >> b)
2205       const SDValue &And = N->getOperand(0);
2206       ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(N->getOperand(1));
2207       ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(And->getOperand(1));
2208 
2209       if (Shift && Mask) {
2210         uint32_t ShiftVal = Shift->getZExtValue();
2211         uint32_t MaskVal = Mask->getZExtValue() >> ShiftVal;
2212 
2213         if (isMask_32(MaskVal)) {
2214           uint32_t WidthVal = countPopulation(MaskVal);
2215 
2216           ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N),
2217                                   And.getOperand(0), ShiftVal, WidthVal));
2218           return;
2219         }
2220       }
2221     } else if (N->getOperand(0).getOpcode() == ISD::SHL) {
2222       SelectS_BFEFromShifts(N);
2223       return;
2224     }
2225     break;
2226   case ISD::SRA:
2227     if (N->getOperand(0).getOpcode() == ISD::SHL) {
2228       SelectS_BFEFromShifts(N);
2229       return;
2230     }
2231     break;
2232 
2233   case ISD::SIGN_EXTEND_INREG: {
2234     // sext_inreg (srl x, 16), i8 -> bfe_i32 x, 16, 8
2235     SDValue Src = N->getOperand(0);
2236     if (Src.getOpcode() != ISD::SRL)
2237       break;
2238 
2239     const ConstantSDNode *Amt = dyn_cast<ConstantSDNode>(Src.getOperand(1));
2240     if (!Amt)
2241       break;
2242 
2243     unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
2244     ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_I32, SDLoc(N), Src.getOperand(0),
2245                             Amt->getZExtValue(), Width));
2246     return;
2247   }
2248   }
2249 
2250   SelectCode(N);
2251 }
2252 
2253 bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const {
2254   assert(N->getOpcode() == ISD::BRCOND);
2255   if (!N->hasOneUse())
2256     return false;
2257 
2258   SDValue Cond = N->getOperand(1);
2259   if (Cond.getOpcode() == ISD::CopyToReg)
2260     Cond = Cond.getOperand(2);
2261 
2262   if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse())
2263     return false;
2264 
2265   MVT VT = Cond.getOperand(0).getSimpleValueType();
2266   if (VT == MVT::i32)
2267     return true;
2268 
2269   if (VT == MVT::i64) {
2270     auto ST = static_cast<const GCNSubtarget *>(Subtarget);
2271 
2272     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
2273     return (CC == ISD::SETEQ || CC == ISD::SETNE) && ST->hasScalarCompareEq64();
2274   }
2275 
2276   return false;
2277 }
2278 
2279 void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
2280   SDValue Cond = N->getOperand(1);
2281 
2282   if (Cond.isUndef()) {
2283     CurDAG->SelectNodeTo(N, AMDGPU::SI_BR_UNDEF, MVT::Other,
2284                          N->getOperand(2), N->getOperand(0));
2285     return;
2286   }
2287 
2288   const GCNSubtarget *ST = static_cast<const GCNSubtarget *>(Subtarget);
2289   const SIRegisterInfo *TRI = ST->getRegisterInfo();
2290 
2291   bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
2292   unsigned BrOp = UseSCCBr ? AMDGPU::S_CBRANCH_SCC1 : AMDGPU::S_CBRANCH_VCCNZ;
2293   Register CondReg = UseSCCBr ? AMDGPU::SCC : TRI->getVCC();
2294   SDLoc SL(N);
2295 
2296   if (!UseSCCBr) {
2297     // This is the case that we are selecting to S_CBRANCH_VCCNZ.  We have not
2298     // analyzed what generates the vcc value, so we do not know whether vcc
2299     // bits for disabled lanes are 0.  Thus we need to mask out bits for
2300     // disabled lanes.
2301     //
2302     // For the case that we select S_CBRANCH_SCC1 and it gets
2303     // changed to S_CBRANCH_VCCNZ in SIFixSGPRCopies, SIFixSGPRCopies calls
2304     // SIInstrInfo::moveToVALU which inserts the S_AND).
2305     //
2306     // We could add an analysis of what generates the vcc value here and omit
2307     // the S_AND when is unnecessary. But it would be better to add a separate
2308     // pass after SIFixSGPRCopies to do the unnecessary S_AND removal, so it
2309     // catches both cases.
2310     Cond = SDValue(CurDAG->getMachineNode(ST->isWave32() ? AMDGPU::S_AND_B32
2311                                                          : AMDGPU::S_AND_B64,
2312                      SL, MVT::i1,
2313                      CurDAG->getRegister(ST->isWave32() ? AMDGPU::EXEC_LO
2314                                                         : AMDGPU::EXEC,
2315                                          MVT::i1),
2316                     Cond),
2317                    0);
2318   }
2319 
2320   SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, CondReg, Cond);
2321   CurDAG->SelectNodeTo(N, BrOp, MVT::Other,
2322                        N->getOperand(2), // Basic Block
2323                        VCC.getValue(0));
2324 }
2325 
2326 void AMDGPUDAGToDAGISel::SelectFMAD_FMA(SDNode *N) {
2327   MVT VT = N->getSimpleValueType(0);
2328   bool IsFMA = N->getOpcode() == ISD::FMA;
2329   if (VT != MVT::f32 || (!Subtarget->hasMadMixInsts() &&
2330                          !Subtarget->hasFmaMixInsts()) ||
2331       ((IsFMA && Subtarget->hasMadMixInsts()) ||
2332        (!IsFMA && Subtarget->hasFmaMixInsts()))) {
2333     SelectCode(N);
2334     return;
2335   }
2336 
2337   SDValue Src0 = N->getOperand(0);
2338   SDValue Src1 = N->getOperand(1);
2339   SDValue Src2 = N->getOperand(2);
2340   unsigned Src0Mods, Src1Mods, Src2Mods;
2341 
2342   // Avoid using v_mad_mix_f32/v_fma_mix_f32 unless there is actually an operand
2343   // using the conversion from f16.
2344   bool Sel0 = SelectVOP3PMadMixModsImpl(Src0, Src0, Src0Mods);
2345   bool Sel1 = SelectVOP3PMadMixModsImpl(Src1, Src1, Src1Mods);
2346   bool Sel2 = SelectVOP3PMadMixModsImpl(Src2, Src2, Src2Mods);
2347 
2348   assert((IsFMA || !Mode.allFP32Denormals()) &&
2349          "fmad selected with denormals enabled");
2350   // TODO: We can select this with f32 denormals enabled if all the sources are
2351   // converted from f16 (in which case fmad isn't legal).
2352 
2353   if (Sel0 || Sel1 || Sel2) {
2354     // For dummy operands.
2355     SDValue Zero = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
2356     SDValue Ops[] = {
2357       CurDAG->getTargetConstant(Src0Mods, SDLoc(), MVT::i32), Src0,
2358       CurDAG->getTargetConstant(Src1Mods, SDLoc(), MVT::i32), Src1,
2359       CurDAG->getTargetConstant(Src2Mods, SDLoc(), MVT::i32), Src2,
2360       CurDAG->getTargetConstant(0, SDLoc(), MVT::i1),
2361       Zero, Zero
2362     };
2363 
2364     CurDAG->SelectNodeTo(N,
2365                          IsFMA ? AMDGPU::V_FMA_MIX_F32 : AMDGPU::V_MAD_MIX_F32,
2366                          MVT::f32, Ops);
2367   } else {
2368     SelectCode(N);
2369   }
2370 }
2371 
2372 // This is here because there isn't a way to use the generated sub0_sub1 as the
2373 // subreg index to EXTRACT_SUBREG in tablegen.
2374 void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) {
2375   MemSDNode *Mem = cast<MemSDNode>(N);
2376   unsigned AS = Mem->getAddressSpace();
2377   if (AS == AMDGPUAS::FLAT_ADDRESS) {
2378     SelectCode(N);
2379     return;
2380   }
2381 
2382   MVT VT = N->getSimpleValueType(0);
2383   bool Is32 = (VT == MVT::i32);
2384   SDLoc SL(N);
2385 
2386   MachineSDNode *CmpSwap = nullptr;
2387   if (Subtarget->hasAddr64()) {
2388     SDValue SRsrc, VAddr, SOffset, Offset, SLC;
2389 
2390     if (SelectMUBUFAddr64(Mem->getBasePtr(), SRsrc, VAddr, SOffset, Offset, SLC)) {
2391       unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN :
2392         AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN;
2393       SDValue CmpVal = Mem->getOperand(2);
2394       SDValue GLC = CurDAG->getTargetConstant(1, SL, MVT::i1);
2395 
2396       // XXX - Do we care about glue operands?
2397 
2398       SDValue Ops[] = {
2399         CmpVal, VAddr, SRsrc, SOffset, Offset, GLC, SLC, Mem->getChain()
2400       };
2401 
2402       CmpSwap = CurDAG->getMachineNode(Opcode, SL, Mem->getVTList(), Ops);
2403     }
2404   }
2405 
2406   if (!CmpSwap) {
2407     SDValue SRsrc, SOffset, Offset, SLC;
2408     if (SelectMUBUFOffset(Mem->getBasePtr(), SRsrc, SOffset, Offset, SLC)) {
2409       unsigned Opcode = Is32 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN :
2410         AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN;
2411 
2412       SDValue CmpVal = Mem->getOperand(2);
2413       SDValue GLC = CurDAG->getTargetConstant(1, SL, MVT::i1);
2414       SDValue Ops[] = {
2415         CmpVal, SRsrc, SOffset, Offset, GLC, SLC, Mem->getChain()
2416       };
2417 
2418       CmpSwap = CurDAG->getMachineNode(Opcode, SL, Mem->getVTList(), Ops);
2419     }
2420   }
2421 
2422   if (!CmpSwap) {
2423     SelectCode(N);
2424     return;
2425   }
2426 
2427   MachineMemOperand *MMO = Mem->getMemOperand();
2428   CurDAG->setNodeMemRefs(CmpSwap, {MMO});
2429 
2430   unsigned SubReg = Is32 ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
2431   SDValue Extract
2432     = CurDAG->getTargetExtractSubreg(SubReg, SL, VT, SDValue(CmpSwap, 0));
2433 
2434   ReplaceUses(SDValue(N, 0), Extract);
2435   ReplaceUses(SDValue(N, 1), SDValue(CmpSwap, 1));
2436   CurDAG->RemoveDeadNode(N);
2437 }
2438 
2439 void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
2440   // The address is assumed to be uniform, so if it ends up in a VGPR, it will
2441   // be copied to an SGPR with readfirstlane.
2442   unsigned Opc = IntrID == Intrinsic::amdgcn_ds_append ?
2443     AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
2444 
2445   SDValue Chain = N->getOperand(0);
2446   SDValue Ptr = N->getOperand(2);
2447   MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2448   MachineMemOperand *MMO = M->getMemOperand();
2449   bool IsGDS = M->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
2450 
2451   SDValue Offset;
2452   if (CurDAG->isBaseWithConstantOffset(Ptr)) {
2453     SDValue PtrBase = Ptr.getOperand(0);
2454     SDValue PtrOffset = Ptr.getOperand(1);
2455 
2456     const APInt &OffsetVal = cast<ConstantSDNode>(PtrOffset)->getAPIntValue();
2457     if (isDSOffsetLegal(PtrBase, OffsetVal.getZExtValue())) {
2458       N = glueCopyToM0(N, PtrBase);
2459       Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
2460     }
2461   }
2462 
2463   if (!Offset) {
2464     N = glueCopyToM0(N, Ptr);
2465     Offset = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
2466   }
2467 
2468   SDValue Ops[] = {
2469     Offset,
2470     CurDAG->getTargetConstant(IsGDS, SDLoc(), MVT::i32),
2471     Chain,
2472     N->getOperand(N->getNumOperands() - 1) // New glue
2473   };
2474 
2475   SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2476   CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2477 }
2478 
2479 static unsigned gwsIntrinToOpcode(unsigned IntrID) {
2480   switch (IntrID) {
2481   case Intrinsic::amdgcn_ds_gws_init:
2482     return AMDGPU::DS_GWS_INIT;
2483   case Intrinsic::amdgcn_ds_gws_barrier:
2484     return AMDGPU::DS_GWS_BARRIER;
2485   case Intrinsic::amdgcn_ds_gws_sema_v:
2486     return AMDGPU::DS_GWS_SEMA_V;
2487   case Intrinsic::amdgcn_ds_gws_sema_br:
2488     return AMDGPU::DS_GWS_SEMA_BR;
2489   case Intrinsic::amdgcn_ds_gws_sema_p:
2490     return AMDGPU::DS_GWS_SEMA_P;
2491   case Intrinsic::amdgcn_ds_gws_sema_release_all:
2492     return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
2493   default:
2494     llvm_unreachable("not a gws intrinsic");
2495   }
2496 }
2497 
2498 void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
2499   if (IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
2500       !Subtarget->hasGWSSemaReleaseAll()) {
2501     // Let this error.
2502     SelectCode(N);
2503     return;
2504   }
2505 
2506   // Chain, intrinsic ID, vsrc, offset
2507   const bool HasVSrc = N->getNumOperands() == 4;
2508   assert(HasVSrc || N->getNumOperands() == 3);
2509 
2510   SDLoc SL(N);
2511   SDValue BaseOffset = N->getOperand(HasVSrc ? 3 : 2);
2512   int ImmOffset = 0;
2513   MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
2514   MachineMemOperand *MMO = M->getMemOperand();
2515 
2516   // Don't worry if the offset ends up in a VGPR. Only one lane will have
2517   // effect, so SIFixSGPRCopies will validly insert readfirstlane.
2518 
2519   // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
2520   // offset field) % 64. Some versions of the programming guide omit the m0
2521   // part, or claim it's from offset 0.
2522   if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(BaseOffset)) {
2523     // If we have a constant offset, try to use the 0 in m0 as the base.
2524     // TODO: Look into changing the default m0 initialization value. If the
2525     // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
2526     // the immediate offset.
2527     glueCopyToM0(N, CurDAG->getTargetConstant(0, SL, MVT::i32));
2528     ImmOffset = ConstOffset->getZExtValue();
2529   } else {
2530     if (CurDAG->isBaseWithConstantOffset(BaseOffset)) {
2531       ImmOffset = BaseOffset.getConstantOperandVal(1);
2532       BaseOffset = BaseOffset.getOperand(0);
2533     }
2534 
2535     // Prefer to do the shift in an SGPR since it should be possible to use m0
2536     // as the result directly. If it's already an SGPR, it will be eliminated
2537     // later.
2538     SDNode *SGPROffset
2539       = CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL, MVT::i32,
2540                                BaseOffset);
2541     // Shift to offset in m0
2542     SDNode *M0Base
2543       = CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
2544                                SDValue(SGPROffset, 0),
2545                                CurDAG->getTargetConstant(16, SL, MVT::i32));
2546     glueCopyToM0(N, SDValue(M0Base, 0));
2547   }
2548 
2549   SDValue Chain = N->getOperand(0);
2550   SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32);
2551 
2552   const unsigned Opc = gwsIntrinToOpcode(IntrID);
2553   SmallVector<SDValue, 5> Ops;
2554   if (HasVSrc)
2555     Ops.push_back(N->getOperand(2));
2556   Ops.push_back(OffsetField);
2557   Ops.push_back(Chain);
2558 
2559   SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
2560   CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
2561 }
2562 
2563 void AMDGPUDAGToDAGISel::SelectInterpP1F16(SDNode *N) {
2564   if (Subtarget->getLDSBankCount() != 16) {
2565     // This is a single instruction with a pattern.
2566     SelectCode(N);
2567     return;
2568   }
2569 
2570   SDLoc DL(N);
2571 
2572   // This requires 2 instructions. It is possible to write a pattern to support
2573   // this, but the generated isel emitter doesn't correctly deal with multiple
2574   // output instructions using the same physical register input. The copy to m0
2575   // is incorrectly placed before the second instruction.
2576   //
2577   // TODO: Match source modifiers.
2578   //
2579   // def : Pat <
2580   //   (int_amdgcn_interp_p1_f16
2581   //    (VOP3Mods f32:$src0, i32:$src0_modifiers),
2582   //                             (i32 timm:$attrchan), (i32 timm:$attr),
2583   //                             (i1 timm:$high), M0),
2584   //   (V_INTERP_P1LV_F16 $src0_modifiers, VGPR_32:$src0, timm:$attr,
2585   //       timm:$attrchan, 0,
2586   //       (V_INTERP_MOV_F32 2, timm:$attr, timm:$attrchan), timm:$high)> {
2587   //   let Predicates = [has16BankLDS];
2588   // }
2589 
2590   // 16 bank LDS
2591   SDValue ToM0 = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, AMDGPU::M0,
2592                                       N->getOperand(5), SDValue());
2593 
2594   SDVTList VTs = CurDAG->getVTList(MVT::f32, MVT::Other);
2595 
2596   SDNode *InterpMov =
2597     CurDAG->getMachineNode(AMDGPU::V_INTERP_MOV_F32, DL, VTs, {
2598         CurDAG->getTargetConstant(2, DL, MVT::i32), // P0
2599         N->getOperand(3),  // Attr
2600         N->getOperand(2),  // Attrchan
2601         ToM0.getValue(1) // In glue
2602   });
2603 
2604   SDNode *InterpP1LV =
2605     CurDAG->getMachineNode(AMDGPU::V_INTERP_P1LV_F16, DL, MVT::f32, {
2606         CurDAG->getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
2607         N->getOperand(1), // Src0
2608         N->getOperand(3), // Attr
2609         N->getOperand(2), // Attrchan
2610         CurDAG->getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
2611         SDValue(InterpMov, 0), // Src2 - holds two f16 values selected by high
2612         N->getOperand(4), // high
2613         CurDAG->getTargetConstant(0, DL, MVT::i1), // $clamp
2614         CurDAG->getTargetConstant(0, DL, MVT::i32), // $omod
2615         SDValue(InterpMov, 1)
2616   });
2617 
2618   CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), SDValue(InterpP1LV, 0));
2619 }
2620 
2621 void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
2622   unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
2623   switch (IntrID) {
2624   case Intrinsic::amdgcn_ds_append:
2625   case Intrinsic::amdgcn_ds_consume: {
2626     if (N->getValueType(0) != MVT::i32)
2627       break;
2628     SelectDSAppendConsume(N, IntrID);
2629     return;
2630   }
2631   }
2632 
2633   SelectCode(N);
2634 }
2635 
2636 void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
2637   unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
2638   unsigned Opcode;
2639   switch (IntrID) {
2640   case Intrinsic::amdgcn_wqm:
2641     Opcode = AMDGPU::WQM;
2642     break;
2643   case Intrinsic::amdgcn_softwqm:
2644     Opcode = AMDGPU::SOFT_WQM;
2645     break;
2646   case Intrinsic::amdgcn_wwm:
2647     Opcode = AMDGPU::WWM;
2648     break;
2649   case Intrinsic::amdgcn_interp_p1_f16:
2650     SelectInterpP1F16(N);
2651     return;
2652   default:
2653     SelectCode(N);
2654     return;
2655   }
2656 
2657   SDValue Src = N->getOperand(1);
2658   CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), {Src});
2659 }
2660 
2661 void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
2662   unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
2663   switch (IntrID) {
2664   case Intrinsic::amdgcn_ds_gws_init:
2665   case Intrinsic::amdgcn_ds_gws_barrier:
2666   case Intrinsic::amdgcn_ds_gws_sema_v:
2667   case Intrinsic::amdgcn_ds_gws_sema_br:
2668   case Intrinsic::amdgcn_ds_gws_sema_p:
2669   case Intrinsic::amdgcn_ds_gws_sema_release_all:
2670     SelectDS_GWS(N, IntrID);
2671     return;
2672   default:
2673     break;
2674   }
2675 
2676   SelectCode(N);
2677 }
2678 
2679 bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
2680                                             unsigned &Mods,
2681                                             bool AllowAbs) const {
2682   Mods = 0;
2683   Src = In;
2684 
2685   if (Src.getOpcode() == ISD::FNEG) {
2686     Mods |= SISrcMods::NEG;
2687     Src = Src.getOperand(0);
2688   }
2689 
2690   if (AllowAbs && Src.getOpcode() == ISD::FABS) {
2691     Mods |= SISrcMods::ABS;
2692     Src = Src.getOperand(0);
2693   }
2694 
2695   return true;
2696 }
2697 
2698 bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
2699                                         SDValue &SrcMods) const {
2700   unsigned Mods;
2701   if (SelectVOP3ModsImpl(In, Src, Mods)) {
2702     SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2703     return true;
2704   }
2705 
2706   return false;
2707 }
2708 
2709 bool AMDGPUDAGToDAGISel::SelectVOP3BMods(SDValue In, SDValue &Src,
2710                                          SDValue &SrcMods) const {
2711   unsigned Mods;
2712   if (SelectVOP3ModsImpl(In, Src, Mods, /* AllowAbs */ false)) {
2713     SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2714     return true;
2715   }
2716 
2717   return false;
2718 }
2719 
2720 bool AMDGPUDAGToDAGISel::SelectVOP3Mods_NNaN(SDValue In, SDValue &Src,
2721                                              SDValue &SrcMods) const {
2722   SelectVOP3Mods(In, Src, SrcMods);
2723   return isNoNanSrc(Src);
2724 }
2725 
2726 bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
2727   if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG)
2728     return false;
2729 
2730   Src = In;
2731   return true;
2732 }
2733 
2734 bool AMDGPUDAGToDAGISel::SelectVOP3Mods0(SDValue In, SDValue &Src,
2735                                          SDValue &SrcMods, SDValue &Clamp,
2736                                          SDValue &Omod) const {
2737   SDLoc DL(In);
2738   Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
2739   Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
2740 
2741   return SelectVOP3Mods(In, Src, SrcMods);
2742 }
2743 
2744 bool AMDGPUDAGToDAGISel::SelectVOP3BMods0(SDValue In, SDValue &Src,
2745                                           SDValue &SrcMods, SDValue &Clamp,
2746                                           SDValue &Omod) const {
2747   SDLoc DL(In);
2748   Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
2749   Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
2750 
2751   return SelectVOP3BMods(In, Src, SrcMods);
2752 }
2753 
2754 bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
2755                                          SDValue &Clamp, SDValue &Omod) const {
2756   Src = In;
2757 
2758   SDLoc DL(In);
2759   Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
2760   Omod = CurDAG->getTargetConstant(0, DL, MVT::i1);
2761 
2762   return true;
2763 }
2764 
2765 bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
2766                                          SDValue &SrcMods) const {
2767   unsigned Mods = 0;
2768   Src = In;
2769 
2770   if (Src.getOpcode() == ISD::FNEG) {
2771     Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
2772     Src = Src.getOperand(0);
2773   }
2774 
2775   if (Src.getOpcode() == ISD::BUILD_VECTOR) {
2776     unsigned VecMods = Mods;
2777 
2778     SDValue Lo = stripBitcast(Src.getOperand(0));
2779     SDValue Hi = stripBitcast(Src.getOperand(1));
2780 
2781     if (Lo.getOpcode() == ISD::FNEG) {
2782       Lo = stripBitcast(Lo.getOperand(0));
2783       Mods ^= SISrcMods::NEG;
2784     }
2785 
2786     if (Hi.getOpcode() == ISD::FNEG) {
2787       Hi = stripBitcast(Hi.getOperand(0));
2788       Mods ^= SISrcMods::NEG_HI;
2789     }
2790 
2791     if (isExtractHiElt(Lo, Lo))
2792       Mods |= SISrcMods::OP_SEL_0;
2793 
2794     if (isExtractHiElt(Hi, Hi))
2795       Mods |= SISrcMods::OP_SEL_1;
2796 
2797     unsigned VecSize = Src.getValueSizeInBits();
2798     Lo = stripExtractLoElt(Lo);
2799     Hi = stripExtractLoElt(Hi);
2800 
2801     if (Lo.getValueSizeInBits() > VecSize) {
2802       Lo = CurDAG->getTargetExtractSubreg(
2803         (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
2804         MVT::getIntegerVT(VecSize), Lo);
2805     }
2806 
2807     if (Hi.getValueSizeInBits() > VecSize) {
2808       Hi = CurDAG->getTargetExtractSubreg(
2809         (VecSize > 32) ? AMDGPU::sub0_sub1 : AMDGPU::sub0, SDLoc(In),
2810         MVT::getIntegerVT(VecSize), Hi);
2811     }
2812 
2813     assert(Lo.getValueSizeInBits() <= VecSize &&
2814            Hi.getValueSizeInBits() <= VecSize);
2815 
2816     if (Lo == Hi && !isInlineImmediate(Lo.getNode())) {
2817       // Really a scalar input. Just select from the low half of the register to
2818       // avoid packing.
2819 
2820       if (VecSize == 32 || VecSize == Lo.getValueSizeInBits()) {
2821         Src = Lo;
2822       } else {
2823         assert(Lo.getValueSizeInBits() == 32 && VecSize == 64);
2824 
2825         SDLoc SL(In);
2826         SDValue Undef = SDValue(
2827           CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SL,
2828                                  Lo.getValueType()), 0);
2829         auto RC = Lo->isDivergent() ? AMDGPU::VReg_64RegClassID
2830                                     : AMDGPU::SReg_64RegClassID;
2831         const SDValue Ops[] = {
2832           CurDAG->getTargetConstant(RC, SL, MVT::i32),
2833           Lo, CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
2834           Undef, CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32) };
2835 
2836         Src = SDValue(CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SL,
2837                                              Src.getValueType(), Ops), 0);
2838       }
2839       SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2840       return true;
2841     }
2842 
2843     if (VecSize == 64 && Lo == Hi && isa<ConstantFPSDNode>(Lo)) {
2844       uint64_t Lit = cast<ConstantFPSDNode>(Lo)->getValueAPF()
2845                       .bitcastToAPInt().getZExtValue();
2846       if (AMDGPU::isInlinableLiteral32(Lit, Subtarget->hasInv2PiInlineImm())) {
2847         Src = CurDAG->getTargetConstant(Lit, SDLoc(In), MVT::i64);;
2848         SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2849         return true;
2850       }
2851     }
2852 
2853     Mods = VecMods;
2854   }
2855 
2856   // Packed instructions do not have abs modifiers.
2857   Mods |= SISrcMods::OP_SEL_1;
2858 
2859   SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2860   return true;
2861 }
2862 
2863 bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
2864                                          SDValue &SrcMods) const {
2865   Src = In;
2866   // FIXME: Handle op_sel
2867   SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);
2868   return true;
2869 }
2870 
2871 bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
2872                                              SDValue &SrcMods) const {
2873   // FIXME: Handle op_sel
2874   return SelectVOP3Mods(In, Src, SrcMods);
2875 }
2876 
2877 // The return value is not whether the match is possible (which it always is),
2878 // but whether or not it a conversion is really used.
2879 bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
2880                                                    unsigned &Mods) const {
2881   Mods = 0;
2882   SelectVOP3ModsImpl(In, Src, Mods);
2883 
2884   if (Src.getOpcode() == ISD::FP_EXTEND) {
2885     Src = Src.getOperand(0);
2886     assert(Src.getValueType() == MVT::f16);
2887     Src = stripBitcast(Src);
2888 
2889     // Be careful about folding modifiers if we already have an abs. fneg is
2890     // applied last, so we don't want to apply an earlier fneg.
2891     if ((Mods & SISrcMods::ABS) == 0) {
2892       unsigned ModsTmp;
2893       SelectVOP3ModsImpl(Src, Src, ModsTmp);
2894 
2895       if ((ModsTmp & SISrcMods::NEG) != 0)
2896         Mods ^= SISrcMods::NEG;
2897 
2898       if ((ModsTmp & SISrcMods::ABS) != 0)
2899         Mods |= SISrcMods::ABS;
2900     }
2901 
2902     // op_sel/op_sel_hi decide the source type and source.
2903     // If the source's op_sel_hi is set, it indicates to do a conversion from fp16.
2904     // If the sources's op_sel is set, it picks the high half of the source
2905     // register.
2906 
2907     Mods |= SISrcMods::OP_SEL_1;
2908     if (isExtractHiElt(Src, Src)) {
2909       Mods |= SISrcMods::OP_SEL_0;
2910 
2911       // TODO: Should we try to look for neg/abs here?
2912     }
2913 
2914     return true;
2915   }
2916 
2917   return false;
2918 }
2919 
2920 bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
2921                                                SDValue &SrcMods) const {
2922   unsigned Mods = 0;
2923   SelectVOP3PMadMixModsImpl(In, Src, Mods);
2924   SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
2925   return true;
2926 }
2927 
2928 SDValue AMDGPUDAGToDAGISel::getHi16Elt(SDValue In) const {
2929   if (In.isUndef())
2930     return CurDAG->getUNDEF(MVT::i32);
2931 
2932   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(In)) {
2933     SDLoc SL(In);
2934     return CurDAG->getConstant(C->getZExtValue() << 16, SL, MVT::i32);
2935   }
2936 
2937   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(In)) {
2938     SDLoc SL(In);
2939     return CurDAG->getConstant(
2940       C->getValueAPF().bitcastToAPInt().getZExtValue() << 16, SL, MVT::i32);
2941   }
2942 
2943   SDValue Src;
2944   if (isExtractHiElt(In, Src))
2945     return Src;
2946 
2947   return SDValue();
2948 }
2949 
2950 bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
2951   assert(CurDAG->getTarget().getTargetTriple().getArch() == Triple::amdgcn);
2952 
2953   const SIRegisterInfo *SIRI =
2954     static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
2955   const SIInstrInfo * SII =
2956     static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
2957 
2958   unsigned Limit = 0;
2959   bool AllUsesAcceptSReg = true;
2960   for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
2961     Limit < 10 && U != E; ++U, ++Limit) {
2962     const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo());
2963 
2964     // If the register class is unknown, it could be an unknown
2965     // register class that needs to be an SGPR, e.g. an inline asm
2966     // constraint
2967     if (!RC || SIRI->isSGPRClass(RC))
2968       return false;
2969 
2970     if (RC != &AMDGPU::VS_32RegClass) {
2971       AllUsesAcceptSReg = false;
2972       SDNode * User = *U;
2973       if (User->isMachineOpcode()) {
2974         unsigned Opc = User->getMachineOpcode();
2975         MCInstrDesc Desc = SII->get(Opc);
2976         if (Desc.isCommutable()) {
2977           unsigned OpIdx = Desc.getNumDefs() + U.getOperandNo();
2978           unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
2979           if (SII->findCommutedOpIndices(Desc, OpIdx, CommuteIdx1)) {
2980             unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs();
2981             const TargetRegisterClass *CommutedRC = getOperandRegClass(*U, CommutedOpNo);
2982             if (CommutedRC == &AMDGPU::VS_32RegClass)
2983               AllUsesAcceptSReg = true;
2984           }
2985         }
2986       }
2987       // If "AllUsesAcceptSReg == false" so far we haven't suceeded
2988       // commuting current user. This means have at least one use
2989       // that strictly require VGPR. Thus, we will not attempt to commute
2990       // other user instructions.
2991       if (!AllUsesAcceptSReg)
2992         break;
2993     }
2994   }
2995   return !AllUsesAcceptSReg && (Limit < 10);
2996 }
2997 
2998 bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode * N) const {
2999   auto Ld = cast<LoadSDNode>(N);
3000 
3001   return Ld->getAlignment() >= 4 &&
3002         (
3003           (
3004             (
3005               Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS       ||
3006               Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT
3007             )
3008             &&
3009             !N->isDivergent()
3010           )
3011           ||
3012           (
3013             Subtarget->getScalarizeGlobalBehavior() &&
3014             Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
3015             Ld->isSimple() &&
3016             !N->isDivergent() &&
3017             static_cast<const SITargetLowering *>(
3018               getTargetLowering())->isMemOpHasNoClobberedMemOperand(N)
3019           )
3020         );
3021 }
3022 
3023 void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
3024   const AMDGPUTargetLowering& Lowering =
3025     *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
3026   bool IsModified = false;
3027   do {
3028     IsModified = false;
3029 
3030     // Go over all selected nodes and try to fold them a bit more
3031     SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_begin();
3032     while (Position != CurDAG->allnodes_end()) {
3033       SDNode *Node = &*Position++;
3034       MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(Node);
3035       if (!MachineNode)
3036         continue;
3037 
3038       SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
3039       if (ResNode != Node) {
3040         if (ResNode)
3041           ReplaceUses(Node, ResNode);
3042         IsModified = true;
3043       }
3044     }
3045     CurDAG->RemoveDeadNodes();
3046   } while (IsModified);
3047 }
3048 
3049 bool R600DAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
3050   Subtarget = &MF.getSubtarget<R600Subtarget>();
3051   return SelectionDAGISel::runOnMachineFunction(MF);
3052 }
3053 
3054 bool R600DAGToDAGISel::isConstantLoad(const MemSDNode *N, int CbId) const {
3055   if (!N->readMem())
3056     return false;
3057   if (CbId == -1)
3058     return N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
3059            N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
3060 
3061   return N->getAddressSpace() == AMDGPUAS::CONSTANT_BUFFER_0 + CbId;
3062 }
3063 
3064 bool R600DAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr,
3065                                                          SDValue& IntPtr) {
3066   if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Addr)) {
3067     IntPtr = CurDAG->getIntPtrConstant(Cst->getZExtValue() / 4, SDLoc(Addr),
3068                                        true);
3069     return true;
3070   }
3071   return false;
3072 }
3073 
3074 bool R600DAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr,
3075     SDValue& BaseReg, SDValue &Offset) {
3076   if (!isa<ConstantSDNode>(Addr)) {
3077     BaseReg = Addr;
3078     Offset = CurDAG->getIntPtrConstant(0, SDLoc(Addr), true);
3079     return true;
3080   }
3081   return false;
3082 }
3083 
3084 void R600DAGToDAGISel::Select(SDNode *N) {
3085   unsigned int Opc = N->getOpcode();
3086   if (N->isMachineOpcode()) {
3087     N->setNodeId(-1);
3088     return;   // Already selected.
3089   }
3090 
3091   switch (Opc) {
3092   default: break;
3093   case AMDGPUISD::BUILD_VERTICAL_VECTOR:
3094   case ISD::SCALAR_TO_VECTOR:
3095   case ISD::BUILD_VECTOR: {
3096     EVT VT = N->getValueType(0);
3097     unsigned NumVectorElts = VT.getVectorNumElements();
3098     unsigned RegClassID;
3099     // BUILD_VECTOR was lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG
3100     // that adds a 128 bits reg copy when going through TwoAddressInstructions
3101     // pass. We want to avoid 128 bits copies as much as possible because they
3102     // can't be bundled by our scheduler.
3103     switch(NumVectorElts) {
3104     case 2: RegClassID = R600::R600_Reg64RegClassID; break;
3105     case 4:
3106       if (Opc == AMDGPUISD::BUILD_VERTICAL_VECTOR)
3107         RegClassID = R600::R600_Reg128VerticalRegClassID;
3108       else
3109         RegClassID = R600::R600_Reg128RegClassID;
3110       break;
3111     default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
3112     }
3113     SelectBuildVector(N, RegClassID);
3114     return;
3115   }
3116   }
3117 
3118   SelectCode(N);
3119 }
3120 
3121 bool R600DAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
3122                                           SDValue &Offset) {
3123   ConstantSDNode *C;
3124   SDLoc DL(Addr);
3125 
3126   if ((C = dyn_cast<ConstantSDNode>(Addr))) {
3127     Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
3128     Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
3129   } else if ((Addr.getOpcode() == AMDGPUISD::DWORDADDR) &&
3130              (C = dyn_cast<ConstantSDNode>(Addr.getOperand(0)))) {
3131     Base = CurDAG->getRegister(R600::INDIRECT_BASE_ADDR, MVT::i32);
3132     Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
3133   } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
3134             (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
3135     Base = Addr.getOperand(0);
3136     Offset = CurDAG->getTargetConstant(C->getZExtValue(), DL, MVT::i32);
3137   } else {
3138     Base = Addr;
3139     Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
3140   }
3141 
3142   return true;
3143 }
3144 
3145 bool R600DAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
3146                                           SDValue &Offset) {
3147   ConstantSDNode *IMMOffset;
3148 
3149   if (Addr.getOpcode() == ISD::ADD
3150       && (IMMOffset = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))
3151       && isInt<16>(IMMOffset->getZExtValue())) {
3152 
3153       Base = Addr.getOperand(0);
3154       Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr),
3155                                          MVT::i32);
3156       return true;
3157   // If the pointer address is constant, we can move it to the offset field.
3158   } else if ((IMMOffset = dyn_cast<ConstantSDNode>(Addr))
3159              && isInt<16>(IMMOffset->getZExtValue())) {
3160     Base = CurDAG->getCopyFromReg(CurDAG->getEntryNode(),
3161                                   SDLoc(CurDAG->getEntryNode()),
3162                                   R600::ZERO, MVT::i32);
3163     Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), SDLoc(Addr),
3164                                        MVT::i32);
3165     return true;
3166   }
3167 
3168   // Default case, no offset
3169   Base = Addr;
3170   Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
3171   return true;
3172 }
3173