1 //===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// \brief Custom DAG lowering for SI
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #ifdef _MSC_VER
16 // Provide M_PI.
17 #define _USE_MATH_DEFINES
18 #include <cmath>
19 #endif
20 
21 #include "AMDGPU.h"
22 #include "AMDGPUIntrinsicInfo.h"
23 #include "AMDGPUSubtarget.h"
24 #include "SIISelLowering.h"
25 #include "SIInstrInfo.h"
26 #include "SIMachineFunctionInfo.h"
27 #include "SIRegisterInfo.h"
28 #include "llvm/ADT/BitVector.h"
29 #include "llvm/ADT/StringSwitch.h"
30 #include "llvm/CodeGen/CallingConvLower.h"
31 #include "llvm/CodeGen/MachineInstrBuilder.h"
32 #include "llvm/CodeGen/MachineRegisterInfo.h"
33 #include "llvm/CodeGen/SelectionDAG.h"
34 #include "llvm/IR/DiagnosticInfo.h"
35 #include "llvm/IR/Function.h"
36 
37 using namespace llvm;
38 
39 static unsigned findFirstFreeSGPR(CCState &CCInfo) {
40   unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
41   for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
42     if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
43       return AMDGPU::SGPR0 + Reg;
44     }
45   }
46   llvm_unreachable("Cannot allocate sgpr");
47 }
48 
49 SITargetLowering::SITargetLowering(TargetMachine &TM,
50                                    const AMDGPUSubtarget &STI)
51     : AMDGPUTargetLowering(TM, STI) {
52   addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
53   addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
54 
55   addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
56   addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
57 
58   addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
59   addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
60   addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);
61 
62   addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass);
63   addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass);
64 
65   addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
66   addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
67 
68   addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass);
69   addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);
70 
71   addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass);
72   addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);
73 
74   computeRegisterProperties(STI.getRegisterInfo());
75 
76   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
77   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
78   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
79   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);
80 
81   setOperationAction(ISD::ADD, MVT::i32, Legal);
82   setOperationAction(ISD::ADDC, MVT::i32, Legal);
83   setOperationAction(ISD::ADDE, MVT::i32, Legal);
84   setOperationAction(ISD::SUBC, MVT::i32, Legal);
85   setOperationAction(ISD::SUBE, MVT::i32, Legal);
86 
87   setOperationAction(ISD::FSIN, MVT::f32, Custom);
88   setOperationAction(ISD::FCOS, MVT::f32, Custom);
89 
90   setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
91   setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
92 
93   // We need to custom lower vector loads and stores from local memory
94   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
95   setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
96   setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
97 
98   setOperationAction(ISD::STORE, MVT::v8i32, Custom);
99   setOperationAction(ISD::STORE, MVT::v16i32, Custom);
100 
101   setOperationAction(ISD::STORE, MVT::i1, Custom);
102   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
103 
104   setOperationAction(ISD::SELECT, MVT::i64, Custom);
105   setOperationAction(ISD::SELECT, MVT::f64, Promote);
106   AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
107 
108   setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
109   setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
110   setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
111   setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
112 
113   setOperationAction(ISD::SETCC, MVT::i1, Promote);
114   setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
115   setOperationAction(ISD::SETCC, MVT::v4i1, Expand);
116 
117   setOperationAction(ISD::BSWAP, MVT::i32, Legal);
118   setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
119 
120   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Legal);
121   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
122   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);
123 
124   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
125   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
126   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom);
127 
128   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
129   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
130   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);
131 
132   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
133   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);
134 
135   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
136   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
137   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v16i8, Custom);
138   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
139 
140   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
141 
142   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
143   setOperationAction(ISD::BRCOND, MVT::Other, Custom);
144   setOperationAction(ISD::BR_CC, MVT::i32, Expand);
145   setOperationAction(ISD::BR_CC, MVT::i64, Expand);
146   setOperationAction(ISD::BR_CC, MVT::f32, Expand);
147   setOperationAction(ISD::BR_CC, MVT::f64, Expand);
148 
149   // This is s_memtime on SI and s_memrealtime on VI.
150   setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
151 
152   for (MVT VT : MVT::integer_valuetypes()) {
153     if (VT == MVT::i64)
154       continue;
155 
156     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
157     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal);
158     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal);
159     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand);
160 
161     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
162     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal);
163     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal);
164     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand);
165 
166     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
167     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal);
168     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal);
169     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand);
170   }
171 
172   for (MVT VT : MVT::integer_vector_valuetypes()) {
173     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i16, Expand);
174     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v16i16, Expand);
175   }
176 
177   for (MVT VT : MVT::fp_valuetypes())
178     setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
179 
180   setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
181   setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
182 
183   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
184   setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
185   setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
186   setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
187 
188 
189   setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
190 
191   setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
192   setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
193 
194   setOperationAction(ISD::LOAD, MVT::i1, Custom);
195 
196   setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
197   AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
198 
199   setOperationAction(ISD::STORE, MVT::v2i64, Promote);
200   AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
201 
202   setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand);
203 
204   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
205   setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
206   setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
207 
208   // These should use UDIVREM, so set them to expand
209   setOperationAction(ISD::UDIV, MVT::i64, Expand);
210   setOperationAction(ISD::UREM, MVT::i64, Expand);
211 
212   setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
213   setOperationAction(ISD::SELECT, MVT::i1, Promote);
214 
215   setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand);
216 
217 
218   setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
219 
220   // We only support LOAD/STORE and vector manipulation ops for vectors
221   // with > 4 elements, and for the 64-bit element vectors v2i64 / v2f64.
222   for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64}) {
223     for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
224       switch(Op) {
225       case ISD::LOAD:
226       case ISD::STORE:
227       case ISD::BUILD_VECTOR:
228       case ISD::BITCAST:
229       case ISD::EXTRACT_VECTOR_ELT:
230       case ISD::INSERT_VECTOR_ELT:
231       case ISD::INSERT_SUBVECTOR:
232       case ISD::EXTRACT_SUBVECTOR:
233       case ISD::SCALAR_TO_VECTOR:
234         break;
235       case ISD::CONCAT_VECTORS:
236         setOperationAction(Op, VT, Custom);
237         break;
238       default:
239         setOperationAction(Op, VT, Expand);
240         break;
241       }
242     }
243   }
244 
245   // Most operations are naturally 32-bit vector operations. We only support
246   // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
247   for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
248     setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
249     AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
250 
251     setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
252     AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
253 
254     setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
255     AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
256 
257     setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
258     AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
259   }
260 
261   if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
262     setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
263     setOperationAction(ISD::FCEIL, MVT::f64, Legal);
264     setOperationAction(ISD::FRINT, MVT::f64, Legal);
265   }
266 
267   setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
268   setOperationAction(ISD::FDIV, MVT::f32, Custom);
269   setOperationAction(ISD::FDIV, MVT::f64, Custom);
270 
271   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling
272   // and output demarshalling.
273   setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
274   setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
275 
276   // We can't return success/failure, only the old value,
277   // so let LLVM add the comparison.
278   setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand);
279   setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand);
280 
281   setTargetDAGCombine(ISD::FADD);
282   setTargetDAGCombine(ISD::FSUB);
283   setTargetDAGCombine(ISD::FMINNUM);
284   setTargetDAGCombine(ISD::FMAXNUM);
285   setTargetDAGCombine(ISD::SMIN);
286   setTargetDAGCombine(ISD::SMAX);
287   setTargetDAGCombine(ISD::UMIN);
288   setTargetDAGCombine(ISD::UMAX);
289   setTargetDAGCombine(ISD::SETCC);
290   setTargetDAGCombine(ISD::AND);
291   setTargetDAGCombine(ISD::OR);
292   setTargetDAGCombine(ISD::UINT_TO_FP);
293   setTargetDAGCombine(ISD::FCANONICALIZE);
294 
295   // All memory operations. Some folding on the pointer operand is done to help
296   // matching the constant offsets in the addressing modes.
297   setTargetDAGCombine(ISD::LOAD);
298   setTargetDAGCombine(ISD::STORE);
299   setTargetDAGCombine(ISD::ATOMIC_LOAD);
300   setTargetDAGCombine(ISD::ATOMIC_STORE);
301   setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP);
302   setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS);
303   setTargetDAGCombine(ISD::ATOMIC_SWAP);
304   setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD);
305   setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB);
306   setTargetDAGCombine(ISD::ATOMIC_LOAD_AND);
307   setTargetDAGCombine(ISD::ATOMIC_LOAD_OR);
308   setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR);
309   setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND);
310   setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN);
311   setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX);
312   setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN);
313   setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX);
314 
315   setSchedulingPreference(Sched::RegPressure);
316 }
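
// For illustration only (not part of the lowering itself): a "Promote" entry
// such as the f64 SELECT above does not expand the node; the legalizer
// re-expresses it in the promoted type. A minimal sketch of the rewrite,
// assuming an i1 condition %c and f64 values %a and %b:
//
//   (select %c, f64 %a, f64 %b)
//     --> (bitcast (select %c, (bitcast %a to i64),
//                              (bitcast %b to i64)) to f64)
//
// so the f64 select is ultimately funneled through the custom i64 SELECT
// lowering registered above.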
317 
318 //===----------------------------------------------------------------------===//
319 // TargetLowering queries
320 //===----------------------------------------------------------------------===//
321 
322 bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
323                                           const CallInst &CI,
324                                           unsigned IntrID) const {
325   switch (IntrID) {
326   case Intrinsic::amdgcn_atomic_inc:
327   case Intrinsic::amdgcn_atomic_dec:
328     Info.opc = ISD::INTRINSIC_W_CHAIN;
329     Info.memVT = MVT::getVT(CI.getType());
330     Info.ptrVal = CI.getOperand(0);
331     Info.align = 0;
332     Info.vol = false;
333     Info.readMem = true;
334     Info.writeMem = true;
335     return true;
336   default:
337     return false;
338   }
339 }
340 
341 bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &,
342                                           EVT) const {
343   // SI has some legal vector types, but no legal vector operations. Say no
344   // shuffles are legal in order to prefer scalarizing some vector operations.
345   return false;
346 }
347 
348 bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
349   // Flat instructions do not have offsets, and only have the register
350   // address.
351   return AM.BaseOffs == 0 && (AM.Scale == 0 || AM.Scale == 1);
352 }
353 
354 bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
355   // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
356   // additionally can do r + r + i with addr64. 32-bit has more addressing
357   // mode options. Depending on the resource constant, it can also do
358   // (i64 r0) + (i32 r1) * (i14 i).
359   //
360   // Private arrays end up using a scratch buffer most of the time, so also
361   // assume those use MUBUF instructions. Scratch loads / stores are currently
362   // implemented as mubuf instructions with offen bit set, so slightly
363   // different than the normal addr64.
364   if (!isUInt<12>(AM.BaseOffs))
365     return false;
366 
367   // FIXME: Since we can split immediate into soffset and immediate offset,
368   // would it make sense to allow any immediate?
369 
370   switch (AM.Scale) {
371   case 0: // r + i or just i, depending on HasBaseReg.
372     return true;
373   case 1:
374     return true; // We have r + r or r + i.
375   case 2:
376     if (AM.HasBaseReg) {
377       // Reject 2 * r + r.
378       return false;
379     }
380 
381     // Allow 2 * r as r + r
382     // Or  2 * r + i is allowed as r + r + i.
383     return true;
384   default: // Don't allow n * r
385     return false;
386   }
387 }
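
// For illustration only: the MUBUF checks above boil down to "the immediate
// must fit in 12 unsigned bits, and the scale must be 0, 1, or 2 with no base
// register when it is 2". The helper below is a hypothetical, unused
// restatement of that rule on plain integers; e.g. BaseOffs = 4095 with
// Scale = 1 is accepted (r + r + i), while BaseOffs = 4096 is rejected
// because it needs 13 bits.
inline bool isLegalMUBUFAddressingModeSketch(int64_t BaseOffs, int64_t Scale,
                                             bool HasBaseReg) {
  if (!isUInt<12>(BaseOffs))
    return false;                   // Immediate does not fit the 12-bit field.
  if (Scale == 0 || Scale == 1)
    return true;                    // i, r + i, r + r, or r + r + i.
  return Scale == 2 && !HasBaseReg; // 2 * r (+ i) folds to r + r (+ i).
}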
388 
389 bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
390                                              const AddrMode &AM, Type *Ty,
391                                              unsigned AS) const {
392   // No global is ever allowed as a base.
393   if (AM.BaseGV)
394     return false;
395 
396   switch (AS) {
397   case AMDGPUAS::GLOBAL_ADDRESS: {
398     if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
399       // Assume that we will use FLAT for all global memory accesses
400       // on VI.
401       // FIXME: This assumption is currently wrong.  On VI we still use
402       // MUBUF instructions for the r + i addressing mode.  As currently
403       // implemented, the MUBUF instructions only work on buffer < 4GB.
404       // It may be possible to support > 4GB buffers with MUBUF instructions,
405       // by setting the stride value in the resource descriptor which would
406       // increase the size limit to (stride * 4GB).  However, this is risky,
407       // because it has never been validated.
408       return isLegalFlatAddressingMode(AM);
409     }
410 
411     return isLegalMUBUFAddressingMode(AM);
412   }
413   case AMDGPUAS::CONSTANT_ADDRESS: {
414     // If the offset isn't a multiple of 4, it probably isn't going to be
415     // correctly aligned.
416     if (AM.BaseOffs % 4 != 0)
417       return isLegalMUBUFAddressingMode(AM);
418 
419     // There are no SMRD extloads, so if we have to do a small type access we
420     // will use a MUBUF load.
421     // FIXME?: We also need to do this if unaligned, but we don't know the
422     // alignment here.
423     if (DL.getTypeStoreSize(Ty) < 4)
424       return isLegalMUBUFAddressingMode(AM);
425 
426     if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
427       // SMRD instructions have an 8-bit dword offset on SI.
428       if (!isUInt<8>(AM.BaseOffs / 4))
429         return false;
430     } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
431       // On CI+, this can also be a 32-bit literal constant offset. If it fits
432       // in 8-bits, it can use a smaller encoding.
433       if (!isUInt<32>(AM.BaseOffs / 4))
434         return false;
435     } else if (Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
436       // On VI, these use the SMEM format and take a 20-bit byte offset.
437       if (!isUInt<20>(AM.BaseOffs))
438         return false;
439     } else
440       llvm_unreachable("unhandled generation");
441 
442     if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
443       return true;
444 
445     if (AM.Scale == 1 && AM.HasBaseReg)
446       return true;
447 
448     return false;
449   }
450 
451   case AMDGPUAS::PRIVATE_ADDRESS:
452   case AMDGPUAS::UNKNOWN_ADDRESS_SPACE:
453     return isLegalMUBUFAddressingMode(AM);
454 
455   case AMDGPUAS::LOCAL_ADDRESS:
456   case AMDGPUAS::REGION_ADDRESS: {
457     // Basic, single offset DS instructions allow a 16-bit unsigned immediate
458     // field.
459     // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
460     // an 8-bit dword offset but we don't know the alignment here.
461     if (!isUInt<16>(AM.BaseOffs))
462       return false;
463 
464     if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
465       return true;
466 
467     if (AM.Scale == 1 && AM.HasBaseReg)
468       return true;
469 
470     return false;
471   }
472   case AMDGPUAS::FLAT_ADDRESS:
473     return isLegalFlatAddressingMode(AM);
474 
475   default:
476     llvm_unreachable("unhandled address space");
477   }
478 }
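
// For illustration only: a worked example of the SMRD limits used above. The
// encoded immediate is a dword offset on SI and CI but a byte offset on VI, so
// a byte offset of 1024 encodes as 1024 / 4 = 256 dwords, which overflows the
// 8-bit SI field but is fine on CI (32-bit literal) and on VI (20-bit byte
// offset, i.e. just under 1 MiB). A hypothetical, unused restatement of the
// per-generation checks:
inline bool fitsSMRDImmediateSketch(uint64_t ByteOffset, bool IsSI, bool IsCI) {
  if (IsSI)
    return isUInt<8>(ByteOffset / 4);  // SI: 8-bit dword offset.
  if (IsCI)
    return isUInt<32>(ByteOffset / 4); // CI: 32-bit literal dword offset.
  return isUInt<20>(ByteOffset);       // VI: 20-bit byte offset.
}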
479 
480 bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
481                                                       unsigned AddrSpace,
482                                                       unsigned Align,
483                                                       bool *IsFast) const {
484   if (IsFast)
485     *IsFast = false;
486 
487   // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
488   // which isn't a simple VT.
489   if (!VT.isSimple() || VT == MVT::Other)
490     return false;
491 
492   // TODO - CI+ supports unaligned memory accesses, but this requires driver
493   // support.
494 
495   // XXX - The only mention I see of this in the ISA manual is for LDS direct
496   // reads, where the byte address "must be dword aligned". Is it also true for
497   // the normal loads and stores?
498   if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS) {
499     // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
500     // aligned, 8 byte access in a single operation using ds_read2/write2_b32
501     // with adjacent offsets.
502     bool AlignedBy4 = (Align % 4 == 0);
503     if (IsFast)
504       *IsFast = AlignedBy4;
505     return AlignedBy4;
506   }
507 
508   // Accesses smaller than a dword must be aligned.
509   // FIXME: This should be allowed on CI+
510   if (VT.bitsLT(MVT::i32))
511     return false;
512 
513   // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
514   // byte-address are ignored, thus forcing Dword alignment.
515   // This applies to private, global, and constant memory.
516   if (IsFast)
517     *IsFast = true;
518 
519   return VT.bitsGT(MVT::i32) && Align % 4 == 0;
520 }
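
// For illustration: the LDS case above means an 8-byte value that is only
// 4-byte aligned can still be handled in one instruction, because
// ds_read2_b32 / ds_write2_b32 take two independent dword offsets (e.g.
// offset0 = N, offset1 = N + 1 for adjacent dwords) and require only dword
// alignment. A hypothetical, unused restatement of the alignment test that
// both the LDS path and the larger-than-dword path above rely on:
inline bool isDwordAlignedSketch(unsigned AlignInBytes) {
  // For dword and larger accesses the hardware ignores the two LSBs of the
  // byte address, so dword alignment is what matters.
  return AlignInBytes % 4 == 0;
}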
521 
522 EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
523                                           unsigned SrcAlign, bool IsMemset,
524                                           bool ZeroMemset,
525                                           bool MemcpyStrSrc,
526                                           MachineFunction &MF) const {
527   // FIXME: Should account for address space here.
528 
529   // The default fallback uses the private pointer size as a guess for a type to
530   // use. Make sure we switch these to 64-bit accesses.
531 
532   if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global
533     return MVT::v4i32;
534 
535   if (Size >= 8 && DstAlign >= 4)
536     return MVT::v2i32;
537 
538   // Use the default.
539   return MVT::Other;
540 }
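
// For illustration, the selection above behaves roughly as follows:
//   getOptimalMemOpType(/*Size=*/16, /*DstAlign=*/4, ...) == MVT::v4i32
//   getOptimalMemOpType(/*Size=*/8,  /*DstAlign=*/4, ...) == MVT::v2i32
//   getOptimalMemOpType(/*Size=*/4,  /*DstAlign=*/4, ...) == MVT::Other
// i.e. sufficiently large, dword-aligned memcpy / memset expansions get
// 128-bit or 64-bit chunks, and everything else falls back to the generic
// choice.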
541 
542 static bool isFlatGlobalAddrSpace(unsigned AS) {
543   return AS == AMDGPUAS::GLOBAL_ADDRESS ||
544     AS == AMDGPUAS::FLAT_ADDRESS ||
545     AS == AMDGPUAS::CONSTANT_ADDRESS;
546 }
547 
548 bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
549                                            unsigned DestAS) const {
550   return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS);
551 }
552 
553 
554 bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
555   const MemSDNode *MemNode = cast<MemSDNode>(N);
556   const Value *Ptr = MemNode->getMemOperand()->getValue();
557 
558   // UndefValue means this is a load of a kernel input.  These are uniform.
559   // Sometimes LDS instructions have constant pointers
560   if (isa<UndefValue>(Ptr) || isa<Argument>(Ptr) || isa<Constant>(Ptr) ||
561       isa<GlobalValue>(Ptr))
562     return true;
563 
564   const Instruction *I = dyn_cast_or_null<Instruction>(Ptr);
565   return I && I->getMetadata("amdgpu.uniform");
566 }
567 
568 TargetLoweringBase::LegalizeTypeAction
569 SITargetLowering::getPreferredVectorAction(EVT VT) const {
570   if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16))
571     return TypeSplitVector;
572 
573   return TargetLoweringBase::getPreferredVectorAction(VT);
574 }
575 
576 bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
577                                                          Type *Ty) const {
578   const SIInstrInfo *TII =
579       static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
580   return TII->isInlineConstant(Imm);
581 }
582 
583 bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
584 
585   // SimplifySetCC uses this function to determine whether or not it should
586   // create setcc with i1 operands.  We don't have instructions for i1 setcc.
587   if (VT == MVT::i1 && Op == ISD::SETCC)
588     return false;
589 
590   return TargetLowering::isTypeDesirableForOp(Op, VT);
591 }
592 
593 SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
594                                          SDLoc SL, SDValue Chain,
595                                          unsigned Offset, bool Signed) const {
596   const DataLayout &DL = DAG.getDataLayout();
597   MachineFunction &MF = DAG.getMachineFunction();
598   const SIRegisterInfo *TRI =
599       static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo());
600   unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
601 
602   Type *Ty = VT.getTypeForEVT(*DAG.getContext());
603 
604   MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
605   MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
606   PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
607   SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
608                                        MRI.getLiveInVirtReg(InputPtrReg), PtrVT);
609   SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
610                             DAG.getConstant(Offset, SL, PtrVT));
611   SDValue PtrOffset = DAG.getUNDEF(PtrVT);
612   MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
613 
614   unsigned Align = DL.getABITypeAlignment(Ty);
615 
616   ISD::LoadExtType ExtTy = Signed ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
617   if (MemVT.isFloatingPoint())
618     ExtTy = ISD::EXTLOAD;
619 
620   return DAG.getLoad(ISD::UNINDEXED, ExtTy,
621                      VT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemVT,
622                      false, // isVolatile
623                      true, // isNonTemporal
624                      true, // isInvariant
625                      Align); // Alignment
626 }
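
// A worked example of the offsets involved (illustrative, not normative): the
// caller passes Offset = getExplicitKernelArgOffset() plus the argument's
// location offset, so with the 36-byte dispatch header described in
// LowerFormalArguments a kernel taking (i32 %a, float %b) would have %a loaded
// from KERNARG_SEGMENT_PTR + 36 and %b from KERNARG_SEGMENT_PTR + 40, each as
// a dword-aligned, invariant extending load. The exact numbers depend on the
// subtarget and the usual ABI alignment rules, so treat them as an example
// only.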
627 
628 SDValue SITargetLowering::LowerFormalArguments(
629     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
630     const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG,
631     SmallVectorImpl<SDValue> &InVals) const {
632   const SIRegisterInfo *TRI =
633       static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
634 
635   MachineFunction &MF = DAG.getMachineFunction();
636   FunctionType *FType = MF.getFunction()->getFunctionType();
637   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
638   const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
639 
640   if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
641     const Function *Fn = MF.getFunction();
642     DiagnosticInfoUnsupported NoGraphicsHSA(
643         *Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
644     DAG.getContext()->diagnose(NoGraphicsHSA);
645     return SDValue();
646   }
647 
648   SmallVector<ISD::InputArg, 16> Splits;
649   BitVector Skipped(Ins.size());
650 
651   for (unsigned i = 0, e = Ins.size(), PSInputNum = 0; i != e; ++i) {
652     const ISD::InputArg &Arg = Ins[i];
653 
654     // First check if it's a PS input addr
655     if (CallConv == CallingConv::AMDGPU_PS && !Arg.Flags.isInReg() &&
656         !Arg.Flags.isByVal() && PSInputNum <= 15) {
657 
658       if (!Arg.Used && !Info->isPSInputAllocated(PSInputNum)) {
659         // We can safely skip PS inputs
660         Skipped.set(i);
661         ++PSInputNum;
662         continue;
663       }
664 
665       Info->markPSInputAllocated(PSInputNum);
666       if (Arg.Used)
667         Info->PSInputEna |= 1 << PSInputNum;
668 
669       ++PSInputNum;
670     }
671 
672     // Second, split vertices into their elements.
673     if (AMDGPU::isShader(CallConv) &&
674         Arg.VT.isVector()) {
675       ISD::InputArg NewArg = Arg;
676       NewArg.Flags.setSplit();
677       NewArg.VT = Arg.VT.getVectorElementType();
678 
679       // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
680       // three or five element vertex only needs three or five registers,
681       // NOT four or eight.
682       Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
683       unsigned NumElements = ParamType->getVectorNumElements();
684 
685       for (unsigned j = 0; j != NumElements; ++j) {
686         Splits.push_back(NewArg);
687         NewArg.PartOffset += NewArg.VT.getStoreSize();
688       }
689 
690     } else if (AMDGPU::isShader(CallConv)) {
691       Splits.push_back(Arg);
692     }
693   }
694 
695   SmallVector<CCValAssign, 16> ArgLocs;
696   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
697                  *DAG.getContext());
698 
699   // At least one interpolation mode must be enabled or else the GPU will hang.
700   //
701   // Check PSInputAddr instead of PSInputEna. The idea is that if the user set
702   // PSInputAddr, the user wants to enable some bits after the compilation
703   // based on run-time states. Since we can't know what the final PSInputEna
704   // will look like, we shouldn't do anything here and the user should take
705   // responsibility for the correct programming.
706   //
707   // Otherwise, the following restrictions apply:
708   // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
709   // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
710   //   enabled too.
711   if (CallConv == CallingConv::AMDGPU_PS &&
712       ((Info->getPSInputAddr() & 0x7F) == 0 ||
713        ((Info->getPSInputAddr() & 0xF) == 0 &&
714         Info->isPSInputAllocated(11)))) {
715     CCInfo.AllocateReg(AMDGPU::VGPR0);
716     CCInfo.AllocateReg(AMDGPU::VGPR1);
717     Info->markPSInputAllocated(0);
718     Info->PSInputEna |= 1;
719   }
720 
721   if (!AMDGPU::isShader(CallConv)) {
722     getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins,
723                             Splits);
724 
725     assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
726   } else {
727     assert(!Info->hasPrivateSegmentBuffer() && !Info->hasDispatchPtr() &&
728            !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&
729            !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
730            !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
731            !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
732            !Info->hasWorkItemIDZ());
733   }
734 
735   // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
736   if (Info->hasPrivateSegmentBuffer()) {
737     unsigned PrivateSegmentBufferReg = Info->addPrivateSegmentBuffer(*TRI);
738     MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SReg_128RegClass);
739     CCInfo.AllocateReg(PrivateSegmentBufferReg);
740   }
741 
742   if (Info->hasDispatchPtr()) {
743     unsigned DispatchPtrReg = Info->addDispatchPtr(*TRI);
744     MF.addLiveIn(DispatchPtrReg, &AMDGPU::SReg_64RegClass);
745     CCInfo.AllocateReg(DispatchPtrReg);
746   }
747 
748   if (Info->hasKernargSegmentPtr()) {
749     unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI);
750     MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass);
751     CCInfo.AllocateReg(InputPtrReg);
752   }
753 
754   if (Info->hasFlatScratchInit()) {
755     unsigned FlatScratchInitReg = Info->addFlatScratchInit(*TRI);
756     MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SReg_64RegClass);
757     CCInfo.AllocateReg(FlatScratchInitReg);
758   }
759 
760   AnalyzeFormalArguments(CCInfo, Splits);
761 
762   SmallVector<SDValue, 16> Chains;
763 
764   for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
765 
766     const ISD::InputArg &Arg = Ins[i];
767     if (Skipped[i]) {
768       InVals.push_back(DAG.getUNDEF(Arg.VT));
769       continue;
770     }
771 
772     CCValAssign &VA = ArgLocs[ArgIdx++];
773     MVT VT = VA.getLocVT();
774 
775     if (VA.isMemLoc()) {
776       VT = Ins[i].VT;
777       EVT MemVT = Splits[i].VT;
778       const unsigned Offset = Subtarget->getExplicitKernelArgOffset() +
779                               VA.getLocMemOffset();
780       // The first 36 bytes of the input buffer contain information about
781       // thread group and global sizes.
782       SDValue Arg = LowerParameter(DAG, VT, MemVT, DL, Chain,
783                                    Offset, Ins[i].Flags.isSExt());
784       Chains.push_back(Arg.getValue(1));
785 
786       auto *ParamTy =
787         dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
788       if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
789           ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
790         // On SI local pointers are just offsets into LDS, so they are always
791         // less than 16 bits.  On CI and newer they could potentially be
792         // real pointers, so we can't guarantee their size.
793         Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
794                           DAG.getValueType(MVT::i16));
795       }
796 
797       InVals.push_back(Arg);
798       Info->ABIArgOffset = Offset + MemVT.getStoreSize();
799       continue;
800     }
801     assert(VA.isRegLoc() && "Parameter must be in a register!");
802 
803     unsigned Reg = VA.getLocReg();
804 
805     if (VT == MVT::i64) {
806       // For now assume it is a pointer
807       Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0,
808                                      &AMDGPU::SReg_64RegClass);
809       Reg = MF.addLiveIn(Reg, &AMDGPU::SReg_64RegClass);
810       SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
811       InVals.push_back(Copy);
812       continue;
813     }
814 
815     const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
816 
817     Reg = MF.addLiveIn(Reg, RC);
818     SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
819 
820     if (Arg.VT.isVector()) {
821 
822       // Build a vector from the registers
823       Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
824       unsigned NumElements = ParamType->getVectorNumElements();
825 
826       SmallVector<SDValue, 4> Regs;
827       Regs.push_back(Val);
828       for (unsigned j = 1; j != NumElements; ++j) {
829         Reg = ArgLocs[ArgIdx++].getLocReg();
830         Reg = MF.addLiveIn(Reg, RC);
831 
832         SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
833         Regs.push_back(Copy);
834       }
835 
836       // Fill up the missing vector elements
837       NumElements = Arg.VT.getVectorNumElements() - NumElements;
838       Regs.append(NumElements, DAG.getUNDEF(VT));
839 
840       InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, DL, Arg.VT, Regs));
841       continue;
842     }
843 
844     InVals.push_back(Val);
845   }
846 
847   // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
848   // these from the dispatch pointer.
849 
850   // Start adding system SGPRs.
851   if (Info->hasWorkGroupIDX()) {
852     unsigned Reg = Info->addWorkGroupIDX();
853     MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
854     CCInfo.AllocateReg(Reg);
855   }
856 
857   if (Info->hasWorkGroupIDY()) {
858     unsigned Reg = Info->addWorkGroupIDY();
859     MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
860     CCInfo.AllocateReg(Reg);
861   }
862 
863   if (Info->hasWorkGroupIDZ()) {
864     unsigned Reg = Info->addWorkGroupIDZ();
865     MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
866     CCInfo.AllocateReg(Reg);
867   }
868 
869   if (Info->hasWorkGroupInfo()) {
870     unsigned Reg = Info->addWorkGroupInfo();
871     MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
872     CCInfo.AllocateReg(Reg);
873   }
874 
875   if (Info->hasPrivateSegmentWaveByteOffset()) {
876     // Scratch wave offset passed in system SGPR.
877     unsigned PrivateSegmentWaveByteOffsetReg;
878 
879     if (AMDGPU::isShader(CallConv)) {
880       PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
881       Info->setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
882     } else
883       PrivateSegmentWaveByteOffsetReg = Info->addPrivateSegmentWaveByteOffset();
884 
885     MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
886     CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
887   }
888 
889   // Now that we've figured out where the scratch register inputs are, see if
890   // we should reserve the arguments and use them directly.
891   bool HasStackObjects = MF.getFrameInfo()->hasStackObjects();
892   // Record that we know we have non-spill stack objects so we don't need to
893   // check all stack objects later.
894   if (HasStackObjects)
895     Info->setHasNonSpillStackObjects(true);
896 
897   if (ST.isAmdHsaOS()) {
898     // TODO: Assume we will spill without optimizations.
899     if (HasStackObjects) {
900       // If we have stack objects, we unquestionably need the private buffer
901       // resource. For the HSA ABI, this will be the first 4 user SGPR
902       // inputs. We can reserve those and use them directly.
903 
904       unsigned PrivateSegmentBufferReg = TRI->getPreloadedValue(
905         MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
906       Info->setScratchRSrcReg(PrivateSegmentBufferReg);
907 
908       unsigned PrivateSegmentWaveByteOffsetReg = TRI->getPreloadedValue(
909         MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
910       Info->setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
911     } else {
912       unsigned ReservedBufferReg
913         = TRI->reservedPrivateSegmentBufferReg(MF);
914       unsigned ReservedOffsetReg
915         = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);
916 
917       // We tentatively reserve the last registers (skipping the last two
918       // which may contain VCC). After register allocation, we'll replace
919       // these with the ones immediately after those which were really
920       // allocated. In the prologue copies will be inserted from the argument
921       // to these reserved registers.
922       Info->setScratchRSrcReg(ReservedBufferReg);
923       Info->setScratchWaveOffsetReg(ReservedOffsetReg);
924     }
925   } else {
926     unsigned ReservedBufferReg = TRI->reservedPrivateSegmentBufferReg(MF);
927 
928     // Without HSA, relocations are used for the scratch pointer and the
929     // buffer resource setup is always inserted in the prologue. Scratch wave
930     // offset is still in an input SGPR.
931     Info->setScratchRSrcReg(ReservedBufferReg);
932 
933     if (HasStackObjects) {
934       unsigned ScratchWaveOffsetReg = TRI->getPreloadedValue(
935         MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
936       Info->setScratchWaveOffsetReg(ScratchWaveOffsetReg);
937     } else {
938       unsigned ReservedOffsetReg
939         = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);
940       Info->setScratchWaveOffsetReg(ReservedOffsetReg);
941     }
942   }
943 
944   if (Info->hasWorkItemIDX()) {
945     unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X);
946     MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
947     CCInfo.AllocateReg(Reg);
948   }
949 
950   if (Info->hasWorkItemIDY()) {
951     unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y);
952     MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
953     CCInfo.AllocateReg(Reg);
954   }
955 
956   if (Info->hasWorkItemIDZ()) {
957     unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z);
958     MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
959     CCInfo.AllocateReg(Reg);
960   }
961 
962   if (Chains.empty())
963     return Chain;
964 
965   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
966 }
967 
968 SDValue SITargetLowering::LowerReturn(SDValue Chain,
969                                       CallingConv::ID CallConv,
970                                       bool isVarArg,
971                                       const SmallVectorImpl<ISD::OutputArg> &Outs,
972                                       const SmallVectorImpl<SDValue> &OutVals,
973                                       SDLoc DL, SelectionDAG &DAG) const {
974   MachineFunction &MF = DAG.getMachineFunction();
975   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
976 
977   if (!AMDGPU::isShader(CallConv))
978     return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
979                                              OutVals, DL, DAG);
980 
981   Info->setIfReturnsVoid(Outs.size() == 0);
982 
983   SmallVector<ISD::OutputArg, 48> Splits;
984   SmallVector<SDValue, 48> SplitVals;
985 
986   // Split vectors into their elements.
987   for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
988     const ISD::OutputArg &Out = Outs[i];
989 
990     if (Out.VT.isVector()) {
991       MVT VT = Out.VT.getVectorElementType();
992       ISD::OutputArg NewOut = Out;
993       NewOut.Flags.setSplit();
994       NewOut.VT = VT;
995 
996       // We want the original number of vector elements here, e.g.
997       // three or five, not four or eight.
998       unsigned NumElements = Out.ArgVT.getVectorNumElements();
999 
1000       for (unsigned j = 0; j != NumElements; ++j) {
1001         SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, OutVals[i],
1002                                    DAG.getConstant(j, DL, MVT::i32));
1003         SplitVals.push_back(Elem);
1004         Splits.push_back(NewOut);
1005         NewOut.PartOffset += NewOut.VT.getStoreSize();
1006       }
1007     } else {
1008       SplitVals.push_back(OutVals[i]);
1009       Splits.push_back(Out);
1010     }
1011   }
1012 
1013   // CCValAssign - represent the assignment of the return value to a location.
1014   SmallVector<CCValAssign, 48> RVLocs;
1015 
1016   // CCState - Info about the registers and stack slots.
1017   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
1018                  *DAG.getContext());
1019 
1020   // Analyze outgoing return values.
1021   AnalyzeReturn(CCInfo, Splits);
1022 
1023   SDValue Flag;
1024   SmallVector<SDValue, 48> RetOps;
1025   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
1026 
1027   // Copy the result values into the output registers.
1028   for (unsigned i = 0, realRVLocIdx = 0;
1029        i != RVLocs.size();
1030        ++i, ++realRVLocIdx) {
1031     CCValAssign &VA = RVLocs[i];
1032     assert(VA.isRegLoc() && "Can only return in registers!");
1033 
1034     SDValue Arg = SplitVals[realRVLocIdx];
1035 
1036     // Copied from other backends.
1037     switch (VA.getLocInfo()) {
1038     default: llvm_unreachable("Unknown loc info!");
1039     case CCValAssign::Full:
1040       break;
1041     case CCValAssign::BCvt:
1042       Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
1043       break;
1044     }
1045 
1046     Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
1047     Flag = Chain.getValue(1);
1048     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
1049   }
1050 
1051   // Update chain and glue.
1052   RetOps[0] = Chain;
1053   if (Flag.getNode())
1054     RetOps.push_back(Flag);
1055 
1056   return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, RetOps);
1057 }
1058 
1059 unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
1060                                              SelectionDAG &DAG) const {
1061   unsigned Reg = StringSwitch<unsigned>(RegName)
1062     .Case("m0", AMDGPU::M0)
1063     .Case("exec", AMDGPU::EXEC)
1064     .Case("exec_lo", AMDGPU::EXEC_LO)
1065     .Case("exec_hi", AMDGPU::EXEC_HI)
1066     .Case("flat_scratch", AMDGPU::FLAT_SCR)
1067     .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
1068     .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
1069     .Default(AMDGPU::NoRegister);
1070 
1071   if (Reg == AMDGPU::NoRegister) {
1072     report_fatal_error(Twine("invalid register name \""
1073                              + StringRef(RegName)  + "\"."));
1074 
1075   }
1076 
1077   if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
1078       Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
1079     report_fatal_error(Twine("invalid register \""
1080                              + StringRef(RegName)  + "\" for subtarget."));
1081   }
1082 
1083   switch (Reg) {
1084   case AMDGPU::M0:
1085   case AMDGPU::EXEC_LO:
1086   case AMDGPU::EXEC_HI:
1087   case AMDGPU::FLAT_SCR_LO:
1088   case AMDGPU::FLAT_SCR_HI:
1089     if (VT.getSizeInBits() == 32)
1090       return Reg;
1091     break;
1092   case AMDGPU::EXEC:
1093   case AMDGPU::FLAT_SCR:
1094     if (VT.getSizeInBits() == 64)
1095       return Reg;
1096     break;
1097   default:
1098     llvm_unreachable("missing register type checking");
1099   }
1100 
1101   report_fatal_error(Twine("invalid type for register \""
1102                            + StringRef(RegName) + "\"."));
1103 }
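
// Example use (illustrative): IR such as
//
//   %e = call i64 @llvm.read_register.i64(metadata !0)   ; !0 = !{!"exec"}
//
// reaches this hook with RegName == "exec" and VT == i64 and maps to
// AMDGPU::EXEC, while asking for "exec" as an i32 (or for "flat_scratch" on
// SI) is rejected with a fatal error by the checks above.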
1104 
1105 MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
1106   MachineInstr *MI, MachineBasicBlock *BB) const {
1107   switch (MI->getOpcode()) {
1108   case AMDGPU::SI_INIT_M0: {
1109     const SIInstrInfo *TII =
1110       static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
1111     BuildMI(*BB, MI->getIterator(), MI->getDebugLoc(),
1112             TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1113       .addOperand(MI->getOperand(0));
1114     MI->eraseFromParent();
1115     break;
1116   }
1117   case AMDGPU::BRANCH:
1118     return BB;
1119   case AMDGPU::GET_GROUPSTATICSIZE: {
1120     const SIInstrInfo *TII =
1121       static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
1122     MachineFunction *MF = BB->getParent();
1123     SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1124     DebugLoc DL = MI->getDebugLoc();
1125     BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOVK_I32))
1126       .addOperand(MI->getOperand(0))
1127       .addImm(MFI->LDSSize);
1128     MI->eraseFromParent();
1129     return BB;
1130   }
1131   default:
1132     return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
1133   }
1134   return BB;
1135 }
1136 
1137 bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
1138   // This currently forces unfolding various combinations of fsub into fma with
1139   // free fneg'd operands. As long as we have fast FMA (controlled by
1140   // isFMAFasterThanFMulAndFAdd), we should perform these.
1141 
1142   // When fma is quarter rate, e.g. for f64 where add / sub are at best half
1143   // rate, most of these combines appear to be cycle neutral but save on
1144   // instruction count / code size.
1145   return true;
1146 }
1147 
1148 EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
1149                                          EVT VT) const {
1150   if (!VT.isVector()) {
1151     return MVT::i1;
1152   }
1153   return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
1154 }
1155 
1156 MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT) const {
1157   return MVT::i32;
1158 }
1159 
1160 // Answering this is somewhat tricky and depends on the specific device, since
1161 // devices have different rates for fma and for f64 operations in general.
1162 //
1163 // v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
1164 // regardless of which device (although the number of cycles differs between
1165 // devices), so it is always profitable for f64.
1166 //
1167 // v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
1168 // only on full rate devices. Normally, we should prefer selecting v_mad_f32
1169 // which we can always do even without fused FP ops since it returns the same
1170 // result as the separate operations and since it is always full
1171 // rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
1172 // however does not support denormals, so we do report fma as faster if we have
1173 // a fast fma device and require denormals.
1174 //
1175 bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
1176   VT = VT.getScalarType();
1177 
1178   if (!VT.isSimple())
1179     return false;
1180 
1181   switch (VT.getSimpleVT().SimpleTy) {
1182   case MVT::f32:
1183     // This is as fast on some subtargets. However, we always have full rate f32
1184     // mad available which returns the same result as the separate operations
1185     // which we should prefer over fma. We can't use this if we want to support
1186     // denormals, so only report this in these cases.
1187     return Subtarget->hasFP32Denormals() && Subtarget->hasFastFMAF32();
1188   case MVT::f64:
1189     return true;
1190   default:
1191     break;
1192   }
1193 
1194   return false;
1195 }
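
// Summary of the policy above (illustrative):
//   f64                                      -> true  (fma and mul share a rate)
//   f32, fp32 denormals + fast FMA subtarget -> true  (mad flushes denormals)
//   f32 otherwise                            -> false (prefer full-rate mad)
//   anything else                            -> false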
1196 
1197 //===----------------------------------------------------------------------===//
1198 // Custom DAG Lowering Operations
1199 //===----------------------------------------------------------------------===//
1200 
1201 SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
1202   switch (Op.getOpcode()) {
1203   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
1204   case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
1205   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
1206   case ISD::LOAD: {
1207     SDValue Result = LowerLOAD(Op, DAG);
1208     assert((!Result.getNode() ||
1209             Result.getNode()->getNumValues() == 2) &&
1210            "Load should return a value and a chain");
1211     return Result;
1212   }
1213 
1214   case ISD::FSIN:
1215   case ISD::FCOS:
1216     return LowerTrig(Op, DAG);
1217   case ISD::SELECT: return LowerSELECT(Op, DAG);
1218   case ISD::FDIV: return LowerFDIV(Op, DAG);
1219   case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
1220   case ISD::STORE: return LowerSTORE(Op, DAG);
1221   case ISD::GlobalAddress: {
1222     MachineFunction &MF = DAG.getMachineFunction();
1223     SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1224     return LowerGlobalAddress(MFI, Op, DAG);
1225   }
1226   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
1227   case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
1228   case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
1229   }
1230   return SDValue();
1231 }
1232 
1233 /// \brief Helper function for LowerBRCOND
1234 static SDNode *findUser(SDValue Value, unsigned Opcode) {
1235 
1236   SDNode *Parent = Value.getNode();
1237   for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
1238        I != E; ++I) {
1239 
1240     if (I.getUse().get() != Value)
1241       continue;
1242 
1243     if (I->getOpcode() == Opcode)
1244       return *I;
1245   }
1246   return nullptr;
1247 }
1248 
1249 SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
1250 
1251   SDLoc SL(Op);
1252   FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Op);
1253   unsigned FrameIndex = FINode->getIndex();
1254 
1255   // A FrameIndex node represents a 32-bit offset into scratch memory. If the
1256   // high bit of a frame index offset were to be set, this would mean that it
1257   // represented an offset of ~2GB * 64 = ~128GB from the start of the scratch
1258   // buffer, with 64 being the number of threads per wave.
1259   //
1260   // The maximum private allocation for the entire GPU is 4G, and we are
1261   // concerned with the largest the index could ever be for an individual
1262   // workitem. This will occur with the minimum dispatch size. If a program
1263   // requires more, the dispatch size will be reduced.
1264   //
1265   // With this limit, we can mark the high bit of the FrameIndex node as known
1266   // zero, which is important, because it means in most situations we can prove
1267   // that values derived from FrameIndex nodes are non-negative. This enables us
1268   // to take advantage of more addressing modes when accessing scratch buffers,
1269   // since for scratch reads/writes, the register offset must always be
1270   // positive.
1271 
1272   uint64_t MaxGPUAlloc = UINT64_C(4) * 1024 * 1024 * 1024;
1273 
1274   // XXX - It is unclear if partial dispatch works. Assume it works at half wave
1275   // granularity. It is probably a full wave.
1276   uint64_t MinGranularity = 32;
1277 
1278   unsigned KnownBits = Log2_64(MaxGPUAlloc / MinGranularity);
1279   EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), KnownBits);
1280 
1281   SDValue TFI = DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
1282   return DAG.getNode(ISD::AssertZext, SL, MVT::i32, TFI,
1283                      DAG.getValueType(ExtVT));
1284 }
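
// Worked numbers for the bound above (illustrative): MaxGPUAlloc /
// MinGranularity = 2^32 / 2^5 = 2^27, so KnownBits is 27 and the frame index
// is wrapped in an AssertZext to i27, i.e. the top five bits of the 32-bit
// offset (including the bit that would make it negative) are known zero.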
1285 
1286 bool SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
1287   if (Intr->getOpcode() != ISD::INTRINSIC_W_CHAIN)
1288     return false;
1289 
1290   switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
1291   default: return false;
1292   case AMDGPUIntrinsic::amdgcn_if:
1293   case AMDGPUIntrinsic::amdgcn_else:
1294   case AMDGPUIntrinsic::amdgcn_break:
1295   case AMDGPUIntrinsic::amdgcn_if_break:
1296   case AMDGPUIntrinsic::amdgcn_else_break:
1297   case AMDGPUIntrinsic::amdgcn_loop:
1298   case AMDGPUIntrinsic::amdgcn_end_cf:
1299     return true;
1300   }
1301 }
1302 
1303 /// This transforms the control flow intrinsics to get the branch destination
1304 /// as the last parameter, and switches the branch target with BR if needed.
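/// A rough sketch of the rewrite, in DAG notation (illustrative only):
///
///   brcond (intrinsic_w_chain llvm.amdgcn.if, ...), ...; br BB
///     -->
///   intrinsic_w_chain llvm.amdgcn.if, ..., BB
///
/// i.e. the control-flow intrinsic absorbs the branch destination as its last
/// operand, and any SETCC wrapping the intrinsic or trailing BR is folded away
/// or retargeted.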
1305 SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
1306                                       SelectionDAG &DAG) const {
1307 
1308   SDLoc DL(BRCOND);
1309 
1310   SDNode *Intr = BRCOND.getOperand(1).getNode();
1311   SDValue Target = BRCOND.getOperand(2);
1312   SDNode *BR = nullptr;
1313   SDNode *SetCC = nullptr;
1314 
1315   if (Intr->getOpcode() == ISD::SETCC) {
1316     // As long as we negate the condition everything is fine
1317     SetCC = Intr;
1318     Intr = SetCC->getOperand(0).getNode();
1319 
1320   } else {
1321     // Get the target from BR if we don't negate the condition
1322     BR = findUser(BRCOND, ISD::BR);
1323     Target = BR->getOperand(1);
1324   }
1325 
1326   if (Intr->getOpcode() != ISD::INTRINSIC_W_CHAIN) {
1327     // This is a uniform branch so we don't need to legalize.
1328     return BRCOND;
1329   }
1330 
1331   assert(!SetCC ||
1332         (SetCC->getConstantOperandVal(1) == 1 &&
1333          isCFIntrinsic(Intr) &&
1334          cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
1335                                                              ISD::SETNE));
1336 
1337   // Build the result types of the new intrinsic call.
1338   ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
1339 
1340   // Build the operands of the new intrinsic call.
1341   SmallVector<SDValue, 4> Ops;
1342   Ops.push_back(BRCOND.getOperand(0));
1343   Ops.append(Intr->op_begin() + 1, Intr->op_end());
1344   Ops.push_back(Target);
1345 
1346   // Build the new intrinsic call.
1347   SDNode *Result = DAG.getNode(
1348     Res.size() > 1 ? ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL,
1349     DAG.getVTList(Res), Ops).getNode();
1350 
1351   if (BR) {
1352     // Give the branch instruction our target
1353     SDValue Ops[] = {
1354       BR->getOperand(0),
1355       BRCOND.getOperand(2)
1356     };
1357     SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
1358     DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
1359     BR = NewBR.getNode();
1360   }
1361 
1362   SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
1363 
1364   // Copy the intrinsic results to registers
1365   for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
1366     SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
1367     if (!CopyToReg)
1368       continue;
1369 
1370     Chain = DAG.getCopyToReg(
1371       Chain, DL,
1372       CopyToReg->getOperand(1),
1373       SDValue(Result, i - 1),
1374       SDValue());
1375 
1376     DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
1377   }
1378 
1379   // Remove the old intrinsic from the chain
1380   DAG.ReplaceAllUsesOfValueWith(
1381     SDValue(Intr, Intr->getNumValues() - 1),
1382     Intr->getOperand(0));
1383 
1384   return Chain;
1385 }
1386 
1387 SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
1388                                              SDValue Op,
1389                                              SelectionDAG &DAG) const {
1390   GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
1391 
1392   if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
1393     return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
1394 
1395   SDLoc DL(GSD);
1396   const GlobalValue *GV = GSD->getGlobal();
1397   MVT PtrVT = getPointerTy(DAG.getDataLayout(), GSD->getAddressSpace());
1398 
1399   SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32);
1400   return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT, GA);
1401 }
1402 
1403 SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL,
1404                                    SDValue V) const {
1405   // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
1406   // the destination register.
1407   //
1408   // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
1409   // so we will end up with redundant moves to m0.
1410   //
1411   // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
1412 
1413   // A Null SDValue creates a glue result.
1414   SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
1415                                   V, Chain);
1416   return SDValue(M0, 0);
1417 }
1418 
1419 SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
1420                                                  SDValue Op,
1421                                                  MVT VT,
1422                                                  unsigned Offset) const {
1423   SDLoc SL(Op);
1424   SDValue Param = LowerParameter(DAG, MVT::i32, MVT::i32, SL,
1425                                  DAG.getEntryNode(), Offset, false);
  // The local size values will have the high 16 bits as zero.
1427   return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
1428                      DAG.getValueType(VT));
1429 }
1430 
1431 static SDValue emitNonHSAIntrinsicError(SelectionDAG& DAG, EVT VT) {
1432   DiagnosticInfoUnsupported BadIntrin(*DAG.getMachineFunction().getFunction(),
1433                                       "non-hsa intrinsic with hsa target");
1434   DAG.getContext()->diagnose(BadIntrin);
1435   return DAG.getUNDEF(VT);
1436 }
1437 
1438 SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
1439                                                   SelectionDAG &DAG) const {
1440   MachineFunction &MF = DAG.getMachineFunction();
1441   auto MFI = MF.getInfo<SIMachineFunctionInfo>();
1442   const SIRegisterInfo *TRI =
1443       static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
1444 
1445   EVT VT = Op.getValueType();
1446   SDLoc DL(Op);
1447   unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
1448 
1449   // TODO: Should this propagate fast-math-flags?
1450 
1451   switch (IntrinsicID) {
1452   case Intrinsic::amdgcn_dispatch_ptr:
1453     if (!Subtarget->isAmdHsaOS()) {
1454       DiagnosticInfoUnsupported BadIntrin(
1455           *MF.getFunction(), "unsupported hsa intrinsic without hsa target",
1456           DL.getDebugLoc());
1457       DAG.getContext()->diagnose(BadIntrin);
1458       return DAG.getUNDEF(VT);
1459     }
1460 
1461     return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass,
1462       TRI->getPreloadedValue(MF, SIRegisterInfo::DISPATCH_PTR), VT);
1463   case Intrinsic::amdgcn_rcp:
1464     return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
1465   case Intrinsic::amdgcn_rsq:
1466   case AMDGPUIntrinsic::AMDGPU_rsq: // Legacy name
1467     return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
1468   case Intrinsic::amdgcn_rsq_clamp:
1469   case AMDGPUIntrinsic::AMDGPU_rsq_clamped: { // Legacy name
1470     if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
1471       return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
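
    // On VI+ the clamped rsq instruction is not used; instead the clamp is
    // expanded manually below, roughly:
    //   rsq_clamp(x) ~> fmaxnum(fminnum(rsq(x), +largest), -largest)
    // where "largest" is the largest finite value of the type, so infinite
    // results (e.g. rsq(0)) are clamped to finite values.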
1472 
1473     Type *Type = VT.getTypeForEVT(*DAG.getContext());
1474     APFloat Max = APFloat::getLargest(Type->getFltSemantics());
1475     APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
1476 
1477     SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
1478     SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq,
1479                               DAG.getConstantFP(Max, DL, VT));
1480     return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
1481                        DAG.getConstantFP(Min, DL, VT));
1482   }
1483   case Intrinsic::r600_read_ngroups_x:
1484     if (Subtarget->isAmdHsaOS())
1485       return emitNonHSAIntrinsicError(DAG, VT);
1486 
1487     return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
1488                           SI::KernelInputOffsets::NGROUPS_X, false);
1489   case Intrinsic::r600_read_ngroups_y:
1490     if (Subtarget->isAmdHsaOS())
1491       return emitNonHSAIntrinsicError(DAG, VT);
1492 
1493     return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
1494                           SI::KernelInputOffsets::NGROUPS_Y, false);
1495   case Intrinsic::r600_read_ngroups_z:
1496     if (Subtarget->isAmdHsaOS())
1497       return emitNonHSAIntrinsicError(DAG, VT);
1498 
1499     return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
1500                           SI::KernelInputOffsets::NGROUPS_Z, false);
1501   case Intrinsic::r600_read_global_size_x:
1502     if (Subtarget->isAmdHsaOS())
1503       return emitNonHSAIntrinsicError(DAG, VT);
1504 
1505     return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
1506                           SI::KernelInputOffsets::GLOBAL_SIZE_X, false);
1507   case Intrinsic::r600_read_global_size_y:
1508     if (Subtarget->isAmdHsaOS())
1509       return emitNonHSAIntrinsicError(DAG, VT);
1510 
1511     return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
1512                           SI::KernelInputOffsets::GLOBAL_SIZE_Y, false);
1513   case Intrinsic::r600_read_global_size_z:
1514     if (Subtarget->isAmdHsaOS())
1515       return emitNonHSAIntrinsicError(DAG, VT);
1516 
1517     return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
1518                           SI::KernelInputOffsets::GLOBAL_SIZE_Z, false);
1519   case Intrinsic::r600_read_local_size_x:
1520     if (Subtarget->isAmdHsaOS())
1521       return emitNonHSAIntrinsicError(DAG, VT);
1522 
1523     return lowerImplicitZextParam(DAG, Op, MVT::i16,
1524                                   SI::KernelInputOffsets::LOCAL_SIZE_X);
1525   case Intrinsic::r600_read_local_size_y:
1526     if (Subtarget->isAmdHsaOS())
1527       return emitNonHSAIntrinsicError(DAG, VT);
1528 
1529     return lowerImplicitZextParam(DAG, Op, MVT::i16,
1530                                   SI::KernelInputOffsets::LOCAL_SIZE_Y);
1531   case Intrinsic::r600_read_local_size_z:
1532     if (Subtarget->isAmdHsaOS())
1533       return emitNonHSAIntrinsicError(DAG, VT);
1534 
1535     return lowerImplicitZextParam(DAG, Op, MVT::i16,
1536                                   SI::KernelInputOffsets::LOCAL_SIZE_Z);
1537   case Intrinsic::amdgcn_read_workdim:
1538   case AMDGPUIntrinsic::AMDGPU_read_workdim: // Legacy name.
1539     // Really only 2 bits.
1540     return lowerImplicitZextParam(DAG, Op, MVT::i8,
1541                                   getImplicitParameterOffset(MFI, GRID_DIM));
1542   case Intrinsic::amdgcn_workgroup_id_x:
1543   case Intrinsic::r600_read_tgid_x:
1544     return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
1545       TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_X), VT);
1546   case Intrinsic::amdgcn_workgroup_id_y:
1547   case Intrinsic::r600_read_tgid_y:
1548     return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
1549       TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Y), VT);
1550   case Intrinsic::amdgcn_workgroup_id_z:
1551   case Intrinsic::r600_read_tgid_z:
1552     return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
1553       TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Z), VT);
1554   case Intrinsic::amdgcn_workitem_id_x:
1555   case Intrinsic::r600_read_tidig_x:
1556     return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
1557       TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X), VT);
1558   case Intrinsic::amdgcn_workitem_id_y:
1559   case Intrinsic::r600_read_tidig_y:
1560     return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
1561       TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y), VT);
1562   case Intrinsic::amdgcn_workitem_id_z:
1563   case Intrinsic::r600_read_tidig_z:
1564     return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
1565       TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z), VT);
1566   case AMDGPUIntrinsic::SI_load_const: {
1567     SDValue Ops[] = {
1568       Op.getOperand(1),
1569       Op.getOperand(2)
1570     };
1571 
1572     MachineMemOperand *MMO = MF.getMachineMemOperand(
1573       MachinePointerInfo(),
1574       MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant,
1575       VT.getStoreSize(), 4);
1576     return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
1577                                    Op->getVTList(), Ops, VT, MMO);
1578   }
1579   case AMDGPUIntrinsic::SI_vs_load_input:
1580     return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT,
1581                        Op.getOperand(1),
1582                        Op.getOperand(2),
1583                        Op.getOperand(3));
1584 
1585   case AMDGPUIntrinsic::SI_fs_constant: {
1586     SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3));
1587     SDValue Glue = M0.getValue(1);
1588     return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32,
1589                        DAG.getConstant(2, DL, MVT::i32), // P0
1590                        Op.getOperand(1), Op.getOperand(2), Glue);
1591   }
1592   case AMDGPUIntrinsic::SI_packf16:
1593     if (Op.getOperand(1).isUndef() && Op.getOperand(2).isUndef())
1594       return DAG.getUNDEF(MVT::i32);
1595     return Op;
1596   case AMDGPUIntrinsic::SI_fs_interp: {
1597     SDValue IJ = Op.getOperand(4);
1598     SDValue I = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ,
1599                             DAG.getConstant(0, DL, MVT::i32));
1600     SDValue J = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ,
1601                             DAG.getConstant(1, DL, MVT::i32));
1602     SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3));
1603     SDValue Glue = M0.getValue(1);
1604     SDValue P1 = DAG.getNode(AMDGPUISD::INTERP_P1, DL,
1605                              DAG.getVTList(MVT::f32, MVT::Glue),
1606                              I, Op.getOperand(1), Op.getOperand(2), Glue);
1607     Glue = SDValue(P1.getNode(), 1);
1608     return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, P1, J,
1609                              Op.getOperand(1), Op.getOperand(2), Glue);
1610   }
1611   case Intrinsic::amdgcn_interp_p1: {
1612     SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
1613     SDValue Glue = M0.getValue(1);
1614     return DAG.getNode(AMDGPUISD::INTERP_P1, DL, MVT::f32, Op.getOperand(1),
1615                        Op.getOperand(2), Op.getOperand(3), Glue);
1616   }
1617   case Intrinsic::amdgcn_interp_p2: {
1618     SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5));
1619     SDValue Glue = SDValue(M0.getNode(), 1);
1620     return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, Op.getOperand(1),
1621                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
1622                        Glue);
1623   }
1624   case Intrinsic::amdgcn_sin:
1625     return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
1626 
1627   case Intrinsic::amdgcn_cos:
1628     return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
1629 
1630   case Intrinsic::amdgcn_log_clamp: {
1631     if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
1632       return SDValue();
1633 
1634     DiagnosticInfoUnsupported BadIntrin(
1635       *MF.getFunction(), "intrinsic not supported on subtarget",
1636       DL.getDebugLoc());
    DAG.getContext()->diagnose(BadIntrin);
    return DAG.getUNDEF(VT);
1639   }
1640   case Intrinsic::amdgcn_ldexp:
1641     return DAG.getNode(AMDGPUISD::LDEXP, DL, VT,
1642                        Op.getOperand(1), Op.getOperand(2));
1643   case Intrinsic::amdgcn_class:
1644     return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
1645                        Op.getOperand(1), Op.getOperand(2));
1646   case Intrinsic::amdgcn_div_fmas:
1647     return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
1648                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
1649                        Op.getOperand(4));
1650 
1651   case Intrinsic::amdgcn_div_fixup:
1652     return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
1653                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
1654 
1655   case Intrinsic::amdgcn_trig_preop:
1656     return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT,
1657                        Op.getOperand(1), Op.getOperand(2));
1658   case Intrinsic::amdgcn_div_scale: {
    // The third parameter is required to be a constant.
1660     const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3));
1661     if (!Param)
1662       return DAG.getUNDEF(VT);
1663 
    // Translate to the operands expected by the machine instruction. The
    // first operand must match either the numerator or the denominator,
    // as selected by the constant third parameter.
1666     SDValue Numerator = Op.getOperand(1);
1667     SDValue Denominator = Op.getOperand(2);
1668 
    // Note this order is opposite of the machine instruction's operands,
1670     // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
1671     // intrinsic has the numerator as the first operand to match a normal
1672     // division operation.
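    //
    // For example (illustrative): if the constant operand is true (scale the
    // numerator), this builds DIV_SCALE num, den, num; if it is false, it
    // builds DIV_SCALE den, den, num.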
1673 
1674     SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator;
1675 
1676     return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
1677                        Denominator, Numerator);
1678   }
1679   case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte0:
1680     return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Op.getOperand(1));
1681   case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte1:
1682     return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE1, DL, VT, Op.getOperand(1));
1683   case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte2:
1684     return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE2, DL, VT, Op.getOperand(1));
1685   case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte3:
1686     return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE3, DL, VT, Op.getOperand(1));
1687   default:
1688     return AMDGPUTargetLowering::LowerOperation(Op, DAG);
1689   }
1690 }
1691 
1692 SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
1693                                                  SelectionDAG &DAG) const {
1694   unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
1695   switch (IntrID) {
1696   case Intrinsic::amdgcn_atomic_inc:
1697   case Intrinsic::amdgcn_atomic_dec: {
1698     MemSDNode *M = cast<MemSDNode>(Op);
1699     unsigned Opc = (IntrID == Intrinsic::amdgcn_atomic_inc) ?
1700       AMDGPUISD::ATOMIC_INC : AMDGPUISD::ATOMIC_DEC;
1701     SDValue Ops[] = {
1702       M->getOperand(0), // Chain
1703       M->getOperand(2), // Ptr
1704       M->getOperand(3)  // Value
1705     };
1706 
1707     return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops,
1708                                    M->getMemoryVT(), M->getMemOperand());
1709   }
1710   default:
1711     return SDValue();
1712   }
1713 }
1714 
1715 SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
1716                                               SelectionDAG &DAG) const {
1717   MachineFunction &MF = DAG.getMachineFunction();
1718   SDLoc DL(Op);
1719   SDValue Chain = Op.getOperand(0);
1720   unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
1721 
1722   switch (IntrinsicID) {
1723   case AMDGPUIntrinsic::SI_sendmsg: {
1724     Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3));
1725     SDValue Glue = Chain.getValue(1);
1726     return DAG.getNode(AMDGPUISD::SENDMSG, DL, MVT::Other, Chain,
1727                        Op.getOperand(2), Glue);
1728   }
1729   case AMDGPUIntrinsic::SI_tbuffer_store: {
1730     SDValue Ops[] = {
1731       Chain,
1732       Op.getOperand(2),
1733       Op.getOperand(3),
1734       Op.getOperand(4),
1735       Op.getOperand(5),
1736       Op.getOperand(6),
1737       Op.getOperand(7),
1738       Op.getOperand(8),
1739       Op.getOperand(9),
1740       Op.getOperand(10),
1741       Op.getOperand(11),
1742       Op.getOperand(12),
1743       Op.getOperand(13),
1744       Op.getOperand(14)
1745     };
1746 
1747     EVT VT = Op.getOperand(3).getValueType();
1748 
1749     MachineMemOperand *MMO = MF.getMachineMemOperand(
1750       MachinePointerInfo(),
1751       MachineMemOperand::MOStore,
1752       VT.getStoreSize(), 4);
1753     return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL,
1754                                    Op->getVTList(), Ops, VT, MMO);
1755   }
1756   default:
1757     return SDValue();
1758   }
1759 }
1760 
1761 SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
1762   SDLoc DL(Op);
1763   LoadSDNode *Load = cast<LoadSDNode>(Op);
1764   ISD::LoadExtType ExtType = Load->getExtensionType();
1765   EVT MemVT = Load->getMemoryVT();
1766 
1767   if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
1768     assert(MemVT == MVT::i1 && "Only i1 non-extloads expected");
1769     // FIXME: Copied from PPC
1770     // First, load into 32 bits, then truncate to 1 bit.
1771 
1772     SDValue Chain = Load->getChain();
1773     SDValue BasePtr = Load->getBasePtr();
1774     MachineMemOperand *MMO = Load->getMemOperand();
1775 
1776     SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
1777                                    BasePtr, MVT::i8, MMO);
1778 
1779     SDValue Ops[] = {
1780       DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
1781       NewLD.getValue(1)
1782     };
1783 
1784     return DAG.getMergeValues(Ops, DL);
1785   }
1786 
1787   if (!MemVT.isVector())
1788     return SDValue();
1789 
1790   assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
1791          "Custom lowering for non-i32 vectors hasn't been implemented.");
1792   unsigned NumElements = MemVT.getVectorNumElements();
1793   assert(NumElements != 2 && "v2 loads are supported for all address spaces.");
1794 
1795   switch (Load->getAddressSpace()) {
1796   case AMDGPUAS::CONSTANT_ADDRESS:
1797     if (isMemOpUniform(Load))
1798       return SDValue();
    // Non-uniform loads will be selected to MUBUF instructions, so they
    // have the same legalization requirements as global and private
    // loads.
1802     //
1803     // Fall-through
1804   case AMDGPUAS::GLOBAL_ADDRESS:
1805   case AMDGPUAS::FLAT_ADDRESS:
1806     if (NumElements > 4)
1807       return SplitVectorLoad(Op, DAG);
1808     // v4 loads are supported for private and global memory.
1809     return SDValue();
1810   case AMDGPUAS::PRIVATE_ADDRESS: {
1811     // Depending on the setting of the private_element_size field in the
1812     // resource descriptor, we can only make private accesses up to a certain
1813     // size.
1814     switch (Subtarget->getMaxPrivateElementSize()) {
1815     case 4:
1816       return scalarizeVectorLoad(Load, DAG);
1817     case 8:
1818       if (NumElements > 2)
1819         return SplitVectorLoad(Op, DAG);
1820       return SDValue();
1821     case 16:
1822       // Same as global/flat
1823       if (NumElements > 4)
1824         return SplitVectorLoad(Op, DAG);
1825       return SDValue();
1826     default:
1827       llvm_unreachable("unsupported private_element_size");
1828     }
1829   }
1830   case AMDGPUAS::LOCAL_ADDRESS:
    // If properly aligned, splitting may let us use ds_read_b64.
1832     return SplitVectorLoad(Op, DAG);
1833   default:
1834     return SDValue();
1835   }
1836 }
1837 
1838 SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
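  // Split the 64-bit select into two 32-bit selects on the low and high
  // halves, roughly:
  //   (i64 select c, a, b)
  //     -> (bitcast (v2i32 build_vector (select c, lo(a), lo(b)),
  //                                     (select c, hi(a), hi(b))))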
1839   if (Op.getValueType() != MVT::i64)
1840     return SDValue();
1841 
1842   SDLoc DL(Op);
1843   SDValue Cond = Op.getOperand(0);
1844 
1845   SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
1846   SDValue One = DAG.getConstant(1, DL, MVT::i32);
1847 
1848   SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
1849   SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
1850 
1851   SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
1852   SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
1853 
1854   SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
1855 
1856   SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
1857   SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
1858 
1859   SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
1860 
1861   SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i32, Lo, Hi);
1862   return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res);
1863 }
1864 
1865 // Catch division cases where we can use shortcuts with rcp and rsq
1866 // instructions.
1867 SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const {
1868   SDLoc SL(Op);
1869   SDValue LHS = Op.getOperand(0);
1870   SDValue RHS = Op.getOperand(1);
1871   EVT VT = Op.getValueType();
1872   bool Unsafe = DAG.getTarget().Options.UnsafeFPMath;
1873 
1874   if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
1875     if ((Unsafe || (VT == MVT::f32 && !Subtarget->hasFP32Denormals())) &&
1876         CLHS->isExactlyValue(1.0)) {
      // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
      // the CI documentation have a worst-case error of 1 ulp.
1879       // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
1880       // use it as long as we aren't trying to use denormals.
1881 
1882       // 1.0 / sqrt(x) -> rsq(x)
1883       //
1884       // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
1885       // error seems really high at 2^29 ULP.
1886       if (RHS.getOpcode() == ISD::FSQRT)
1887         return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0));
1888 
1889       // 1.0 / x -> rcp(x)
1890       return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
1891     }
1892   }
1893 
1894   if (Unsafe) {
1895     // Turn into multiply by the reciprocal.
1896     // x / y -> x * (1.0 / y)
1897     SDNodeFlags Flags;
1898     Flags.setUnsafeAlgebra(true);
1899     SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
1900     return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, &Flags);
1901   }
1902 
1903   return SDValue();
1904 }
1905 
1906 SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
1907   if (SDValue FastLowered = LowerFastFDIV(Op, DAG))
1908     return FastLowered;
1909 
1910   // This uses v_rcp_f32 which does not handle denormals. Let this hit a
1911   // selection error for now rather than do something incorrect.
1912   if (Subtarget->hasFP32Denormals())
1913     return SDValue();
1914 
1915   SDLoc SL(Op);
1916   SDValue LHS = Op.getOperand(0);
1917   SDValue RHS = Op.getOperand(1);
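
  // Sketch of the scaled-rcp sequence built below: when |den| is large
  // (> 2^96, K0 = 0x6f800000), the denominator is pre-scaled by 2^-32
  // (K1 = 0x2f800000) before the reciprocal and the quotient is scaled back
  // by the same factor afterwards, keeping the reciprocal and the
  // intermediate product away from the denormal range:
  //   x / y ~> s * (x * rcp(y * s)),  where s = (|y| > 2^96) ? 2^-32 : 1.0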
1918 
1919   SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);
1920 
1921   const APFloat K0Val(BitsToFloat(0x6f800000));
1922   const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
1923 
1924   const APFloat K1Val(BitsToFloat(0x2f800000));
1925   const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
1926 
1927   const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
1928 
1929   EVT SetCCVT =
1930       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
1931 
1932   SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
1933 
1934   SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
1935 
1936   // TODO: Should this propagate fast-math-flags?
1937 
1938   r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
1939 
1940   SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
1941 
1942   SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);
1943 
1944   return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
1945 }
1946 
1947 SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
1948   if (DAG.getTarget().Options.UnsafeFPMath)
1949     return LowerFastFDIV(Op, DAG);
1950 
1951   SDLoc SL(Op);
1952   SDValue X = Op.getOperand(0);
1953   SDValue Y = Op.getOperand(1);
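
  // The sequence built below follows the usual f64 division expansion
  // (sketch): scale the operands with DIV_SCALE, refine rcp(den) with two
  // FMA-based Newton-Raphson steps, form the scaled quotient and its
  // residual, and hand everything to DIV_FMAS / DIV_FIXUP for the final
  // rounding and special-case handling.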
1954 
1955   const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
1956 
1957   SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
1958 
1959   SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
1960 
1961   SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
1962 
1963   SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
1964 
1965   SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
1966 
1967   SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
1968 
1969   SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
1970 
1971   SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
1972 
1973   SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
1974   SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
1975 
1976   SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64,
1977                              NegDivScale0, Mul, DivScale1);
1978 
1979   SDValue Scale;
1980 
1981   if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
1982     // Workaround a hardware bug on SI where the condition output from div_scale
1983     // is not usable.
1984 
1985     const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
1986 
    // Figure out which scale to use for div_fmas.
1988     SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
1989     SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
1990     SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
1991     SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
1992 
1993     SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
1994     SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
1995 
1996     SDValue Scale0Hi
1997       = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
1998     SDValue Scale1Hi
1999       = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
2000 
2001     SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
2002     SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
2003     Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
2004   } else {
2005     Scale = DivScale1.getValue(1);
2006   }
2007 
2008   SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64,
2009                              Fma4, Fma3, Mul, Scale);
2010 
2011   return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
2012 }
2013 
2014 SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
2015   EVT VT = Op.getValueType();
2016 
2017   if (VT == MVT::f32)
2018     return LowerFDIV32(Op, DAG);
2019 
2020   if (VT == MVT::f64)
2021     return LowerFDIV64(Op, DAG);
2022 
2023   llvm_unreachable("Unexpected type for fdiv");
2024 }
2025 
2026 SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
2027   SDLoc DL(Op);
2028   StoreSDNode *Store = cast<StoreSDNode>(Op);
2029   EVT VT = Store->getMemoryVT();
2030 
2031   if (VT == MVT::i1) {
2032     return DAG.getTruncStore(Store->getChain(), DL,
2033        DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
2034        Store->getBasePtr(), MVT::i1, Store->getMemOperand());
2035   }
2036 
2037   assert(Store->getValue().getValueType().getScalarType() == MVT::i32);
2038 
2039   unsigned NumElements = VT.getVectorNumElements();
2040   switch (Store->getAddressSpace()) {
2041   case AMDGPUAS::GLOBAL_ADDRESS:
2042   case AMDGPUAS::FLAT_ADDRESS:
2043     if (NumElements > 4)
2044       return SplitVectorStore(Op, DAG);
2045     return SDValue();
2046   case AMDGPUAS::PRIVATE_ADDRESS: {
2047     switch (Subtarget->getMaxPrivateElementSize()) {
2048     case 4:
2049       return scalarizeVectorStore(Store, DAG);
2050     case 8:
2051       if (NumElements > 2)
2052         return SplitVectorStore(Op, DAG);
2053       return SDValue();
2054     case 16:
2055       if (NumElements > 4)
2056         return SplitVectorStore(Op, DAG);
2057       return SDValue();
2058     default:
2059       llvm_unreachable("unsupported private_element_size");
2060     }
2061   }
2062   case AMDGPUAS::LOCAL_ADDRESS:
    // If properly aligned, splitting may let us use ds_write_b64.
2064     return SplitVectorStore(Op, DAG);
2065   default:
2066     llvm_unreachable("unhandled address space");
2067   }
2068 }
2069 
2070 SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
2071   SDLoc DL(Op);
2072   EVT VT = Op.getValueType();
2073   SDValue Arg = Op.getOperand(0);
2074   // TODO: Should this propagate fast-math-flags?
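  //
  // The hardware sin/cos take their argument in units of full revolutions
  // (i.e. they effectively compute sin/cos of 2*pi*src), so the input is
  // scaled by 1/(2*pi) and range-reduced with FRACT before SIN_HW / COS_HW
  // is emitted.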
2075   SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT,
2076                                   DAG.getNode(ISD::FMUL, DL, VT, Arg,
2077                                               DAG.getConstantFP(0.5/M_PI, DL,
2078                                                                 VT)));
2079 
2080   switch (Op.getOpcode()) {
2081   case ISD::FCOS:
2082     return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, FractPart);
2083   case ISD::FSIN:
2084     return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, FractPart);
2085   default:
2086     llvm_unreachable("Wrong trig opcode");
2087   }
2088 }
2089 
2090 SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
2091   AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
2092   assert(AtomicNode->isCompareAndSwap());
2093   unsigned AS = AtomicNode->getAddressSpace();
2094 
2095   // No custom lowering required for local address space
2096   if (!isFlatGlobalAddrSpace(AS))
2097     return Op;
2098 
  // Flat and global address spaces require custom lowering for atomic
  // compare and swap: the new and old values are packed into a v2i32
  // (or v2i64 for the _X2 variants) and passed as a single operand.
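  //
  // E.g. an i32 cmpxchg becomes (illustrative):
  //   ATOMIC_CMP_SWAP ptr, (v2i32 build_vector new, old)
  // with the swapped-in value in the first lane and the compare value in the
  // second.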
2101   SDLoc DL(Op);
2102   SDValue ChainIn = Op.getOperand(0);
2103   SDValue Addr = Op.getOperand(1);
2104   SDValue Old = Op.getOperand(2);
2105   SDValue New = Op.getOperand(3);
2106   EVT VT = Op.getValueType();
2107   MVT SimpleVT = VT.getSimpleVT();
2108   MVT VecType = MVT::getVectorVT(SimpleVT, 2);
2109 
2110   SDValue NewOld = DAG.getNode(ISD::BUILD_VECTOR, DL, VecType,
2111                                New, Old);
2112   SDValue Ops[] = { ChainIn, Addr, NewOld };
2113   SDVTList VTList = DAG.getVTList(VT, MVT::Other);
2114   return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL,
2115                                  VTList, Ops, VT, AtomicNode->getMemOperand());
2116 }
2117 
2118 //===----------------------------------------------------------------------===//
2119 // Custom DAG optimizations
2120 //===----------------------------------------------------------------------===//
2121 
2122 SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
2123                                                      DAGCombinerInfo &DCI) const {
2124   EVT VT = N->getValueType(0);
2125   EVT ScalarVT = VT.getScalarType();
2126   if (ScalarVT != MVT::f32)
2127     return SDValue();
2128 
2129   SelectionDAG &DAG = DCI.DAG;
2130   SDLoc DL(N);
2131 
2132   SDValue Src = N->getOperand(0);
2133   EVT SrcVT = Src.getValueType();
2134 
2135   // TODO: We could try to match extracting the higher bytes, which would be
2136   // easier if i8 vectors weren't promoted to i32 vectors, particularly after
2137   // types are legalized. v4i8 -> v4f32 is probably the only case to worry
2138   // about in practice.
2139   if (DCI.isAfterLegalizeVectorOps() && SrcVT == MVT::i32) {
2140     if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
2141       SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src);
2142       DCI.AddToWorklist(Cvt.getNode());
2143       return Cvt;
2144     }
2145   }
2146 
2147   // We are primarily trying to catch operations on illegal vector types
2148   // before they are expanded.
2149   // For scalars, we can use the more flexible method of checking masked bits
2150   // after legalization.
2151   if (!DCI.isBeforeLegalize() ||
2152       !SrcVT.isVector() ||
2153       SrcVT.getVectorElementType() != MVT::i8) {
2154     return SDValue();
2155   }
2156 
2157   assert(DCI.isBeforeLegalize() && "Unexpected legal type");
2158 
2159   // Weird sized vectors are a pain to handle, but we know 3 is really the same
2160   // size as 4.
2161   unsigned NElts = SrcVT.getVectorNumElements();
2162   if (!SrcVT.isSimple() && NElts != 3)
2163     return SDValue();
2164 
2165   // Handle v4i8 -> v4f32 extload. Replace the v4i8 with a legal i32 load to
2166   // prevent a mess from expanding to v4i32 and repacking.
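  // The replacement built here is roughly a single zero-extending scalar
  // load feeding one CVT_F32_UBYTEn per byte, e.g. for v4i8 -> v4f32:
  //   (build_vector (cvt_f32_ubyte0 ld), (cvt_f32_ubyte1 ld),
  //                 (cvt_f32_ubyte2 ld), (cvt_f32_ubyte3 ld))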
2167   if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
2168     EVT LoadVT = getEquivalentMemType(*DAG.getContext(), SrcVT);
2169     EVT RegVT = getEquivalentLoadRegType(*DAG.getContext(), SrcVT);
2170     EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32, NElts);
2171     LoadSDNode *Load = cast<LoadSDNode>(Src);
2172 
2173     unsigned AS = Load->getAddressSpace();
2174     unsigned Align = Load->getAlignment();
2175     Type *Ty = LoadVT.getTypeForEVT(*DAG.getContext());
2176     unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(Ty);
2177 
2178     // Don't try to replace the load if we have to expand it due to alignment
2179     // problems. Otherwise we will end up scalarizing the load, and trying to
2180     // repack into the vector for no real reason.
2181     if (Align < ABIAlignment &&
2182         !allowsMisalignedMemoryAccesses(LoadVT, AS, Align, nullptr)) {
2183       return SDValue();
2184     }
2185 
2186     SDValue NewLoad = DAG.getExtLoad(ISD::ZEXTLOAD, DL, RegVT,
2187                                      Load->getChain(),
2188                                      Load->getBasePtr(),
2189                                      LoadVT,
2190                                      Load->getMemOperand());
2191 
2192     // Make sure successors of the original load stay after it by updating
2193     // them to use the new Chain.
2194     DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), NewLoad.getValue(1));
2195 
2196     SmallVector<SDValue, 4> Elts;
2197     if (RegVT.isVector())
2198       DAG.ExtractVectorElements(NewLoad, Elts);
2199     else
2200       Elts.push_back(NewLoad);
2201 
2202     SmallVector<SDValue, 4> Ops;
2203 
2204     unsigned EltIdx = 0;
2205     for (SDValue Elt : Elts) {
2206       unsigned ComponentsInElt = std::min(4u, NElts - 4 * EltIdx);
2207       for (unsigned I = 0; I < ComponentsInElt; ++I) {
2208         unsigned Opc = AMDGPUISD::CVT_F32_UBYTE0 + I;
2209         SDValue Cvt = DAG.getNode(Opc, DL, MVT::f32, Elt);
2210         DCI.AddToWorklist(Cvt.getNode());
2211         Ops.push_back(Cvt);
2212       }
2213 
2214       ++EltIdx;
2215     }
2216 
2217     assert(Ops.size() == NElts);
2218 
2219     return DAG.getNode(ISD::BUILD_VECTOR, DL, FloatVT, Ops);
2220   }
2221 
2222   return SDValue();
2223 }
2224 
2225 /// \brief Return true if the given offset Size in bytes can be folded into
2226 /// the immediate offsets of a memory instruction for the given address space.
2227 static bool canFoldOffset(unsigned OffsetSize, unsigned AS,
2228                           const AMDGPUSubtarget &STI) {
2229   switch (AS) {
2230   case AMDGPUAS::GLOBAL_ADDRESS: {
    // MUBUF instructions have a 12-bit offset in bytes.
2232     return isUInt<12>(OffsetSize);
2233   }
2234   case AMDGPUAS::CONSTANT_ADDRESS: {
2235     // SMRD instructions have an 8-bit offset in dwords on SI and
2236     // a 20-bit offset in bytes on VI.
2237     if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
2238       return isUInt<20>(OffsetSize);
2239     else
2240       return (OffsetSize % 4 == 0) && isUInt<8>(OffsetSize / 4);
2241   }
2242   case AMDGPUAS::LOCAL_ADDRESS:
2243   case AMDGPUAS::REGION_ADDRESS: {
2244     // The single offset versions have a 16-bit offset in bytes.
2245     return isUInt<16>(OffsetSize);
2246   }
2247   case AMDGPUAS::PRIVATE_ADDRESS:
2248   // Indirect register addressing does not use any offsets.
2249   default:
    return false;
2251   }
2252 }
2253 
2254 // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
2255 
2256 // This is a variant of
2257 // (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
2258 //
2259 // The normal DAG combiner will do this, but only if the add has one use since
2260 // that would increase the number of instructions.
2261 //
2262 // This prevents us from seeing a constant offset that can be folded into a
2263 // memory instruction's addressing mode. If we know the resulting add offset of
2264 // a pointer can be folded into an addressing offset, we can replace the pointer
2265 // operand with the add of new constant offset. This eliminates one of the uses,
2266 // and may allow the remaining use to also be simplified.
2267 //
2268 SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
2269                                                unsigned AddrSpace,
2270                                                DAGCombinerInfo &DCI) const {
2271   SDValue N0 = N->getOperand(0);
2272   SDValue N1 = N->getOperand(1);
2273 
2274   if (N0.getOpcode() != ISD::ADD)
2275     return SDValue();
2276 
2277   const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
2278   if (!CN1)
2279     return SDValue();
2280 
2281   const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
2282   if (!CAdd)
2283     return SDValue();
2284 
2285   // If the resulting offset is too large, we can't fold it into the addressing
2286   // mode offset.
2287   APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
2288   if (!canFoldOffset(Offset.getZExtValue(), AddrSpace, *Subtarget))
2289     return SDValue();
2290 
2291   SelectionDAG &DAG = DCI.DAG;
2292   SDLoc SL(N);
2293   EVT VT = N->getValueType(0);
2294 
2295   SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
2296   SDValue COffset = DAG.getConstant(Offset, SL, MVT::i32);
2297 
2298   return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset);
2299 }
2300 
2301 SDValue SITargetLowering::performAndCombine(SDNode *N,
2302                                             DAGCombinerInfo &DCI) const {
2303   if (DCI.isBeforeLegalize())
2304     return SDValue();
2305 
2306   if (SDValue Base = AMDGPUTargetLowering::performAndCombine(N, DCI))
2307     return Base;
2308 
2309   SelectionDAG &DAG = DCI.DAG;
2310 
2311   // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
2312   // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
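  // i.e. a "finite" test: x is ordered with itself (not a NaN) and its
  // magnitude is not infinity.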
2313   SDValue LHS = N->getOperand(0);
2314   SDValue RHS = N->getOperand(1);
2315 
2316   if (LHS.getOpcode() == ISD::SETCC &&
2317       RHS.getOpcode() == ISD::SETCC) {
2318     ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
2319     ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
2320 
2321     SDValue X = LHS.getOperand(0);
2322     SDValue Y = RHS.getOperand(0);
2323     if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X)
2324       return SDValue();
2325 
2326     if (LCC == ISD::SETO) {
2327       if (X != LHS.getOperand(1))
2328         return SDValue();
2329 
2330       if (RCC == ISD::SETUNE) {
2331         const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
2332         if (!C1 || !C1->isInfinity() || C1->isNegative())
2333           return SDValue();
2334 
2335         const uint32_t Mask = SIInstrFlags::N_NORMAL |
2336                               SIInstrFlags::N_SUBNORMAL |
2337                               SIInstrFlags::N_ZERO |
2338                               SIInstrFlags::P_ZERO |
2339                               SIInstrFlags::P_SUBNORMAL |
2340                               SIInstrFlags::P_NORMAL;
2341 
2342         static_assert(((~(SIInstrFlags::S_NAN |
2343                           SIInstrFlags::Q_NAN |
2344                           SIInstrFlags::N_INFINITY |
2345                           SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask,
2346                       "mask not equal");
2347 
2348         SDLoc DL(N);
2349         return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
2350                            X, DAG.getConstant(Mask, DL, MVT::i32));
2351       }
2352     }
2353   }
2354 
2355   return SDValue();
2356 }
2357 
2358 SDValue SITargetLowering::performOrCombine(SDNode *N,
2359                                            DAGCombinerInfo &DCI) const {
2360   SelectionDAG &DAG = DCI.DAG;
2361   SDValue LHS = N->getOperand(0);
2362   SDValue RHS = N->getOperand(1);
2363 
2364   EVT VT = N->getValueType(0);
2365   if (VT == MVT::i64) {
2366     // TODO: This could be a generic combine with a predicate for extracting the
2367     // high half of an integer being free.
2368 
2369     // (or i64:x, (zero_extend i32:y)) ->
2370     //   i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
2371     if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
2372         RHS.getOpcode() != ISD::ZERO_EXTEND)
2373       std::swap(LHS, RHS);
2374 
2375     if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
2376       SDValue ExtSrc = RHS.getOperand(0);
2377       EVT SrcVT = ExtSrc.getValueType();
2378       if (SrcVT == MVT::i32) {
2379         SDLoc SL(N);
2380         SDValue LowLHS, HiBits;
2381         std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG);
2382         SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
2383 
2384         DCI.AddToWorklist(LowOr.getNode());
2385         DCI.AddToWorklist(HiBits.getNode());
2386 
2387         SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
2388                                   LowOr, HiBits);
2389         return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
2390       }
2391     }
2392   }
2393 
2394   // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
2395   if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
2396       RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
2397     SDValue Src = LHS.getOperand(0);
2398     if (Src != RHS.getOperand(0))
2399       return SDValue();
2400 
2401     const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
2402     const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
2403     if (!CLHS || !CRHS)
2404       return SDValue();
2405 
2406     // Only 10 bits are used.
2407     static const uint32_t MaxMask = 0x3ff;
2408 
2409     uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
2410     SDLoc DL(N);
2411     return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
2412                        Src, DAG.getConstant(NewMask, DL, MVT::i32));
2413   }
2414 
2415   return SDValue();
2416 }
2417 
2418 SDValue SITargetLowering::performClassCombine(SDNode *N,
2419                                               DAGCombinerInfo &DCI) const {
2420   SelectionDAG &DAG = DCI.DAG;
2421   SDValue Mask = N->getOperand(1);
2422 
2423   // fp_class x, 0 -> false
2424   if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) {
2425     if (CMask->isNullValue())
2426       return DAG.getConstant(0, SDLoc(N), MVT::i1);
2427   }
2428 
2429   return SDValue();
2430 }
2431 
2432 // Constant fold canonicalize.
2433 SDValue SITargetLowering::performFCanonicalizeCombine(
2434   SDNode *N,
2435   DAGCombinerInfo &DCI) const {
2436   ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
2437   if (!CFP)
2438     return SDValue();
2439 
2440   SelectionDAG &DAG = DCI.DAG;
2441   const APFloat &C = CFP->getValueAPF();
2442 
2443   // Flush denormals to 0 if not enabled.
2444   if (C.isDenormal()) {
2445     EVT VT = N->getValueType(0);
2446     if (VT == MVT::f32 && !Subtarget->hasFP32Denormals())
2447       return DAG.getConstantFP(0.0, SDLoc(N), VT);
2448 
2449     if (VT == MVT::f64 && !Subtarget->hasFP64Denormals())
2450       return DAG.getConstantFP(0.0, SDLoc(N), VT);
2451   }
2452 
2453   if (C.isNaN()) {
2454     EVT VT = N->getValueType(0);
2455     APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
2456     if (C.isSignaling()) {
2457       // Quiet a signaling NaN.
2458       return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT);
2459     }
2460 
2461     // Make sure it is the canonical NaN bitpattern.
2462     //
2463     // TODO: Can we use -1 as the canonical NaN value since it's an inline
2464     // immediate?
2465     if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
2466       return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT);
2467   }
2468 
2469   return SDValue(CFP, 0);
2470 }
2471 
2472 static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
2473   switch (Opc) {
2474   case ISD::FMAXNUM:
2475     return AMDGPUISD::FMAX3;
2476   case ISD::SMAX:
2477     return AMDGPUISD::SMAX3;
2478   case ISD::UMAX:
2479     return AMDGPUISD::UMAX3;
2480   case ISD::FMINNUM:
2481     return AMDGPUISD::FMIN3;
2482   case ISD::SMIN:
2483     return AMDGPUISD::SMIN3;
2484   case ISD::UMIN:
2485     return AMDGPUISD::UMIN3;
2486   default:
2487     llvm_unreachable("Not a min/max opcode");
2488   }
2489 }
2490 
2491 static SDValue performIntMed3ImmCombine(SelectionDAG &DAG,
2492                                         SDLoc SL,
2493                                         SDValue Op0,
2494                                         SDValue Op1,
2495                                         bool Signed) {
2496   ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1);
2497   if (!K1)
2498     return SDValue();
2499 
2500   ConstantSDNode *K0 = dyn_cast<ConstantSDNode>(Op0.getOperand(1));
2501   if (!K0)
    return SDValue();

2505   if (Signed) {
2506     if (K0->getAPIntValue().sge(K1->getAPIntValue()))
2507       return SDValue();
2508   } else {
2509     if (K0->getAPIntValue().uge(K1->getAPIntValue()))
2510       return SDValue();
2511   }
2512 
2513   EVT VT = K0->getValueType(0);
2514   return DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, VT,
2515                      Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
2516 }
2517 
2518 static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {
2519   if (!DAG.getTargetLoweringInfo().hasFloatingPointExceptions())
2520     return true;
2521 
2522   return DAG.isKnownNeverNaN(Op);
2523 }
2524 
2525 static SDValue performFPMed3ImmCombine(SelectionDAG &DAG,
2526                                        SDLoc SL,
2527                                        SDValue Op0,
2528                                        SDValue Op1) {
2529   ConstantFPSDNode *K1 = dyn_cast<ConstantFPSDNode>(Op1);
2530   if (!K1)
2531     return SDValue();
2532 
2533   ConstantFPSDNode *K0 = dyn_cast<ConstantFPSDNode>(Op0.getOperand(1));
2534   if (!K0)
2535     return SDValue();
2536 
2537   // Ordered >= (although NaN inputs should have folded away by now).
2538   APFloat::cmpResult Cmp = K0->getValueAPF().compare(K1->getValueAPF());
2539   if (Cmp == APFloat::cmpGreaterThan)
2540     return SDValue();
2541 
2542   // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
2543   // signaling NaN gives a quiet NaN. The quiet NaN input to the min would then
2544   // give the other result, which is different from med3 with a NaN input.
2545   SDValue Var = Op0.getOperand(0);
2546   if (!isKnownNeverSNan(DAG, Var))
2547     return SDValue();
2548 
2549   return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
2550                      Var, SDValue(K0, 0), SDValue(K1, 0));
2551 }
2552 
2553 SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
2554                                                DAGCombinerInfo &DCI) const {
2555   SelectionDAG &DAG = DCI.DAG;
2556 
2557   unsigned Opc = N->getOpcode();
2558   SDValue Op0 = N->getOperand(0);
2559   SDValue Op1 = N->getOperand(1);
2560 
  // Only do this if the inner op has one use, since this would otherwise
  // just increase register pressure for no benefit.
2563 
2564   if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY) {
2565     // max(max(a, b), c) -> max3(a, b, c)
2566     // min(min(a, b), c) -> min3(a, b, c)
2567     if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
2568       SDLoc DL(N);
2569       return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
2570                          DL,
2571                          N->getValueType(0),
2572                          Op0.getOperand(0),
2573                          Op0.getOperand(1),
2574                          Op1);
2575     }
2576 
2577     // Try commuted.
2578     // max(a, max(b, c)) -> max3(a, b, c)
2579     // min(a, min(b, c)) -> min3(a, b, c)
2580     if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
2581       SDLoc DL(N);
2582       return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
2583                          DL,
2584                          N->getValueType(0),
2585                          Op0,
2586                          Op1.getOperand(0),
2587                          Op1.getOperand(1));
2588     }
2589   }
2590 
2591   // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
2592   if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
2593     if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, true))
2594       return Med3;
2595   }
2596 
2597   if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
2598     if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, false))
2599       return Med3;
2600   }
2601 
2602   // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
2603   if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
2604        (Opc == AMDGPUISD::FMIN_LEGACY &&
2605         Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
2606       N->getValueType(0) == MVT::f32 && Op0.hasOneUse()) {
2607     if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
2608       return Res;
2609   }
2610 
2611   return SDValue();
2612 }
2613 
2614 SDValue SITargetLowering::performSetCCCombine(SDNode *N,
2615                                               DAGCombinerInfo &DCI) const {
2616   SelectionDAG &DAG = DCI.DAG;
2617   SDLoc SL(N);
2618 
2619   SDValue LHS = N->getOperand(0);
2620   SDValue RHS = N->getOperand(1);
2621   EVT VT = LHS.getValueType();
2622 
2623   if (VT != MVT::f32 && VT != MVT::f64)
2624     return SDValue();
2625 
2626   // Match isinf pattern
2627   // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
2628   ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
2629   if (CC == ISD::SETOEQ && LHS.getOpcode() == ISD::FABS) {
2630     const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
2631     if (!CRHS)
2632       return SDValue();
2633 
2634     const APFloat &APF = CRHS->getValueAPF();
2635     if (APF.isInfinity() && !APF.isNegative()) {
2636       unsigned Mask = SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
2637       return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
2638                          DAG.getConstant(Mask, SL, MVT::i32));
2639     }
2640   }
2641 
2642   return SDValue();
2643 }
2644 
2645 SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
2646                                             DAGCombinerInfo &DCI) const {
2647   SelectionDAG &DAG = DCI.DAG;
2648   SDLoc DL(N);
2649 
2650   switch (N->getOpcode()) {
2651   default:
2652     return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
2653   case ISD::SETCC:
2654     return performSetCCCombine(N, DCI);
2655   case ISD::FMAXNUM:
2656   case ISD::FMINNUM:
2657   case ISD::SMAX:
2658   case ISD::SMIN:
2659   case ISD::UMAX:
2660   case ISD::UMIN:
2661   case AMDGPUISD::FMIN_LEGACY:
2662   case AMDGPUISD::FMAX_LEGACY: {
2663     if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG &&
2664         N->getValueType(0) != MVT::f64 &&
2665         getTargetMachine().getOptLevel() > CodeGenOpt::None)
2666       return performMinMaxCombine(N, DCI);
2667     break;
2668   }
2669 
2670   case AMDGPUISD::CVT_F32_UBYTE0:
2671   case AMDGPUISD::CVT_F32_UBYTE1:
2672   case AMDGPUISD::CVT_F32_UBYTE2:
2673   case AMDGPUISD::CVT_F32_UBYTE3: {
2674     unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
2675 
2676     SDValue Src = N->getOperand(0);
2677     APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
2678 
2679     APInt KnownZero, KnownOne;
2680     TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
2681                                           !DCI.isBeforeLegalizeOps());
2682     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
2683     if (TLO.ShrinkDemandedConstant(Src, Demanded) ||
2684         TLI.SimplifyDemandedBits(Src, Demanded, KnownZero, KnownOne, TLO)) {
2685       DCI.CommitTargetLoweringOpt(TLO);
2686     }
2687 
2688     break;
2689   }
2690 
2691   case ISD::UINT_TO_FP: {
2692     return performUCharToFloatCombine(N, DCI);
2693   }
2694   case ISD::FADD: {
2695     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
2696       break;
2697 
2698     EVT VT = N->getValueType(0);
2699     if (VT != MVT::f32)
2700       break;
2701 
2702     // Only do this if we are not trying to support denormals. v_mad_f32 does
2703     // not support denormals ever.
2704     if (Subtarget->hasFP32Denormals())
2705       break;
2706 
2707     SDValue LHS = N->getOperand(0);
2708     SDValue RHS = N->getOperand(1);
2709 
    // These should really be instruction patterns, but writing patterns with
    // source modifiers is a pain.
2712 
2713     // fadd (fadd (a, a), b) -> mad 2.0, a, b
2714     if (LHS.getOpcode() == ISD::FADD) {
2715       SDValue A = LHS.getOperand(0);
2716       if (A == LHS.getOperand(1)) {
2717         const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32);
2718         return DAG.getNode(ISD::FMAD, DL, VT, Two, A, RHS);
2719       }
2720     }
2721 
2722     // fadd (b, fadd (a, a)) -> mad 2.0, a, b
2723     if (RHS.getOpcode() == ISD::FADD) {
2724       SDValue A = RHS.getOperand(0);
2725       if (A == RHS.getOperand(1)) {
2726         const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32);
2727         return DAG.getNode(ISD::FMAD, DL, VT, Two, A, LHS);
2728       }
2729     }
2730 
2731     return SDValue();
2732   }
2733   case ISD::FSUB: {
2734     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
2735       break;
2736 
2737     EVT VT = N->getValueType(0);
2738 
2739     // Try to get the fneg to fold into the source modifier. This undoes generic
2740     // DAG combines and folds them into the mad.
2741     //
2742     // Only do this if we are not trying to support denormals. v_mad_f32 does
2743     // not support denormals ever.
2744     if (VT == MVT::f32 &&
2745         !Subtarget->hasFP32Denormals()) {
2746       SDValue LHS = N->getOperand(0);
2747       SDValue RHS = N->getOperand(1);
2748       if (LHS.getOpcode() == ISD::FADD) {
2749         // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
2750 
2751         SDValue A = LHS.getOperand(0);
2752         if (A == LHS.getOperand(1)) {
2753           const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32);
2754           SDValue NegRHS = DAG.getNode(ISD::FNEG, DL, VT, RHS);
2755 
2756           return DAG.getNode(ISD::FMAD, DL, VT, Two, A, NegRHS);
2757         }
2758       }
2759 
2760       if (RHS.getOpcode() == ISD::FADD) {
2761         // (fsub c, (fadd a, a)) -> mad -2.0, a, c
2762 
2763         SDValue A = RHS.getOperand(0);
2764         if (A == RHS.getOperand(1)) {
2765           const SDValue NegTwo = DAG.getConstantFP(-2.0, DL, MVT::f32);
2766           return DAG.getNode(ISD::FMAD, DL, VT, NegTwo, A, LHS);
2767         }
2768       }
2769 
2770       return SDValue();
2771     }
2772 
2773     break;
2774   }
2775   case ISD::LOAD:
2776   case ISD::STORE:
2777   case ISD::ATOMIC_LOAD:
2778   case ISD::ATOMIC_STORE:
2779   case ISD::ATOMIC_CMP_SWAP:
2780   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
2781   case ISD::ATOMIC_SWAP:
2782   case ISD::ATOMIC_LOAD_ADD:
2783   case ISD::ATOMIC_LOAD_SUB:
2784   case ISD::ATOMIC_LOAD_AND:
2785   case ISD::ATOMIC_LOAD_OR:
2786   case ISD::ATOMIC_LOAD_XOR:
2787   case ISD::ATOMIC_LOAD_NAND:
2788   case ISD::ATOMIC_LOAD_MIN:
2789   case ISD::ATOMIC_LOAD_MAX:
2790   case ISD::ATOMIC_LOAD_UMIN:
2791   case ISD::ATOMIC_LOAD_UMAX:
2792   case AMDGPUISD::ATOMIC_INC:
2793   case AMDGPUISD::ATOMIC_DEC: { // TODO: Target mem intrinsics.
2794     if (DCI.isBeforeLegalize())
2795       break;
2796 
2797     MemSDNode *MemNode = cast<MemSDNode>(N);
2798     SDValue Ptr = MemNode->getBasePtr();
2799 
2800     // TODO: We could also do this for multiplies.
2801     unsigned AS = MemNode->getAddressSpace();
2802     if (Ptr.getOpcode() == ISD::SHL && AS != AMDGPUAS::PRIVATE_ADDRESS) {
2803       SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), AS, DCI);
2804       if (NewPtr) {
2805         SmallVector<SDValue, 8> NewOps(MemNode->op_begin(), MemNode->op_end());
2806 
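        // Regular stores keep their base pointer at operand index 2 (after the
        // chain and the stored value); the other memory nodes handled here
        // keep it at index 1.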
2807         NewOps[N->getOpcode() == ISD::STORE ? 2 : 1] = NewPtr;
2808         return SDValue(DAG.UpdateNodeOperands(MemNode, NewOps), 0);
2809       }
2810     }
2811     break;
2812   }
2813   case ISD::AND:
2814     return performAndCombine(N, DCI);
2815   case ISD::OR:
2816     return performOrCombine(N, DCI);
2817   case AMDGPUISD::FP_CLASS:
2818     return performClassCombine(N, DCI);
2819   case ISD::FCANONICALIZE:
2820     return performFCanonicalizeCombine(N, DCI);
2821   }
2822   return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
2823 }
2824 
2825 /// \brief Analyze the possible immediate value \p N.
2826 ///
2827 /// Returns -1 if it isn't an immediate, 0 if it's an inline immediate,
2828 /// and the immediate value if it's a literal immediate.
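///
/// For example, small values such as 0 or 1.0f can normally be encoded as
/// inline constants (result 0), whereas an arbitrary value like 0x12345678
/// has to be emitted as a separate 32-bit literal (the value itself is
/// returned).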
2829 int32_t SITargetLowering::analyzeImmediate(const SDNode *N) const {
2830 
2831   const SIInstrInfo *TII =
2832       static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
2833 
2834   if (const ConstantSDNode *Node = dyn_cast<ConstantSDNode>(N)) {
2835     if (TII->isInlineConstant(Node->getAPIntValue()))
2836       return 0;
2837 
2838     uint64_t Val = Node->getZExtValue();
2839     return isUInt<32>(Val) ? Val : -1;
2840   }
2841 
2842   if (const ConstantFPSDNode *Node = dyn_cast<ConstantFPSDNode>(N)) {
2843     if (TII->isInlineConstant(Node->getValueAPF().bitcastToAPInt()))
2844       return 0;
2845 
2846     if (Node->getValueType(0) == MVT::f32)
2847       return FloatToBits(Node->getValueAPF().convertToFloat());
2848 
2849     return -1;
2850   }
2851 
2852   return -1;
2853 }
2854 
2855 /// \brief Helper function for adjustWritemask
2856 static unsigned SubIdx2Lane(unsigned Idx) {
2857   switch (Idx) {
2858   default: return 0;
2859   case AMDGPU::sub0: return 0;
2860   case AMDGPU::sub1: return 1;
2861   case AMDGPU::sub2: return 2;
2862   case AMDGPU::sub3: return 3;
2863   }
2864 }
2865 
2866 /// \brief Adjust the writemask of MIMG instructions
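///
/// The dmask operand of a MIMG load selects which of the x, y, z and w
/// components are returned. If some components are never extracted from the
/// result, they can be dropped from the dmask so fewer registers are written.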
2867 void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
2868                                        SelectionDAG &DAG) const {
2869   SDNode *Users[4] = { };
2870   unsigned Lane = 0;
2871   unsigned DmaskIdx = (Node->getNumOperands() - Node->getNumValues() == 9) ? 2 : 3;
2872   unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
2873   unsigned NewDmask = 0;
2874 
2875   // Try to figure out the used register components
2876   for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
2877        I != E; ++I) {
2878 
2879     // Abort if we can't understand the usage
2880     if (!I->isMachineOpcode() ||
2881         I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
2882       return;
2883 
2884     // Lane means which subreg of %VGPRa_VGPRb_VGPRc_VGPRd is used.
2885     // Note that subregs are packed, i.e. Lane==0 is the first bit set
2886     // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
2887     // set, etc.
2888     Lane = SubIdx2Lane(I->getConstantOperandVal(1));
2889 
2890     // Set which texture component corresponds to the lane.
2891     unsigned Comp;
2892     for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) {
2893       assert(Dmask);
2894       Comp = countTrailingZeros(Dmask);
2895       Dmask &= ~(1 << Comp);
2896     }
2897 
2898     // Abort if we have more than one user per component
2899     if (Users[Lane])
2900       return;
2901 
2902     Users[Lane] = *I;
2903     NewDmask |= 1 << Comp;
2904   }
2905 
2906   // Abort if there's no change
2907   if (NewDmask == OldDmask)
2908     return;
2909 
2910   // Adjust the writemask in the node
2911   std::vector<SDValue> Ops;
2912   Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
2913   Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
2914   Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
2915   Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops);
2916 
2917   // If only one lane is used (i.e. NewDmask has a single bit set), replace
2918   // the extract of that lane with a plain copy of the result.
2919   if (NewDmask && (NewDmask & (NewDmask-1)) == 0) {
2920     SDValue RC = DAG.getTargetConstant(AMDGPU::VGPR_32RegClassID, SDLoc(),
2921                                        MVT::i32);
2922     SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
2923                                       SDLoc(), Users[Lane]->getValueType(0),
2924                                       SDValue(Node, 0), RC);
2925     DAG.ReplaceAllUsesWith(Users[Lane], Copy);
2926     return;
2927   }
2928 
2929   // Update the users of the node with the new indices
2930   for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) {
2931 
2932     SDNode *User = Users[i];
2933     if (!User)
2934       continue;
2935 
2936     SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
2937     DAG.UpdateNodeOperands(User, User->getOperand(0), Op);
2938 
2939     switch (Idx) {
2940     default: break;
2941     case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
2942     case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
2943     case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
2944     }
2945   }
2946 }
2947 
2948 static bool isFrameIndexOp(SDValue Op) {
2949   if (Op.getOpcode() == ISD::AssertZext)
2950     Op = Op.getOperand(0);
2951 
2952   return isa<FrameIndexSDNode>(Op);
2953 }
2954 
2955 /// \brief Legalize target-independent instructions (e.g. INSERT_SUBREG)
2956 /// with frame index operands.
2957 /// LLVM assumes that inputs to these instructions are registers.
2958 void SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
2959                                                      SelectionDAG &DAG) const {
2960 
2961   SmallVector<SDValue, 8> Ops;
2962   for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
2963     if (!isFrameIndexOp(Node->getOperand(i))) {
2964       Ops.push_back(Node->getOperand(i));
2965       continue;
2966     }
2967 
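    // Frame indices are not valid register operands, so materialize them with
    // an S_MOV_B32 and use the result register instead.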
2968     SDLoc DL(Node);
2969     Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
2970                                      Node->getOperand(i).getValueType(),
2971                                      Node->getOperand(i)), 0));
2972   }
2973 
2974   DAG.UpdateNodeOperands(Node, Ops);
2975 }
2976 
2977 /// \brief Fold the instructions after selecting them.
2978 SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
2979                                           SelectionDAG &DAG) const {
2980   const SIInstrInfo *TII =
2981       static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
2982   unsigned Opcode = Node->getMachineOpcode();
2983 
2984   if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore())
2985     adjustWritemask(Node, DAG);
2986 
2987   if (Opcode == AMDGPU::INSERT_SUBREG ||
2988       Opcode == AMDGPU::REG_SEQUENCE) {
2989     legalizeTargetIndependentNode(Node, DAG);
2990     return Node;
2991   }
2992   return Node;
2993 }
2994 
2995 /// \brief Assign the register class depending on the number of
2996 /// bits set in the writemask
2997 void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
2998                                                      SDNode *Node) const {
2999   const SIInstrInfo *TII =
3000       static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
3001 
3002   MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
3003 
3004   if (TII->isVOP3(MI->getOpcode())) {
3005     // Make sure constant bus requirements are respected.
3006     TII->legalizeOperandsVOP3(MRI, MI);
3007     return;
3008   }
3009 
3010   if (TII->isMIMG(*MI)) {
3011     unsigned VReg = MI->getOperand(0).getReg();
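    // The dmask immediate sits at a different operand index depending on the
    // MIMG operand layout, so infer its position from the operand count.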
3012     unsigned DmaskIdx = MI->getNumOperands() == 12 ? 3 : 4;
3013     unsigned Writemask = MI->getOperand(DmaskIdx).getImm();
3014     unsigned BitsSet = 0;
3015     for (unsigned i = 0; i < 4; ++i)
3016       BitsSet += Writemask & (1 << i) ? 1 : 0;
3017 
3018     const TargetRegisterClass *RC;
3019     switch (BitsSet) {
3020     default: return;
3021     case 1:  RC = &AMDGPU::VGPR_32RegClass; break;
3022     case 2:  RC = &AMDGPU::VReg_64RegClass; break;
3023     case 3:  RC = &AMDGPU::VReg_96RegClass; break;
3024     }
3025 
3026     unsigned NewOpcode = TII->getMaskedMIMGOp(MI->getOpcode(), BitsSet);
3027     MI->setDesc(TII->get(NewOpcode));
3028     MRI.setRegClass(VReg, RC);
3029     return;
3030   }
3031 
3032   // Replace unused atomics with the no return version.
3033   int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI->getOpcode());
3034   if (NoRetAtomicOp != -1) {
3035     if (!Node->hasAnyUseOfValue(0)) {
3036       MI->setDesc(TII->get(NoRetAtomicOp));
3037       MI->RemoveOperand(0);
3038       return;
3039     }
3040 
3041     // For mubuf_atomic_cmpswap, we need to have tablegen use an extract_subreg
3042     // instruction, because the return type of these instructions is a vec2 of
3043     // the memory type, so it can be tied to the input operand.
3044     // This means these instructions always have a use, so we need to add a
3045     // special case to check if the atomic has only one extract_subreg use,
3046     // which itself has no uses.
3047     if ((Node->hasNUsesOfValue(1, 0) &&
3048          Node->use_begin()->isMachineOpcode() &&
3049          Node->use_begin()->getMachineOpcode() == AMDGPU::EXTRACT_SUBREG &&
3050          !Node->use_begin()->hasAnyUseOfValue(0))) {
3051       unsigned Def = MI->getOperand(0).getReg();
3052 
3053       // Change this into a noret atomic.
3054       MI->setDesc(TII->get(NoRetAtomicOp));
3055       MI->RemoveOperand(0);
3056 
3057       // If we only remove the def operand from the atomic instruction, the
3058       // extract_subreg will be left with a use of a vreg without a def.
3059       // So we need to insert an implicit_def to avoid machine verifier
3060       // errors.
3061       BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
3062               TII->get(AMDGPU::IMPLICIT_DEF), Def);
3063     }
3064     return;
3065   }
3066 }
3067 
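/// Materialize a 32-bit immediate in an SGPR with S_MOV_B32 so it can be used
/// as an operand of the REG_SEQUENCE nodes built below.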
3068 static SDValue buildSMovImm32(SelectionDAG &DAG, SDLoc DL, uint64_t Val) {
3069   SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
3070   return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
3071 }
3072 
3073 MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
3074                                                 SDLoc DL,
3075                                                 SDValue Ptr) const {
3076   const SIInstrInfo *TII =
3077     static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
3078 
3079   // Build the half of the subregister with the constants before building the
3080   // full 128-bit register. If we are building multiple resource descriptors,
3081   // this will allow CSEing of the 2-component register.
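  // In the final descriptor the 64-bit pointer lands in dwords 0-1 and this
  // constant pair in dwords 2-3 (a zero dword followed by the upper half of
  // the default buffer resource data format).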
3082   const SDValue Ops0[] = {
3083     DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
3084     buildSMovImm32(DAG, DL, 0),
3085     DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
3086     buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
3087     DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
3088   };
3089 
3090   SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
3091                                                 MVT::v2i32, Ops0), 0);
3092 
3093   // Combine the constants and the pointer.
3094   const SDValue Ops1[] = {
3095     DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
3096     Ptr,
3097     DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
3098     SubRegHi,
3099     DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
3100   };
3101 
3102   return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
3103 }
3104 
3105 /// \brief Return a resource descriptor with the 'Add TID' bit enabled
3106 ///        The TID (Thread ID) is multiplied by the stride value (bits [61:48]
3107 ///        of the resource descriptor) to create an offset, which is added to
3108 ///        the resource pointer.
3109 MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG,
3110                                            SDLoc DL,
3111                                            SDValue Ptr,
3112                                            uint32_t RsrcDword1,
3113                                            uint64_t RsrcDword2And3) const {
3114   SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
3115   SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
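  // RsrcDword1 shares dword 1 with the upper half of the pointer (it carries
  // fields such as the stride described above), so OR it into PtrHi rather
  // than overwriting it.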
3116   if (RsrcDword1) {
3117     PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
3118                                      DAG.getConstant(RsrcDword1, DL, MVT::i32)),
3119                     0);
3120   }
3121 
3122   SDValue DataLo = buildSMovImm32(DAG, DL,
3123                                   RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
3124   SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
3125 
3126   const SDValue Ops[] = {
3127     DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
3128     PtrLo,
3129     DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
3130     PtrHi,
3131     DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
3132     DataLo,
3133     DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
3134     DataHi,
3135     DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)
3136   };
3137 
3138   return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
3139 }
3140 
3141 SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
3142                                                const TargetRegisterClass *RC,
3143                                                unsigned Reg, EVT VT) const {
3144   SDValue VReg = AMDGPUTargetLowering::CreateLiveInRegister(DAG, RC, Reg, VT);
3145 
3146   return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(DAG.getEntryNode()),
3147                             cast<RegisterSDNode>(VReg)->getReg(), VT);
3148 }
3149 
3150 //===----------------------------------------------------------------------===//
3151 //                         SI Inline Assembly Support
3152 //===----------------------------------------------------------------------===//
3153 
3154 std::pair<unsigned, const TargetRegisterClass *>
3155 SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
3156                                                StringRef Constraint,
3157                                                MVT VT) const {
3158 
3159   if (Constraint.size() == 1) {
3160     switch (Constraint[0]) {
3161     case 's':
3162     case 'r':
3163       switch (VT.getSizeInBits()) {
3164       default:
3165         return std::make_pair(0U, nullptr);
3166       case 32:
3167         return std::make_pair(0U, &AMDGPU::SGPR_32RegClass);
3168       case 64:
3169         return std::make_pair(0U, &AMDGPU::SGPR_64RegClass);
3170       case 128:
3171         return std::make_pair(0U, &AMDGPU::SReg_128RegClass);
3172       case 256:
3173         return std::make_pair(0U, &AMDGPU::SReg_256RegClass);
3174       }
3175 
3176     case 'v':
3177       switch (VT.getSizeInBits()) {
3178       default:
3179         return std::make_pair(0U, nullptr);
3180       case 32:
3181         return std::make_pair(0U, &AMDGPU::VGPR_32RegClass);
3182       case 64:
3183         return std::make_pair(0U, &AMDGPU::VReg_64RegClass);
3184       case 96:
3185         return std::make_pair(0U, &AMDGPU::VReg_96RegClass);
3186       case 128:
3187         return std::make_pair(0U, &AMDGPU::VReg_128RegClass);
3188       case 256:
3189         return std::make_pair(0U, &AMDGPU::VReg_256RegClass);
3190       case 512:
3191         return std::make_pair(0U, &AMDGPU::VReg_512RegClass);
3192       }
3193     }
3194   }
3195 
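  // Multi-character constraints are expected to name an individual VGPR or
  // SGPR. If everything after the bank letter parses as a decimal index that
  // is in range for the class, hand back that physical register; otherwise
  // defer to the generic handling below.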
3196   if (Constraint.size() > 1) {
3197     const TargetRegisterClass *RC = nullptr;
3198     if (Constraint[1] == 'v') {
3199       RC = &AMDGPU::VGPR_32RegClass;
3200     } else if (Constraint[1] == 's') {
3201       RC = &AMDGPU::SGPR_32RegClass;
3202     }
3203 
3204     if (RC) {
3205       uint32_t Idx;
3206       bool Failed = Constraint.substr(2).getAsInteger(10, Idx);
3207       if (!Failed && Idx < RC->getNumRegs())
3208         return std::make_pair(RC->getRegister(Idx), RC);
3209     }
3210   }
3211   return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
3212 }
3213 
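// 's' and 'v' are register-class constraints, selecting the scalar (SGPR) and
// vector (VGPR) register banks respectively.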
3214 SITargetLowering::ConstraintType
3215 SITargetLowering::getConstraintType(StringRef Constraint) const {
3216   if (Constraint.size() == 1) {
3217     switch (Constraint[0]) {
3218     default: break;
3219     case 's':
3220     case 'v':
3221       return C_RegisterClass;
3222     }
3223   }
3224   return TargetLowering::getConstraintType(Constraint);
3225 }
3226