//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Custom DAG lowering for SI
//
//===----------------------------------------------------------------------===//
14 
15 #ifdef _MSC_VER
16 // Provide M_PI.
17 #define _USE_MATH_DEFINES
18 #include <cmath>
19 #endif
20 
21 #include "AMDGPU.h"
22 #include "AMDGPUIntrinsicInfo.h"
23 #include "AMDGPUSubtarget.h"
24 #include "SIISelLowering.h"
25 #include "SIInstrInfo.h"
26 #include "SIMachineFunctionInfo.h"
27 #include "SIRegisterInfo.h"
28 #include "llvm/ADT/BitVector.h"
29 #include "llvm/ADT/StringSwitch.h"
30 #include "llvm/CodeGen/CallingConvLower.h"
31 #include "llvm/CodeGen/MachineInstrBuilder.h"
32 #include "llvm/CodeGen/MachineRegisterInfo.h"
33 #include "llvm/CodeGen/SelectionDAG.h"
34 #include "llvm/IR/DiagnosticInfo.h"
35 #include "llvm/IR/Function.h"
36 
37 using namespace llvm;
38 
39 static unsigned findFirstFreeSGPR(CCState &CCInfo) {
40   unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
41   for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
42     if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
43       return AMDGPU::SGPR0 + Reg;
44     }
45   }
46   llvm_unreachable("Cannot allocate sgpr");
47 }
48 
// Configure SI custom lowering: map each value type to a register class,
// declare the legalization action for each (operation, type) pair, register
// the opcodes this target wants to DAG-combine, and pick the scheduling
// policy.
SITargetLowering::SITargetLowering(TargetMachine &TM,
                                   const AMDGPUSubtarget &STI)
    : AMDGPUTargetLowering(TM, STI) {
  // Register classes: SReg_* are scalar (SGPR) classes, VReg_*/VGPR_* are
  // vector (VGPR) classes; the suffix encodes the bit width.
  addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
  addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);

  addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
  addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);

  addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
  addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
  addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);

  addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass);
  addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass);

  addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
  addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);

  addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass);
  addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);

  addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass);
  addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);

  computeRegisterProperties(STI.getRegisterInfo());

  // Wide vector shuffles are expanded (see isShuffleMaskLegal below, which
  // reports no shuffle masks as legal).
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);

  // 32-bit integer add/sub with carry chaining are natively supported.
  setOperationAction(ISD::ADD, MVT::i32, Legal);
  setOperationAction(ISD::ADDC, MVT::i32, Legal);
  setOperationAction(ISD::ADDE, MVT::i32, Legal);
  setOperationAction(ISD::SUBC, MVT::i32, Legal);
  setOperationAction(ISD::SUBE, MVT::i32, Legal);

  setOperationAction(ISD::FSIN, MVT::f32, Custom);
  setOperationAction(ISD::FCOS, MVT::f32, Custom);

  setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
  setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);

  // We need to custom lower vector stores from local memory
  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
  setOperationAction(ISD::LOAD, MVT::v16i32, Custom);

  setOperationAction(ISD::STORE, MVT::v8i32, Custom);
  setOperationAction(ISD::STORE, MVT::v16i32, Custom);

  setOperationAction(ISD::STORE, MVT::i1, Custom);
  setOperationAction(ISD::STORE, MVT::v4i32, Custom);

  // f64 select is performed on the i64 bit pattern instead.
  setOperationAction(ISD::SELECT, MVT::i64, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Promote);
  AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);

  setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);

  setOperationAction(ISD::SETCC, MVT::i1, Promote);
  setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
  setOperationAction(ISD::SETCC, MVT::v4i1, Expand);

  setOperationAction(ISD::BSWAP, MVT::i32, Legal);
  setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);

  // SIGN_EXTEND_INREG: legal for scalar i1/i8/i16/i32, custom-lowered for
  // the small vector forms.
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);

  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);

  // Target intrinsics are lowered in the custom hooks.
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v16i8, Custom);
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);

  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);

  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  setOperationAction(ISD::BR_CC, MVT::i32, Expand);
  setOperationAction(ISD::BR_CC, MVT::i64, Expand);
  setOperationAction(ISD::BR_CC, MVT::f32, Expand);
  setOperationAction(ISD::BR_CC, MVT::f64, Expand);

  // On SI this is s_memtime and s_memrealtime on VI.
  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);

  // Extending-load actions for every scalar integer result type except i64:
  // i1 sources are promoted, i8/i16 are legal, i32 must be expanded.
  for (MVT VT : MVT::integer_valuetypes()) {
    if (VT == MVT::i64)
      continue;

    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal);
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal);
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand);

    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand);

    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand);
  }

  for (MVT VT : MVT::integer_vector_valuetypes()) {
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i16, Expand);
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v16i16, Expand);
  }

  for (MVT VT : MVT::fp_valuetypes())
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);

  // No native truncating stores; all are expanded.
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
  setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
  setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);


  setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);

  setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
  setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);

  setOperationAction(ISD::LOAD, MVT::i1, Custom);

  // v2i64 memory operations are carried out as v4i32.
  setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);

  setOperationAction(ISD::STORE, MVT::v2i64, Promote);
  AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);

  setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand);

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
  setOperationAction(ISD::FrameIndex, MVT::i32, Custom);

  // These should use UDIVREM, so set them to expand
  setOperationAction(ISD::UDIV, MVT::i64, Expand);
  setOperationAction(ISD::UREM, MVT::i64, Expand);

  setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
  setOperationAction(ISD::SELECT, MVT::i1, Promote);

  setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand);


  setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);

  // We only support LOAD/STORE and vector manipulation ops for vectors
  // with > 4 elements.
  for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64}) {
    for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
      switch(Op) {
      case ISD::LOAD:
      case ISD::STORE:
      case ISD::BUILD_VECTOR:
      case ISD::BITCAST:
      case ISD::EXTRACT_VECTOR_ELT:
      case ISD::INSERT_VECTOR_ELT:
      case ISD::INSERT_SUBVECTOR:
      case ISD::EXTRACT_SUBVECTOR:
      case ISD::SCALAR_TO_VECTOR:
        // Keep whatever action was previously configured for these.
        break;
      case ISD::CONCAT_VECTORS:
        setOperationAction(Op, VT, Custom);
        break;
      default:
        setOperationAction(Op, VT, Expand);
        break;
      }
    }
  }

  // Most operations are naturally 32-bit vector operations. We only support
  // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
  for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
    setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);

    setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
    AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);

    setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
    AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
  }

  // These f64 rounding operations are only legal on CI (SEA_ISLANDS) and
  // newer generations.
  if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
    setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
    setOperationAction(ISD::FCEIL, MVT::f64, Legal);
    setOperationAction(ISD::FRINT, MVT::f64, Legal);
  }

  setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
  setOperationAction(ISD::FDIV, MVT::f32, Custom);
  setOperationAction(ISD::FDIV, MVT::f64, Custom);

  // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
  // and output demarshalling
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);

  // We can't return success/failure, only the old value,
  // let LLVM add the comparison
  setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand);
  setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand);

  if (Subtarget->hasFlatAddressSpace()) {
    setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
    setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
  }

  // Opcodes routed to this target's DAG-combine hook.
  setTargetDAGCombine(ISD::FADD);
  setTargetDAGCombine(ISD::FSUB);
  setTargetDAGCombine(ISD::FMINNUM);
  setTargetDAGCombine(ISD::FMAXNUM);
  setTargetDAGCombine(ISD::SMIN);
  setTargetDAGCombine(ISD::SMAX);
  setTargetDAGCombine(ISD::UMIN);
  setTargetDAGCombine(ISD::UMAX);
  setTargetDAGCombine(ISD::SETCC);
  setTargetDAGCombine(ISD::AND);
  setTargetDAGCombine(ISD::OR);
  setTargetDAGCombine(ISD::UINT_TO_FP);
  setTargetDAGCombine(ISD::FCANONICALIZE);

  // All memory operations. Some folding on the pointer operand is done to help
  // matching the constant offsets in the addressing modes.
  setTargetDAGCombine(ISD::LOAD);
  setTargetDAGCombine(ISD::STORE);
  setTargetDAGCombine(ISD::ATOMIC_LOAD);
  setTargetDAGCombine(ISD::ATOMIC_STORE);
  setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP);
  setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS);
  setTargetDAGCombine(ISD::ATOMIC_SWAP);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_AND);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_OR);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN);
  setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX);

  setSchedulingPreference(Sched::RegPressure);
}
322 
323 //===----------------------------------------------------------------------===//
324 // TargetLowering queries
325 //===----------------------------------------------------------------------===//
326 
327 bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
328                                           const CallInst &CI,
329                                           unsigned IntrID) const {
330   switch (IntrID) {
331   case Intrinsic::amdgcn_atomic_inc:
332   case Intrinsic::amdgcn_atomic_dec:
333     Info.opc = ISD::INTRINSIC_W_CHAIN;
334     Info.memVT = MVT::getVT(CI.getType());
335     Info.ptrVal = CI.getOperand(0);
336     Info.align = 0;
337     Info.vol = false;
338     Info.readMem = true;
339     Info.writeMem = true;
340     return true;
341   default:
342     return false;
343   }
344 }
345 
346 bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &,
347                                           EVT) const {
348   // SI has some legal vector types, but no legal vector operations. Say no
349   // shuffles are legal in order to prefer scalarizing some vector operations.
350   return false;
351 }
352 
353 bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
354   // Flat instructions do not have offsets, and only have the register
355   // address.
356   return AM.BaseOffs == 0 && (AM.Scale == 0 || AM.Scale == 1);
357 }
358 
359 bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
360   // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
361   // additionally can do r + r + i with addr64. 32-bit has more addressing
362   // mode options. Depending on the resource constant, it can also do
363   // (i64 r0) + (i32 r1) * (i14 i).
364   //
365   // Private arrays end up using a scratch buffer most of the time, so also
366   // assume those use MUBUF instructions. Scratch loads / stores are currently
367   // implemented as mubuf instructions with offen bit set, so slightly
368   // different than the normal addr64.
369   if (!isUInt<12>(AM.BaseOffs))
370     return false;
371 
372   // FIXME: Since we can split immediate into soffset and immediate offset,
373   // would it make sense to allow any immediate?
374 
375   switch (AM.Scale) {
376   case 0: // r + i or just i, depending on HasBaseReg.
377     return true;
378   case 1:
379     return true; // We have r + r or r + i.
380   case 2:
381     if (AM.HasBaseReg) {
382       // Reject 2 * r + r.
383       return false;
384     }
385 
386     // Allow 2 * r as r + r
387     // Or  2 * r + i is allowed as r + r + i.
388     return true;
389   default: // Don't allow n * r
390     return false;
391   }
392 }
393 
// Decide whether the addressing mode described by AM (BaseGV + BaseReg +
// Scale * ScaleReg + BaseOffs) can be encoded by the instructions used for
// an access of type Ty in address space AS. Dispatches per address space to
// the MUBUF / FLAT / SMRD / DS encoding rules.
bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
                                             const AddrMode &AM, Type *Ty,
                                             unsigned AS) const {
  // No global is ever allowed as a base.
  if (AM.BaseGV)
    return false;

  switch (AS) {
  case AMDGPUAS::GLOBAL_ADDRESS: {
    if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
      // Assume the we will use FLAT for all global memory accesses
      // on VI.
      // FIXME: This assumption is currently wrong.  On VI we still use
      // MUBUF instructions for the r + i addressing mode.  As currently
      // implemented, the MUBUF instructions only work on buffer < 4GB.
      // It may be possible to support > 4GB buffers with MUBUF instructions,
      // by setting the stride value in the resource descriptor which would
      // increase the size limit to (stride * 4GB).  However, this is risky,
      // because it has never been validated.
      return isLegalFlatAddressingMode(AM);
    }

    return isLegalMUBUFAddressingMode(AM);
  }
  case AMDGPUAS::CONSTANT_ADDRESS: {
    // If the offset isn't a multiple of 4, it probably isn't going to be
    // correctly aligned.
    if (AM.BaseOffs % 4 != 0)
      return isLegalMUBUFAddressingMode(AM);

    // There are no SMRD extloads, so if we have to do a small type access we
    // will use a MUBUF load.
    // FIXME?: We also need to do this if unaligned, but we don't know the
    // alignment here.
    if (DL.getTypeStoreSize(Ty) < 4)
      return isLegalMUBUFAddressingMode(AM);

    // The representable SMRD immediate offset range varies per generation.
    if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
      // SMRD instructions have an 8-bit, dword offset on SI.
      if (!isUInt<8>(AM.BaseOffs / 4))
        return false;
    } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
      // On CI+, this can also be a 32-bit literal constant offset. If it fits
      // in 8-bits, it can use a smaller encoding.
      if (!isUInt<32>(AM.BaseOffs / 4))
        return false;
    } else if (Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
      // On VI, these use the SMEM format and the offset is 20-bit in bytes.
      if (!isUInt<20>(AM.BaseOffs))
        return false;
    } else
      llvm_unreachable("unhandled generation");

    if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
      return true;

    if (AM.Scale == 1 && AM.HasBaseReg)
      return true;

    return false;
  }

  case AMDGPUAS::PRIVATE_ADDRESS:
    return isLegalMUBUFAddressingMode(AM);

  case AMDGPUAS::LOCAL_ADDRESS:
  case AMDGPUAS::REGION_ADDRESS: {
    // Basic, single offset DS instructions allow a 16-bit unsigned immediate
    // field.
    // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
    // an 8-bit dword offset but we don't know the alignment here.
    if (!isUInt<16>(AM.BaseOffs))
      return false;

    if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
      return true;

    if (AM.Scale == 1 && AM.HasBaseReg)
      return true;

    return false;
  }
  case AMDGPUAS::FLAT_ADDRESS:
  case AMDGPUAS::UNKNOWN_ADDRESS_SPACE:
    // For an unknown address space, this usually means that this is for some
    // reason being used for pure arithmetic, and not based on some addressing
    // computation. We don't have instructions that compute pointers with any
    // addressing modes, so treat them as having no offset like flat
    // instructions.
    return isLegalFlatAddressingMode(AM);

  default:
    llvm_unreachable("unhandled address space");
  }
}
489 
// Report whether a misaligned access of type VT in AddrSpace is allowed,
// and via *IsFast (if non-null) whether it is also fast. Note that exactly
// 32-bit types fall through to the final return and report false; only
// larger-than-32-bit types are accepted there, and only at 4-byte alignment.
bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
                                                      unsigned AddrSpace,
                                                      unsigned Align,
                                                      bool *IsFast) const {
  if (IsFast)
    *IsFast = false;

  // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
  // which isn't a simple VT.
  if (!VT.isSimple() || VT == MVT::Other)
    return false;

  // TODO - CI+ supports unaligned memory accesses, but this requires driver
  // support.

  // XXX - The only mention I see of this in the ISA manual is for LDS direct
  // reads the "byte address and must be dword aligned". Is it also true for the
  // normal loads and stores?
  if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS) {
    // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
    // aligned, 8 byte access in a single operation using ds_read2/write2_b32
    // with adjacent offsets.
    bool AlignedBy4 = (Align % 4 == 0);
    if (IsFast)
      *IsFast = AlignedBy4;
    return AlignedBy4;
  }

  // Smaller than dword value must be aligned.
  // FIXME: This should be allowed on CI+
  if (VT.bitsLT(MVT::i32))
    return false;

  // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
  // byte-address are ignored, thus forcing Dword alignment.
  // This applies to private, global, and constant memory.
  if (IsFast)
    *IsFast = true;

  return VT.bitsGT(MVT::i32) && Align % 4 == 0;
}
531 
532 EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
533                                           unsigned SrcAlign, bool IsMemset,
534                                           bool ZeroMemset,
535                                           bool MemcpyStrSrc,
536                                           MachineFunction &MF) const {
537   // FIXME: Should account for address space here.
538 
539   // The default fallback uses the private pointer size as a guess for a type to
540   // use. Make sure we switch these to 64-bit accesses.
541 
542   if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global
543     return MVT::v4i32;
544 
545   if (Size >= 8 && DstAlign >= 4)
546     return MVT::v2i32;
547 
548   // Use the default.
549   return MVT::Other;
550 }
551 
552 static bool isFlatGlobalAddrSpace(unsigned AS) {
553   return AS == AMDGPUAS::GLOBAL_ADDRESS ||
554     AS == AMDGPUAS::FLAT_ADDRESS ||
555     AS == AMDGPUAS::CONSTANT_ADDRESS;
556 }
557 
558 bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
559                                            unsigned DestAS) const {
560   return isFlatGlobalAddrSpace(SrcAS) &&  isFlatGlobalAddrSpace(DestAS);
561 }
562 
563 
564 bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
565   const MemSDNode *MemNode = cast<MemSDNode>(N);
566   const Value *Ptr = MemNode->getMemOperand()->getValue();
567 
568   // UndefValue means this is a load of a kernel input.  These are uniform.
569   // Sometimes LDS instructions have constant pointers
570   if (isa<UndefValue>(Ptr) || isa<Argument>(Ptr) || isa<Constant>(Ptr) ||
571       isa<GlobalValue>(Ptr))
572     return true;
573 
574   const Instruction *I = dyn_cast_or_null<Instruction>(Ptr);
575   return I && I->getMetadata("amdgpu.uniform");
576 }
577 
578 TargetLoweringBase::LegalizeTypeAction
579 SITargetLowering::getPreferredVectorAction(EVT VT) const {
580   if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16))
581     return TypeSplitVector;
582 
583   return TargetLoweringBase::getPreferredVectorAction(VT);
584 }
585 
586 bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
587                                                          Type *Ty) const {
588   const SIInstrInfo *TII =
589       static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
590   return TII->isInlineConstant(Imm);
591 }
592 
593 bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
594 
595   // SimplifySetCC uses this function to determine whether or not it should
596   // create setcc with i1 operands.  We don't have instructions for i1 setcc.
597   if (VT == MVT::i1 && Op == ISD::SETCC)
598     return false;
599 
600   return TargetLowering::isTypeDesirableForOp(Op, VT);
601 }
602 
// Load one kernel argument: an extending load of memory type MemVT (widened
// to VT) from constant memory, at byte offset Offset from the kernarg
// segment base pointer. Signed selects sext vs. zext for integer extloads;
// floating-point memory types always use EXTLOAD.
SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
                                         SDLoc SL, SDValue Chain,
                                         unsigned Offset, bool Signed) const {
  const DataLayout &DL = DAG.getDataLayout();
  MachineFunction &MF = DAG.getMachineFunction();
  const SIRegisterInfo *TRI =
      static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo());
  // Physical register holding the preloaded kernarg segment pointer.
  unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);

  Type *Ty = VT.getTypeForEVT(*DAG.getContext());

  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
  MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
  PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
  // Read the kernarg base out of its live-in virtual register, then add the
  // argument's byte offset.
  SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
                                       MRI.getLiveInVirtReg(InputPtrReg), PtrVT);
  SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
                            DAG.getConstant(Offset, SL, PtrVT));
  // Unindexed load: the offset operand is undef.
  SDValue PtrOffset = DAG.getUNDEF(PtrVT);
  // No IR-level pointer is available here, so use an UndefValue of the right
  // pointer type for the machine pointer info.
  MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));

  unsigned Align = DL.getABITypeAlignment(Ty);

  ISD::LoadExtType ExtTy = Signed ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
  if (MemVT.isFloatingPoint())
    ExtTy = ISD::EXTLOAD;

  // Kernel arguments never change during execution, hence invariant.
  return DAG.getLoad(ISD::UNINDEXED, ExtTy,
                     VT, SL, Chain, Ptr, PtrOffset, PtrInfo, MemVT,
                     false, // isVolatile
                     true, // isNonTemporal
                     true, // isInvariant
                     Align); // Alignment
}
637 
638 SDValue SITargetLowering::LowerFormalArguments(
639     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
640     const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG,
641     SmallVectorImpl<SDValue> &InVals) const {
642   const SIRegisterInfo *TRI =
643       static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
644 
645   MachineFunction &MF = DAG.getMachineFunction();
646   FunctionType *FType = MF.getFunction()->getFunctionType();
647   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
648   const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
649 
650   if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
651     const Function *Fn = MF.getFunction();
652     DiagnosticInfoUnsupported NoGraphicsHSA(
653         *Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
654     DAG.getContext()->diagnose(NoGraphicsHSA);
655     return SDValue();
656   }
657 
658   SmallVector<ISD::InputArg, 16> Splits;
659   BitVector Skipped(Ins.size());
660 
661   for (unsigned i = 0, e = Ins.size(), PSInputNum = 0; i != e; ++i) {
662     const ISD::InputArg &Arg = Ins[i];
663 
664     // First check if it's a PS input addr
665     if (CallConv == CallingConv::AMDGPU_PS && !Arg.Flags.isInReg() &&
666         !Arg.Flags.isByVal() && PSInputNum <= 15) {
667 
668       if (!Arg.Used && !Info->isPSInputAllocated(PSInputNum)) {
669         // We can safely skip PS inputs
670         Skipped.set(i);
671         ++PSInputNum;
672         continue;
673       }
674 
675       Info->markPSInputAllocated(PSInputNum);
676       if (Arg.Used)
677         Info->PSInputEna |= 1 << PSInputNum;
678 
679       ++PSInputNum;
680     }
681 
682     // Second split vertices into their elements
683     if (AMDGPU::isShader(CallConv) &&
684         Arg.VT.isVector()) {
685       ISD::InputArg NewArg = Arg;
686       NewArg.Flags.setSplit();
687       NewArg.VT = Arg.VT.getVectorElementType();
688 
689       // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
690       // three or five element vertex only needs three or five registers,
691       // NOT four or eight.
692       Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
693       unsigned NumElements = ParamType->getVectorNumElements();
694 
695       for (unsigned j = 0; j != NumElements; ++j) {
696         Splits.push_back(NewArg);
697         NewArg.PartOffset += NewArg.VT.getStoreSize();
698       }
699 
700     } else if (AMDGPU::isShader(CallConv)) {
701       Splits.push_back(Arg);
702     }
703   }
704 
705   SmallVector<CCValAssign, 16> ArgLocs;
706   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
707                  *DAG.getContext());
708 
709   // At least one interpolation mode must be enabled or else the GPU will hang.
710   //
711   // Check PSInputAddr instead of PSInputEna. The idea is that if the user set
712   // PSInputAddr, the user wants to enable some bits after the compilation
713   // based on run-time states. Since we can't know what the final PSInputEna
714   // will look like, so we shouldn't do anything here and the user should take
715   // responsibility for the correct programming.
716   //
717   // Otherwise, the following restrictions apply:
718   // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
719   // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
720   //   enabled too.
721   if (CallConv == CallingConv::AMDGPU_PS &&
722       ((Info->getPSInputAddr() & 0x7F) == 0 ||
723        ((Info->getPSInputAddr() & 0xF) == 0 &&
724 	Info->isPSInputAllocated(11)))) {
725     CCInfo.AllocateReg(AMDGPU::VGPR0);
726     CCInfo.AllocateReg(AMDGPU::VGPR1);
727     Info->markPSInputAllocated(0);
728     Info->PSInputEna |= 1;
729   }
730 
731   if (!AMDGPU::isShader(CallConv)) {
732     getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins,
733                             Splits);
734 
735     assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
736   } else {
737     assert(!Info->hasPrivateSegmentBuffer() && !Info->hasDispatchPtr() &&
738            !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&
739            !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
740            !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
741            !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
742            !Info->hasWorkItemIDZ());
743   }
744 
745   // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
746   if (Info->hasPrivateSegmentBuffer()) {
747     unsigned PrivateSegmentBufferReg = Info->addPrivateSegmentBuffer(*TRI);
748     MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SReg_128RegClass);
749     CCInfo.AllocateReg(PrivateSegmentBufferReg);
750   }
751 
752   if (Info->hasDispatchPtr()) {
753     unsigned DispatchPtrReg = Info->addDispatchPtr(*TRI);
754     MF.addLiveIn(DispatchPtrReg, &AMDGPU::SReg_64RegClass);
755     CCInfo.AllocateReg(DispatchPtrReg);
756   }
757 
758   if (Info->hasQueuePtr()) {
759     unsigned QueuePtrReg = Info->addQueuePtr(*TRI);
760     MF.addLiveIn(QueuePtrReg, &AMDGPU::SReg_64RegClass);
761     CCInfo.AllocateReg(QueuePtrReg);
762   }
763 
764   if (Info->hasKernargSegmentPtr()) {
765     unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI);
766     MF.addLiveIn(InputPtrReg, &AMDGPU::SReg_64RegClass);
767     CCInfo.AllocateReg(InputPtrReg);
768   }
769 
770   if (Info->hasFlatScratchInit()) {
771     unsigned FlatScratchInitReg = Info->addFlatScratchInit(*TRI);
772     MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SReg_64RegClass);
773     CCInfo.AllocateReg(FlatScratchInitReg);
774   }
775 
776   AnalyzeFormalArguments(CCInfo, Splits);
777 
778   SmallVector<SDValue, 16> Chains;
779 
780   for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
781 
782     const ISD::InputArg &Arg = Ins[i];
783     if (Skipped[i]) {
784       InVals.push_back(DAG.getUNDEF(Arg.VT));
785       continue;
786     }
787 
788     CCValAssign &VA = ArgLocs[ArgIdx++];
789     MVT VT = VA.getLocVT();
790 
791     if (VA.isMemLoc()) {
792       VT = Ins[i].VT;
793       EVT MemVT = Splits[i].VT;
794       const unsigned Offset = Subtarget->getExplicitKernelArgOffset() +
795                               VA.getLocMemOffset();
796       // The first 36 bytes of the input buffer contains information about
797       // thread group and global sizes.
798       SDValue Arg = LowerParameter(DAG, VT, MemVT,  DL, Chain,
799                                    Offset, Ins[i].Flags.isSExt());
800       Chains.push_back(Arg.getValue(1));
801 
802       auto *ParamTy =
803         dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
804       if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
805           ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
806         // On SI local pointers are just offsets into LDS, so they are always
807         // less than 16-bits.  On CI and newer they could potentially be
808         // real pointers, so we can't guarantee their size.
809         Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
810                           DAG.getValueType(MVT::i16));
811       }
812 
813       InVals.push_back(Arg);
814       Info->ABIArgOffset = Offset + MemVT.getStoreSize();
815       continue;
816     }
817     assert(VA.isRegLoc() && "Parameter must be in a register!");
818 
819     unsigned Reg = VA.getLocReg();
820 
821     if (VT == MVT::i64) {
822       // For now assume it is a pointer
823       Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0,
824                                      &AMDGPU::SReg_64RegClass);
825       Reg = MF.addLiveIn(Reg, &AMDGPU::SReg_64RegClass);
826       SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
827       InVals.push_back(Copy);
828       continue;
829     }
830 
831     const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
832 
833     Reg = MF.addLiveIn(Reg, RC);
834     SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
835 
836     if (Arg.VT.isVector()) {
837 
838       // Build a vector from the registers
839       Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
840       unsigned NumElements = ParamType->getVectorNumElements();
841 
842       SmallVector<SDValue, 4> Regs;
843       Regs.push_back(Val);
844       for (unsigned j = 1; j != NumElements; ++j) {
845         Reg = ArgLocs[ArgIdx++].getLocReg();
846         Reg = MF.addLiveIn(Reg, RC);
847 
848         SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
849         Regs.push_back(Copy);
850       }
851 
852       // Fill up the missing vector elements
853       NumElements = Arg.VT.getVectorNumElements() - NumElements;
854       Regs.append(NumElements, DAG.getUNDEF(VT));
855 
856       InVals.push_back(DAG.getBuildVector(Arg.VT, DL, Regs));
857       continue;
858     }
859 
860     InVals.push_back(Val);
861   }
862 
863   // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
864   // these from the dispatch pointer.
865 
866   // Start adding system SGPRs.
867   if (Info->hasWorkGroupIDX()) {
868     unsigned Reg = Info->addWorkGroupIDX();
869     MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
870     CCInfo.AllocateReg(Reg);
871   }
872 
873   if (Info->hasWorkGroupIDY()) {
874     unsigned Reg = Info->addWorkGroupIDY();
875     MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
876     CCInfo.AllocateReg(Reg);
877   }
878 
879   if (Info->hasWorkGroupIDZ()) {
880     unsigned Reg = Info->addWorkGroupIDZ();
881     MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
882     CCInfo.AllocateReg(Reg);
883   }
884 
885   if (Info->hasWorkGroupInfo()) {
886     unsigned Reg = Info->addWorkGroupInfo();
887     MF.addLiveIn(Reg, &AMDGPU::SReg_32RegClass);
888     CCInfo.AllocateReg(Reg);
889   }
890 
891   if (Info->hasPrivateSegmentWaveByteOffset()) {
892     // Scratch wave offset passed in system SGPR.
893     unsigned PrivateSegmentWaveByteOffsetReg;
894 
895     if (AMDGPU::isShader(CallConv)) {
896       PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
897       Info->setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
898     } else
899       PrivateSegmentWaveByteOffsetReg = Info->addPrivateSegmentWaveByteOffset();
900 
901     MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
902     CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
903   }
904 
905   // Now that we've figured out where the scratch register inputs are, see if
906   // should reserve the arguments and use them directly.
907   bool HasStackObjects = MF.getFrameInfo()->hasStackObjects();
908   // Record that we know we have non-spill stack objects so we don't need to
909   // check all stack objects later.
910   if (HasStackObjects)
911     Info->setHasNonSpillStackObjects(true);
912 
913   if (ST.isAmdHsaOS()) {
914     // TODO: Assume we will spill without optimizations.
915     if (HasStackObjects) {
916       // If we have stack objects, we unquestionably need the private buffer
917       // resource. For the HSA ABI, this will be the first 4 user SGPR
918       // inputs. We can reserve those and use them directly.
919 
920       unsigned PrivateSegmentBufferReg = TRI->getPreloadedValue(
921         MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
922       Info->setScratchRSrcReg(PrivateSegmentBufferReg);
923 
924       unsigned PrivateSegmentWaveByteOffsetReg = TRI->getPreloadedValue(
925         MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
926       Info->setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
927     } else {
928       unsigned ReservedBufferReg
929         = TRI->reservedPrivateSegmentBufferReg(MF);
930       unsigned ReservedOffsetReg
931         = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);
932 
933       // We tentatively reserve the last registers (skipping the last two
934       // which may contain VCC). After register allocation, we'll replace
935       // these with the ones immediately after those which were really
936       // allocated. In the prologue copies will be inserted from the argument
937       // to these reserved registers.
938       Info->setScratchRSrcReg(ReservedBufferReg);
939       Info->setScratchWaveOffsetReg(ReservedOffsetReg);
940     }
941   } else {
942     unsigned ReservedBufferReg = TRI->reservedPrivateSegmentBufferReg(MF);
943 
944     // Without HSA, relocations are used for the scratch pointer and the
945     // buffer resource setup is always inserted in the prologue. Scratch wave
946     // offset is still in an input SGPR.
947     Info->setScratchRSrcReg(ReservedBufferReg);
948 
949     if (HasStackObjects) {
950       unsigned ScratchWaveOffsetReg = TRI->getPreloadedValue(
951         MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
952       Info->setScratchWaveOffsetReg(ScratchWaveOffsetReg);
953     } else {
954       unsigned ReservedOffsetReg
955         = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);
956       Info->setScratchWaveOffsetReg(ReservedOffsetReg);
957     }
958   }
959 
960   if (Info->hasWorkItemIDX()) {
961     unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X);
962     MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
963     CCInfo.AllocateReg(Reg);
964   }
965 
966   if (Info->hasWorkItemIDY()) {
967     unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y);
968     MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
969     CCInfo.AllocateReg(Reg);
970   }
971 
972   if (Info->hasWorkItemIDZ()) {
973     unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z);
974     MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
975     CCInfo.AllocateReg(Reg);
976   }
977 
978   if (Chains.empty())
979     return Chain;
980 
981   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
982 }
983 
/// Lower an outgoing return for a SI function. Shader calling conventions
/// split vector return values into scalar elements before assigning them to
/// return registers; compute functions defer to the generic AMDGPU lowering.
SDValue SITargetLowering::LowerReturn(SDValue Chain,
                                      CallingConv::ID CallConv,
                                      bool isVarArg,
                                      const SmallVectorImpl<ISD::OutputArg> &Outs,
                                      const SmallVectorImpl<SDValue> &OutVals,
                                      SDLoc DL, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();

  // Non-shader (compute) functions use the generic AMDGPU return lowering.
  if (!AMDGPU::isShader(CallConv))
    return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
                                             OutVals, DL, DAG);

  // Record whether this shader returns anything at all.
  Info->setIfReturnsVoid(Outs.size() == 0);

  SmallVector<ISD::OutputArg, 48> Splits;
  SmallVector<SDValue, 48> SplitVals;

  // Split vectors into their elements.
  for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
    const ISD::OutputArg &Out = Outs[i];

    if (Out.VT.isVector()) {
      MVT VT = Out.VT.getVectorElementType();
      ISD::OutputArg NewOut = Out;
      NewOut.Flags.setSplit();
      NewOut.VT = VT;

      // We want the original number of vector elements here, e.g.
      // three or five, not four or eight.
      unsigned NumElements = Out.ArgVT.getVectorNumElements();

      for (unsigned j = 0; j != NumElements; ++j) {
        SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, OutVals[i],
                                   DAG.getConstant(j, DL, MVT::i32));
        SplitVals.push_back(Elem);
        Splits.push_back(NewOut);
        // Advance the offset so each element records where it lives within
        // the original (pre-split) argument.
        NewOut.PartOffset += NewOut.VT.getStoreSize();
      }
    } else {
      SplitVals.push_back(OutVals[i]);
      Splits.push_back(Out);
    }
  }

  // CCValAssign - represent the assignment of the return value to a location.
  SmallVector<CCValAssign, 48> RVLocs;

  // CCState - Info about the registers and stack slots.
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());

  // Analyze outgoing return values.
  AnalyzeReturn(CCInfo, Splits);

  SDValue Flag;
  SmallVector<SDValue, 48> RetOps;
  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)

  // Copy the result values into the output registers.
  for (unsigned i = 0, realRVLocIdx = 0;
       i != RVLocs.size();
       ++i, ++realRVLocIdx) {
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");

    SDValue Arg = SplitVals[realRVLocIdx];

    // Copied from other backends.
    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full:
      break;
    case CCValAssign::BCvt:
      Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
      break;
    }

    // Glue each copy to the next so the register copies stay together in
    // front of the return.
    Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
    Flag = Chain.getValue(1);
    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
  }

  // Update chain and glue.
  RetOps[0] = Chain;
  if (Flag.getNode())
    RetOps.push_back(Flag);

  return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, RetOps);
}
1074 
1075 unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
1076                                              SelectionDAG &DAG) const {
1077   unsigned Reg = StringSwitch<unsigned>(RegName)
1078     .Case("m0", AMDGPU::M0)
1079     .Case("exec", AMDGPU::EXEC)
1080     .Case("exec_lo", AMDGPU::EXEC_LO)
1081     .Case("exec_hi", AMDGPU::EXEC_HI)
1082     .Case("flat_scratch", AMDGPU::FLAT_SCR)
1083     .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
1084     .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
1085     .Default(AMDGPU::NoRegister);
1086 
1087   if (Reg == AMDGPU::NoRegister) {
1088     report_fatal_error(Twine("invalid register name \""
1089                              + StringRef(RegName)  + "\"."));
1090 
1091   }
1092 
1093   if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
1094       Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
1095     report_fatal_error(Twine("invalid register \""
1096                              + StringRef(RegName)  + "\" for subtarget."));
1097   }
1098 
1099   switch (Reg) {
1100   case AMDGPU::M0:
1101   case AMDGPU::EXEC_LO:
1102   case AMDGPU::EXEC_HI:
1103   case AMDGPU::FLAT_SCR_LO:
1104   case AMDGPU::FLAT_SCR_HI:
1105     if (VT.getSizeInBits() == 32)
1106       return Reg;
1107     break;
1108   case AMDGPU::EXEC:
1109   case AMDGPU::FLAT_SCR:
1110     if (VT.getSizeInBits() == 64)
1111       return Reg;
1112     break;
1113   default:
1114     llvm_unreachable("missing register type checking");
1115   }
1116 
1117   report_fatal_error(Twine("invalid type for register \""
1118                            + StringRef(RegName) + "\"."));
1119 }
1120 
1121 MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
1122   MachineInstr *MI, MachineBasicBlock *BB) const {
1123   switch (MI->getOpcode()) {
1124   case AMDGPU::SI_INIT_M0: {
1125     const SIInstrInfo *TII =
1126       static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
1127     BuildMI(*BB, MI->getIterator(), MI->getDebugLoc(),
1128             TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1129       .addOperand(MI->getOperand(0));
1130     MI->eraseFromParent();
1131     break;
1132   }
1133   case AMDGPU::BRANCH:
1134     return BB;
1135   case AMDGPU::GET_GROUPSTATICSIZE: {
1136     const SIInstrInfo *TII =
1137       static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
1138     MachineFunction *MF = BB->getParent();
1139     SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1140     DebugLoc DL = MI->getDebugLoc();
1141     BuildMI (*BB, MI, DL, TII->get(AMDGPU::S_MOVK_I32))
1142       .addOperand(MI->getOperand(0))
1143       .addImm(MFI->LDSSize);
1144     MI->eraseFromParent();
1145     return BB;
1146   }
1147   default:
1148     return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
1149   }
1150   return BB;
1151 }
1152 
1153 bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
1154   // This currently forces unfolding various combinations of fsub into fma with
1155   // free fneg'd operands. As long as we have fast FMA (controlled by
1156   // isFMAFasterThanFMulAndFAdd), we should perform these.
1157 
1158   // When fma is quarter rate, for f64 where add / sub are at best half rate,
1159   // most of these combines appear to be cycle neutral but save on instruction
1160   // count / code size.
1161   return true;
1162 }
1163 
1164 EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
1165                                          EVT VT) const {
1166   if (!VT.isVector()) {
1167     return MVT::i1;
1168   }
1169   return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
1170 }
1171 
// Shift amounts are always legalized to 32-bit values, regardless of the
// width of the type being shifted.
MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT) const {
  return MVT::i32;
}
1175 
1176 // Answering this is somewhat tricky and depends on the specific device which
1177 // have different rates for fma or all f64 operations.
1178 //
1179 // v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
1180 // regardless of which device (although the number of cycles differs between
1181 // devices), so it is always profitable for f64.
1182 //
1183 // v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
1184 // only on full rate devices. Normally, we should prefer selecting v_mad_f32
1185 // which we can always do even without fused FP ops since it returns the same
1186 // result as the separate operations and since it is always full
1187 // rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
1188 // however does not support denormals, so we do report fma as faster if we have
1189 // a fast fma device and require denormals.
1190 //
1191 bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
1192   VT = VT.getScalarType();
1193 
1194   if (!VT.isSimple())
1195     return false;
1196 
1197   switch (VT.getSimpleVT().SimpleTy) {
1198   case MVT::f32:
1199     // This is as fast on some subtargets. However, we always have full rate f32
1200     // mad available which returns the same result as the separate operations
1201     // which we should prefer over fma. We can't use this if we want to support
1202     // denormals, so only report this in these cases.
1203     return Subtarget->hasFP32Denormals() && Subtarget->hasFastFMAF32();
1204   case MVT::f64:
1205     return true;
1206   default:
1207     break;
1208   }
1209 
1210   return false;
1211 }
1212 
1213 //===----------------------------------------------------------------------===//
1214 // Custom DAG Lowering Operations
1215 //===----------------------------------------------------------------------===//
1216 
1217 SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
1218   switch (Op.getOpcode()) {
1219   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
1220   case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
1221   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
1222   case ISD::LOAD: {
1223     SDValue Result = LowerLOAD(Op, DAG);
1224     assert((!Result.getNode() ||
1225             Result.getNode()->getNumValues() == 2) &&
1226            "Load should return a value and a chain");
1227     return Result;
1228   }
1229 
1230   case ISD::FSIN:
1231   case ISD::FCOS:
1232     return LowerTrig(Op, DAG);
1233   case ISD::SELECT: return LowerSELECT(Op, DAG);
1234   case ISD::FDIV: return LowerFDIV(Op, DAG);
1235   case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
1236   case ISD::STORE: return LowerSTORE(Op, DAG);
1237   case ISD::GlobalAddress: {
1238     MachineFunction &MF = DAG.getMachineFunction();
1239     SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1240     return LowerGlobalAddress(MFI, Op, DAG);
1241   }
1242   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
1243   case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
1244   case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
1245   case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
1246   }
1247   return SDValue();
1248 }
1249 
1250 /// \brief Helper function for LowerBRCOND
1251 static SDNode *findUser(SDValue Value, unsigned Opcode) {
1252 
1253   SDNode *Parent = Value.getNode();
1254   for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
1255        I != E; ++I) {
1256 
1257     if (I.getUse().get() != Value)
1258       continue;
1259 
1260     if (I->getOpcode() == Opcode)
1261       return *I;
1262   }
1263   return nullptr;
1264 }
1265 
1266 SDValue SITargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
1267 
1268   SDLoc SL(Op);
1269   FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Op);
1270   unsigned FrameIndex = FINode->getIndex();
1271 
1272   // A FrameIndex node represents a 32-bit offset into scratch memory. If the
1273   // high bit of a frame index offset were to be set, this would mean that it
1274   // represented an offset of ~2GB * 64 = ~128GB from the start of the scratch
1275   // buffer, with 64 being the number of threads per wave.
1276   //
1277   // The maximum private allocation for the entire GPU is 4G, and we are
1278   // concerned with the largest the index could ever be for an individual
1279   // workitem. This will occur with the minmum dispatch size. If a program
1280   // requires more, the dispatch size will be reduced.
1281   //
1282   // With this limit, we can mark the high bit of the FrameIndex node as known
1283   // zero, which is important, because it means in most situations we can prove
1284   // that values derived from FrameIndex nodes are non-negative. This enables us
1285   // to take advantage of more addressing modes when accessing scratch buffers,
1286   // since for scratch reads/writes, the register offset must always be
1287   // positive.
1288 
1289   uint64_t MaxGPUAlloc = UINT64_C(4) * 1024 * 1024 * 1024;
1290 
1291   // XXX - It is unclear if partial dispatch works. Assume it works at half wave
1292   // granularity. It is probably a full wave.
1293   uint64_t MinGranularity = 32;
1294 
1295   unsigned KnownBits = Log2_64(MaxGPUAlloc / MinGranularity);
1296   EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), KnownBits);
1297 
1298   SDValue TFI = DAG.getTargetFrameIndex(FrameIndex, MVT::i32);
1299   return DAG.getNode(ISD::AssertZext, SL, MVT::i32, TFI,
1300                      DAG.getValueType(ExtVT));
1301 }
1302 
1303 bool SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
1304   if (Intr->getOpcode() != ISD::INTRINSIC_W_CHAIN)
1305     return false;
1306 
1307   switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
1308   default: return false;
1309   case AMDGPUIntrinsic::amdgcn_if:
1310   case AMDGPUIntrinsic::amdgcn_else:
1311   case AMDGPUIntrinsic::amdgcn_break:
1312   case AMDGPUIntrinsic::amdgcn_if_break:
1313   case AMDGPUIntrinsic::amdgcn_else_break:
1314   case AMDGPUIntrinsic::amdgcn_loop:
1315   case AMDGPUIntrinsic::amdgcn_end_cf:
1316     return true;
1317   }
1318 }
1319 
1320 /// This transforms the control flow intrinsics to get the branch destination as
1321 /// last parameter, also switches branch target with BR if the need arise
1322 SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
1323                                       SelectionDAG &DAG) const {
1324 
1325   SDLoc DL(BRCOND);
1326 
1327   SDNode *Intr = BRCOND.getOperand(1).getNode();
1328   SDValue Target = BRCOND.getOperand(2);
1329   SDNode *BR = nullptr;
1330   SDNode *SetCC = nullptr;
1331 
1332   if (Intr->getOpcode() == ISD::SETCC) {
1333     // As long as we negate the condition everything is fine
1334     SetCC = Intr;
1335     Intr = SetCC->getOperand(0).getNode();
1336 
1337   } else {
1338     // Get the target from BR if we don't negate the condition
1339     BR = findUser(BRCOND, ISD::BR);
1340     Target = BR->getOperand(1);
1341   }
1342 
1343   if (Intr->getOpcode() != ISD::INTRINSIC_W_CHAIN) {
1344     // This is a uniform branch so we don't need to legalize.
1345     return BRCOND;
1346   }
1347 
1348   assert(!SetCC ||
1349         (SetCC->getConstantOperandVal(1) == 1 &&
1350          isCFIntrinsic(Intr) &&
1351          cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
1352                                                              ISD::SETNE));
1353 
1354   // Build the result and
1355   ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
1356 
1357   // operands of the new intrinsic call
1358   SmallVector<SDValue, 4> Ops;
1359   Ops.push_back(BRCOND.getOperand(0));
1360   Ops.append(Intr->op_begin() + 1, Intr->op_end());
1361   Ops.push_back(Target);
1362 
1363   // build the new intrinsic call
1364   SDNode *Result = DAG.getNode(
1365     Res.size() > 1 ? ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL,
1366     DAG.getVTList(Res), Ops).getNode();
1367 
1368   if (BR) {
1369     // Give the branch instruction our target
1370     SDValue Ops[] = {
1371       BR->getOperand(0),
1372       BRCOND.getOperand(2)
1373     };
1374     SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
1375     DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
1376     BR = NewBR.getNode();
1377   }
1378 
1379   SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
1380 
1381   // Copy the intrinsic results to registers
1382   for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
1383     SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
1384     if (!CopyToReg)
1385       continue;
1386 
1387     Chain = DAG.getCopyToReg(
1388       Chain, DL,
1389       CopyToReg->getOperand(1),
1390       SDValue(Result, i - 1),
1391       SDValue());
1392 
1393     DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
1394   }
1395 
1396   // Remove the old intrinsic from the chain
1397   DAG.ReplaceAllUsesOfValueWith(
1398     SDValue(Intr, Intr->getNumValues() - 1),
1399     Intr->getOperand(0));
1400 
1401   return Chain;
1402 }
1403 
/// Load the high 32 bits of the aperture base for address space \p AS (local
/// or private) from the amd_queue_t structure reachable through the queue
/// pointer user SGPR.
SDValue SITargetLowering::getSegmentAperture(unsigned AS,
                                             SelectionDAG &DAG) const {
  // NOTE(review): SL is default-constructed, so the nodes built here carry no
  // source location / debug info -- confirm this is intentional.
  SDLoc SL;
  MachineFunction &MF = DAG.getMachineFunction();
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  // The queue pointer arrives in an SGPR pair; materialize it as a live-in.
  SDValue QueuePtr = CreateLiveInRegister(
    DAG, &AMDGPU::SReg_64RegClass, Info->getQueuePtrUserSGPR(), MVT::i64);

  // Offset into amd_queue_t for group_segment_aperture_base_hi /
  // private_segment_aperture_base_hi.
  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;

  SDValue Ptr = DAG.getNode(ISD::ADD, SL, MVT::i64, QueuePtr,
                            DAG.getConstant(StructOffset, SL, MVT::i64));

  // TODO: Use custom target PseudoSourceValue.
  // TODO: We should use the value from the IR intrinsic call, but it might not
  // be available and how do we get it?
  Value *V = UndefValue::get(PointerType::get(Type::getInt8Ty(*DAG.getContext()),
                                              AMDGPUAS::CONSTANT_ADDRESS));

  // Load the 32-bit aperture-hi word. The three bools appear to be the
  // (volatile, non-temporal, invariant) flags of this getLoad overload --
  // verify against this LLVM version's SelectionDAG API.
  MachinePointerInfo PtrInfo(V, StructOffset);
  return DAG.getLoad(MVT::i32, SL, QueuePtr.getValue(1), Ptr,
                     PtrInfo, false,
                     false, true,
                     MinAlign(64, StructOffset));
}
1431 
1432 SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
1433                                              SelectionDAG &DAG) const {
1434   SDLoc SL(Op);
1435   const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op);
1436 
1437   SDValue Src = ASC->getOperand(0);
1438 
1439   // FIXME: Really support non-0 null pointers.
1440   SDValue SegmentNullPtr = DAG.getConstant(-1, SL, MVT::i32);
1441   SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
1442 
1443   // flat -> local/private
1444   if (ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
1445     if (ASC->getDestAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1446         ASC->getDestAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
1447       SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
1448       SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
1449 
1450       return DAG.getNode(ISD::SELECT, SL, MVT::i32,
1451                          NonNull, Ptr, SegmentNullPtr);
1452     }
1453   }
1454 
1455   // local/private -> flat
1456   if (ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
1457     if (ASC->getSrcAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1458         ASC->getSrcAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
1459       SDValue NonNull
1460         = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
1461 
1462       SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), DAG);
1463       SDValue CvtPtr
1464         = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
1465 
1466       return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull,
1467                          DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr),
1468                          FlatNullPtr);
1469     }
1470   }
1471 
1472   // global <-> flat are no-ops and never emitted.
1473 
1474   const MachineFunction &MF = DAG.getMachineFunction();
1475   DiagnosticInfoUnsupported InvalidAddrSpaceCast(
1476     *MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
1477   DAG.getContext()->diagnose(InvalidAddrSpaceCast);
1478 
1479   return DAG.getUNDEF(ASC->getValueType(0));
1480 }
1481 
1482 SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
1483                                              SDValue Op,
1484                                              SelectionDAG &DAG) const {
1485   GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
1486 
1487   if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
1488     return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
1489 
1490   SDLoc DL(GSD);
1491   const GlobalValue *GV = GSD->getGlobal();
1492   MVT PtrVT = getPointerTy(DAG.getDataLayout(), GSD->getAddressSpace());
1493 
1494   SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32);
1495   return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, DL, PtrVT, GA);
1496 }
1497 
/// Emit a write of \p V into m0 via the SI_INIT_M0 pseudo, returning the
/// resulting node so callers can chain off it.
SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain, SDLoc DL,
                                   SDValue V) const {
  // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
  // the destination register.
  //
  // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
  // so we will end up with redundant moves to m0.
  //
  // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.

  // A Null SDValue creates a glue result.
  SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
                                  V, Chain);
  return SDValue(M0, 0);
}
1513 
1514 SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
1515                                                  SDValue Op,
1516                                                  MVT VT,
1517                                                  unsigned Offset) const {
1518   SDLoc SL(Op);
1519   SDValue Param = LowerParameter(DAG, MVT::i32, MVT::i32, SL,
1520                                  DAG.getEntryNode(), Offset, false);
1521   // The local size values will have the hi 16-bits as zero.
1522   return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
1523                      DAG.getValueType(VT));
1524 }
1525 
1526 static SDValue emitNonHSAIntrinsicError(SelectionDAG& DAG, EVT VT) {
1527   DiagnosticInfoUnsupported BadIntrin(*DAG.getMachineFunction().getFunction(),
1528                                       "non-hsa intrinsic with hsa target");
1529   DAG.getContext()->diagnose(BadIntrin);
1530   return DAG.getUNDEF(VT);
1531 }
1532 
/// Lower a chainless intrinsic call. Handles preloaded ABI values (dispatch
/// ptr, work-group/work-item IDs), r600-style kernel-argument reads, and the
/// AMDGPU math/interpolation intrinsics; anything unrecognized falls through
/// to the generic AMDGPU lowering.
SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                                                  SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  auto MFI = MF.getInfo<SIMachineFunctionInfo>();
  const SIRegisterInfo *TRI =
      static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());

  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  // Operand 0 of INTRINSIC_WO_CHAIN is the intrinsic ID.
  unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();

  // TODO: Should this propagate fast-math-flags?

  switch (IntrinsicID) {
  case Intrinsic::amdgcn_dispatch_ptr:
  case Intrinsic::amdgcn_queue_ptr: {
    // These pointers only exist under the HSA ABI; on other targets emit a
    // diagnostic and produce UNDEF instead of crashing.
    if (!Subtarget->isAmdHsaOS()) {
      DiagnosticInfoUnsupported BadIntrin(
          *MF.getFunction(), "unsupported hsa intrinsic without hsa target",
          DL.getDebugLoc());
      DAG.getContext()->diagnose(BadIntrin);
      return DAG.getUNDEF(VT);
    }

    auto Reg = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
      SIRegisterInfo::DISPATCH_PTR : SIRegisterInfo::QUEUE_PTR;
    return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass,
                                TRI->getPreloadedValue(MF, Reg), VT);
  }
  case Intrinsic::amdgcn_kernarg_segment_ptr: {
    // 64-bit pointer preloaded into an SGPR pair.
    unsigned Reg
      = TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
    return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT);
  }
  case Intrinsic::amdgcn_rcp:
    return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
  case Intrinsic::amdgcn_rsq:
  case AMDGPUIntrinsic::AMDGPU_rsq: // Legacy name
    return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
  case Intrinsic::amdgcn_rsq_clamp:
  case AMDGPUIntrinsic::AMDGPU_rsq_clamped: { // Legacy name
    if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));

    // On VI and newer there is no direct RSQ_CLAMP lowering here, so emulate
    // it: clamp rsq(x) into [-largest-finite, +largest-finite].
    Type *Type = VT.getTypeForEVT(*DAG.getContext());
    APFloat Max = APFloat::getLargest(Type->getFltSemantics());
    APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);

    SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
    SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq,
                              DAG.getConstantFP(Max, DL, VT));
    return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
                       DAG.getConstantFP(Min, DL, VT));
  }
  // The r600_read_* intrinsics read implicit kernel arguments at fixed
  // offsets; they are only meaningful on non-HSA targets.
  case Intrinsic::r600_read_ngroups_x:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, VT);

    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                          SI::KernelInputOffsets::NGROUPS_X, false);
  case Intrinsic::r600_read_ngroups_y:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, VT);

    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                          SI::KernelInputOffsets::NGROUPS_Y, false);
  case Intrinsic::r600_read_ngroups_z:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, VT);

    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                          SI::KernelInputOffsets::NGROUPS_Z, false);
  case Intrinsic::r600_read_global_size_x:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, VT);

    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                          SI::KernelInputOffsets::GLOBAL_SIZE_X, false);
  case Intrinsic::r600_read_global_size_y:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, VT);

    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                          SI::KernelInputOffsets::GLOBAL_SIZE_Y, false);
  case Intrinsic::r600_read_global_size_z:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, VT);

    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
                          SI::KernelInputOffsets::GLOBAL_SIZE_Z, false);
  // Local sizes fit in 16 bits, so load them with an AssertZext from i16
  // (see lowerImplicitZextParam).
  case Intrinsic::r600_read_local_size_x:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, VT);

    return lowerImplicitZextParam(DAG, Op, MVT::i16,
                                  SI::KernelInputOffsets::LOCAL_SIZE_X);
  case Intrinsic::r600_read_local_size_y:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, VT);

    return lowerImplicitZextParam(DAG, Op, MVT::i16,
                                  SI::KernelInputOffsets::LOCAL_SIZE_Y);
  case Intrinsic::r600_read_local_size_z:
    if (Subtarget->isAmdHsaOS())
      return emitNonHSAIntrinsicError(DAG, VT);

    return lowerImplicitZextParam(DAG, Op, MVT::i16,
                                  SI::KernelInputOffsets::LOCAL_SIZE_Z);
  case Intrinsic::amdgcn_read_workdim:
  case AMDGPUIntrinsic::AMDGPU_read_workdim: // Legacy name.
    // Really only 2 bits.
    return lowerImplicitZextParam(DAG, Op, MVT::i8,
                                  getImplicitParameterOffset(MFI, GRID_DIM));
  // Work-group IDs are preloaded into SGPRs; work-item IDs into VGPRs.
  case Intrinsic::amdgcn_workgroup_id_x:
  case Intrinsic::r600_read_tgid_x:
    return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
      TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_X), VT);
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::r600_read_tgid_y:
    return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
      TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Y), VT);
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::r600_read_tgid_z:
    return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
      TRI->getPreloadedValue(MF, SIRegisterInfo::WORKGROUP_ID_Z), VT);
  case Intrinsic::amdgcn_workitem_id_x:
  case Intrinsic::r600_read_tidig_x:
    return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
      TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X), VT);
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::r600_read_tidig_y:
    return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
      TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y), VT);
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::r600_read_tidig_z:
    return CreateLiveInRegister(DAG, &AMDGPU::VGPR_32RegClass,
      TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z), VT);
  case AMDGPUIntrinsic::SI_load_const: {
    SDValue Ops[] = {
      Op.getOperand(1), // Resource
      Op.getOperand(2)  // Offset
    };

    // Constant-buffer loads are invariant; assume 4-byte alignment.
    MachineMemOperand *MMO = MF.getMachineMemOperand(
      MachinePointerInfo(),
      MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant,
      VT.getStoreSize(), 4);
    return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
                                   Op->getVTList(), Ops, VT, MMO);
  }
  case AMDGPUIntrinsic::SI_vs_load_input:
    return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT,
                       Op.getOperand(1),
                       Op.getOperand(2),
                       Op.getOperand(3));

  case AMDGPUIntrinsic::SI_fs_constant: {
    // Interpolation reads its parameter selector from m0.
    SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3));
    SDValue Glue = M0.getValue(1);
    return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32,
                       DAG.getConstant(2, DL, MVT::i32), // P0
                       Op.getOperand(1), Op.getOperand(2), Glue);
  }
  case AMDGPUIntrinsic::SI_packf16:
    // Fold to UNDEF only if both inputs are undef; otherwise leave the node
    // for normal selection.
    if (Op.getOperand(1).isUndef() && Op.getOperand(2).isUndef())
      return DAG.getUNDEF(MVT::i32);
    return Op;
  case AMDGPUIntrinsic::SI_fs_interp: {
    // Operand 4 is a <2 x i32> vector holding the I/J barycentrics; the two
    // interpolation stages (P1, P2) consume them separately.
    SDValue IJ = Op.getOperand(4);
    SDValue I = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ,
                            DAG.getConstant(0, DL, MVT::i32));
    SDValue J = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ,
                            DAG.getConstant(1, DL, MVT::i32));
    SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3));
    SDValue Glue = M0.getValue(1);
    SDValue P1 = DAG.getNode(AMDGPUISD::INTERP_P1, DL,
                             DAG.getVTList(MVT::f32, MVT::Glue),
                             I, Op.getOperand(1), Op.getOperand(2), Glue);
    Glue = SDValue(P1.getNode(), 1);
    return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, P1, J,
                             Op.getOperand(1), Op.getOperand(2), Glue);
  }
  case Intrinsic::amdgcn_interp_p1: {
    SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
    SDValue Glue = M0.getValue(1);
    return DAG.getNode(AMDGPUISD::INTERP_P1, DL, MVT::f32, Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3), Glue);
  }
  case Intrinsic::amdgcn_interp_p2: {
    SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5));
    SDValue Glue = SDValue(M0.getNode(), 1);
    return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
                       Glue);
  }
  case Intrinsic::amdgcn_sin:
    return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));

  case Intrinsic::amdgcn_cos:
    return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));

  case Intrinsic::amdgcn_log_clamp: {
    // Pre-VI subtargets select this directly; on newer subtargets emit a
    // diagnostic instead.
    if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return SDValue();

    DiagnosticInfoUnsupported BadIntrin(
      *MF.getFunction(), "intrinsic not supported on subtarget",
      DL.getDebugLoc());
      DAG.getContext()->diagnose(BadIntrin);
      return DAG.getUNDEF(VT);
  }
  case Intrinsic::amdgcn_ldexp:
    return DAG.getNode(AMDGPUISD::LDEXP, DL, VT,
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::amdgcn_class:
    return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::amdgcn_div_fmas:
    return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
                       Op.getOperand(4));

  case Intrinsic::amdgcn_div_fixup:
    return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));

  case Intrinsic::amdgcn_trig_preop:
    return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT,
                       Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::amdgcn_div_scale: {
    // 3rd parameter required to be a constant.
    const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3));
    if (!Param)
      return DAG.getUNDEF(VT);

    // Translate to the operands expected by the machine instruction. The
    // first parameter must be the same as the first instruction.
    SDValue Numerator = Op.getOperand(1);
    SDValue Denominator = Op.getOperand(2);

    // Note this order is opposite of the machine instruction's operations,
    // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
    // intrinsic has the numerator as the first operand to match a normal
    // division operation.

    SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator;

    return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
                       Denominator, Numerator);
  }
  case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte0:
    return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Op.getOperand(1));
  case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte1:
    return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE1, DL, VT, Op.getOperand(1));
  case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte2:
    return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE2, DL, VT, Op.getOperand(1));
  case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte3:
    return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE3, DL, VT, Op.getOperand(1));
  default:
    return AMDGPUTargetLowering::LowerOperation(Op, DAG);
  }
}
1795 
1796 SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
1797                                                  SelectionDAG &DAG) const {
1798   unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
1799   switch (IntrID) {
1800   case Intrinsic::amdgcn_atomic_inc:
1801   case Intrinsic::amdgcn_atomic_dec: {
1802     MemSDNode *M = cast<MemSDNode>(Op);
1803     unsigned Opc = (IntrID == Intrinsic::amdgcn_atomic_inc) ?
1804       AMDGPUISD::ATOMIC_INC : AMDGPUISD::ATOMIC_DEC;
1805     SDValue Ops[] = {
1806       M->getOperand(0), // Chain
1807       M->getOperand(2), // Ptr
1808       M->getOperand(3)  // Value
1809     };
1810 
1811     return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops,
1812                                    M->getMemoryVT(), M->getMemOperand());
1813   }
1814   default:
1815     return SDValue();
1816   }
1817 }
1818 
1819 SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
1820                                               SelectionDAG &DAG) const {
1821   MachineFunction &MF = DAG.getMachineFunction();
1822   SDLoc DL(Op);
1823   SDValue Chain = Op.getOperand(0);
1824   unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
1825 
1826   switch (IntrinsicID) {
1827   case AMDGPUIntrinsic::SI_sendmsg: {
1828     Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3));
1829     SDValue Glue = Chain.getValue(1);
1830     return DAG.getNode(AMDGPUISD::SENDMSG, DL, MVT::Other, Chain,
1831                        Op.getOperand(2), Glue);
1832   }
1833   case AMDGPUIntrinsic::SI_tbuffer_store: {
1834     SDValue Ops[] = {
1835       Chain,
1836       Op.getOperand(2),
1837       Op.getOperand(3),
1838       Op.getOperand(4),
1839       Op.getOperand(5),
1840       Op.getOperand(6),
1841       Op.getOperand(7),
1842       Op.getOperand(8),
1843       Op.getOperand(9),
1844       Op.getOperand(10),
1845       Op.getOperand(11),
1846       Op.getOperand(12),
1847       Op.getOperand(13),
1848       Op.getOperand(14)
1849     };
1850 
1851     EVT VT = Op.getOperand(3).getValueType();
1852 
1853     MachineMemOperand *MMO = MF.getMachineMemOperand(
1854       MachinePointerInfo(),
1855       MachineMemOperand::MOStore,
1856       VT.getStoreSize(), 4);
1857     return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL,
1858                                    Op->getVTList(), Ops, VT, MMO);
1859   }
1860   default:
1861     return SDValue();
1862   }
1863 }
1864 
// Custom load lowering: widen i1 loads to i8 extloads, and split or
// scalarize wide vector loads depending on the address space and the
// subtarget's private-element-size limit. Returning SDValue() keeps the
// original load for normal selection.
SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  LoadSDNode *Load = cast<LoadSDNode>(Op);
  ISD::LoadExtType ExtType = Load->getExtensionType();
  EVT MemVT = Load->getMemoryVT();

  if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
    assert(MemVT == MVT::i1 && "Only i1 non-extloads expected");
    // FIXME: Copied from PPC
    // First, load into 32 bits, then truncate to 1 bit.

    SDValue Chain = Load->getChain();
    SDValue BasePtr = Load->getBasePtr();
    MachineMemOperand *MMO = Load->getMemOperand();

    SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
                                   BasePtr, MVT::i8, MMO);

    // Merge the truncated value with the replacement load's output chain.
    SDValue Ops[] = {
      DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
      NewLD.getValue(1)
    };

    return DAG.getMergeValues(Ops, DL);
  }

  if (!MemVT.isVector())
    return SDValue();

  assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
         "Custom lowering for non-i32 vectors hasn't been implemented.");
  unsigned NumElements = MemVT.getVectorNumElements();
  assert(NumElements != 2 && "v2 loads are supported for all address spaces.");

  switch (Load->getAddressSpace()) {
  case AMDGPUAS::CONSTANT_ADDRESS:
    if (isMemOpUniform(Load))
      return SDValue();
    // Non-uniform loads will be selected to MUBUF instructions, so they
    // have the same legalization requirements as global and private
    // loads.
    //
    // Fall-through
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::FLAT_ADDRESS:
    if (NumElements > 4)
      return SplitVectorLoad(Op, DAG);
    // v4 loads are supported for private and global memory.
    return SDValue();
  case AMDGPUAS::PRIVATE_ADDRESS: {
    // Depending on the setting of the private_element_size field in the
    // resource descriptor, we can only make private accesses up to a certain
    // size.
    switch (Subtarget->getMaxPrivateElementSize()) {
    case 4:
      return scalarizeVectorLoad(Load, DAG);
    case 8:
      if (NumElements > 2)
        return SplitVectorLoad(Op, DAG);
      return SDValue();
    case 16:
      // Same as global/flat
      if (NumElements > 4)
        return SplitVectorLoad(Op, DAG);
      return SDValue();
    default:
      llvm_unreachable("unsupported private_element_size");
    }
  }
  case AMDGPUAS::LOCAL_ADDRESS:
    // If properly aligned, if we split we might be able to use ds_read_b64.
    return SplitVectorLoad(Op, DAG);
  default:
    return SDValue();
  }
}
1941 
1942 SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
1943   if (Op.getValueType() != MVT::i64)
1944     return SDValue();
1945 
1946   SDLoc DL(Op);
1947   SDValue Cond = Op.getOperand(0);
1948 
1949   SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
1950   SDValue One = DAG.getConstant(1, DL, MVT::i32);
1951 
1952   SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
1953   SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
1954 
1955   SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
1956   SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
1957 
1958   SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
1959 
1960   SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
1961   SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
1962 
1963   SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
1964 
1965   SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
1966   return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res);
1967 }
1968 
1969 // Catch division cases where we can use shortcuts with rcp and rsq
1970 // instructions.
1971 SDValue SITargetLowering::LowerFastFDIV(SDValue Op, SelectionDAG &DAG) const {
1972   SDLoc SL(Op);
1973   SDValue LHS = Op.getOperand(0);
1974   SDValue RHS = Op.getOperand(1);
1975   EVT VT = Op.getValueType();
1976   bool Unsafe = DAG.getTarget().Options.UnsafeFPMath;
1977 
1978   if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
1979     if ((Unsafe || (VT == MVT::f32 && !Subtarget->hasFP32Denormals())) &&
1980         CLHS->isExactlyValue(1.0)) {
1981       // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
1982       // the CI documentation has a worst case error of 1 ulp.
1983       // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
1984       // use it as long as we aren't trying to use denormals.
1985 
1986       // 1.0 / sqrt(x) -> rsq(x)
1987       //
1988       // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
1989       // error seems really high at 2^29 ULP.
1990       if (RHS.getOpcode() == ISD::FSQRT)
1991         return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0));
1992 
1993       // 1.0 / x -> rcp(x)
1994       return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
1995     }
1996   }
1997 
1998   if (Unsafe) {
1999     // Turn into multiply by the reciprocal.
2000     // x / y -> x * (1.0 / y)
2001     SDNodeFlags Flags;
2002     Flags.setUnsafeAlgebra(true);
2003     SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
2004     return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, &Flags);
2005   }
2006 
2007   return SDValue();
2008 }
2009 
2010 SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
2011   if (SDValue FastLowered = LowerFastFDIV(Op, DAG))
2012     return FastLowered;
2013 
2014   // This uses v_rcp_f32 which does not handle denormals. Let this hit a
2015   // selection error for now rather than do something incorrect.
2016   if (Subtarget->hasFP32Denormals())
2017     return SDValue();
2018 
2019   SDLoc SL(Op);
2020   SDValue LHS = Op.getOperand(0);
2021   SDValue RHS = Op.getOperand(1);
2022 
2023   SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);
2024 
2025   const APFloat K0Val(BitsToFloat(0x6f800000));
2026   const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
2027 
2028   const APFloat K1Val(BitsToFloat(0x2f800000));
2029   const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
2030 
2031   const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
2032 
2033   EVT SetCCVT =
2034       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
2035 
2036   SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
2037 
2038   SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
2039 
2040   // TODO: Should this propagate fast-math-flags?
2041 
2042   r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
2043 
2044   SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
2045 
2046   SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);
2047 
2048   return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
2049 }
2050 
// Full-precision f64 division: div_scale both operands, refine the
// reciprocal with Newton-Raphson FMA steps, then finish with div_fmas and
// div_fixup.
SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
  if (DAG.getTarget().Options.UnsafeFPMath)
    return LowerFastFDIV(Op, DAG);

  SDLoc SL(Op);
  SDValue X = Op.getOperand(0); // Numerator
  SDValue Y = Op.getOperand(1); // Denominator

  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);

  // DIV_SCALE produces the scaled value plus an i1 "scale applied" flag.
  SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);

  SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);

  SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);

  SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);

  // Two Newton-Raphson iterations refining the reciprocal of the scaled
  // denominator: e = 1 - d*r; r = r + r*e (done twice via FMA).
  SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);

  SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);

  SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);

  SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);

  SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
  SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);

  SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64,
                             NegDivScale0, Mul, DivScale1);

  SDValue Scale;

  if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    // Workaround a hardware bug on SI where the condition output from div_scale
    // is not usable.

    const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);

    // Figure out the scale to use for div_fmas by comparing the high halves
    // of the inputs against the high halves of the div_scale results.
    SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
    SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
    SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
    SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);

    SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
    SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);

    SDValue Scale0Hi
      = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
    SDValue Scale1Hi
      = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);

    SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
    SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
    Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
  } else {
    Scale = DivScale1.getValue(1);
  }

  SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64,
                             Fma4, Fma3, Mul, Scale);

  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
}
2117 
2118 SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
2119   EVT VT = Op.getValueType();
2120 
2121   if (VT == MVT::f32)
2122     return LowerFDIV32(Op, DAG);
2123 
2124   if (VT == MVT::f64)
2125     return LowerFDIV64(Op, DAG);
2126 
2127   llvm_unreachable("Unexpected type for fdiv");
2128 }
2129 
2130 SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
2131   SDLoc DL(Op);
2132   StoreSDNode *Store = cast<StoreSDNode>(Op);
2133   EVT VT = Store->getMemoryVT();
2134 
2135   if (VT == MVT::i1) {
2136     return DAG.getTruncStore(Store->getChain(), DL,
2137        DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
2138        Store->getBasePtr(), MVT::i1, Store->getMemOperand());
2139   }
2140 
2141   assert(Store->getValue().getValueType().getScalarType() == MVT::i32);
2142 
2143   unsigned NumElements = VT.getVectorNumElements();
2144   switch (Store->getAddressSpace()) {
2145   case AMDGPUAS::GLOBAL_ADDRESS:
2146   case AMDGPUAS::FLAT_ADDRESS:
2147     if (NumElements > 4)
2148       return SplitVectorStore(Op, DAG);
2149     return SDValue();
2150   case AMDGPUAS::PRIVATE_ADDRESS: {
2151     switch (Subtarget->getMaxPrivateElementSize()) {
2152     case 4:
2153       return scalarizeVectorStore(Store, DAG);
2154     case 8:
2155       if (NumElements > 2)
2156         return SplitVectorStore(Op, DAG);
2157       return SDValue();
2158     case 16:
2159       if (NumElements > 4)
2160         return SplitVectorStore(Op, DAG);
2161       return SDValue();
2162     default:
2163       llvm_unreachable("unsupported private_element_size");
2164     }
2165   }
2166   case AMDGPUAS::LOCAL_ADDRESS:
2167     // If properly aligned, if we split we might be able to use ds_write_b64.
2168     return SplitVectorStore(Op, DAG);
2169   default:
2170     llvm_unreachable("unhandled address space");
2171   }
2172 }
2173 
2174 SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
2175   SDLoc DL(Op);
2176   EVT VT = Op.getValueType();
2177   SDValue Arg = Op.getOperand(0);
2178   // TODO: Should this propagate fast-math-flags?
2179   SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT,
2180                                   DAG.getNode(ISD::FMUL, DL, VT, Arg,
2181                                               DAG.getConstantFP(0.5/M_PI, DL,
2182                                                                 VT)));
2183 
2184   switch (Op.getOpcode()) {
2185   case ISD::FCOS:
2186     return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, FractPart);
2187   case ISD::FSIN:
2188     return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, FractPart);
2189   default:
2190     llvm_unreachable("Wrong trig opcode");
2191   }
2192 }
2193 
2194 SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
2195   AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
2196   assert(AtomicNode->isCompareAndSwap());
2197   unsigned AS = AtomicNode->getAddressSpace();
2198 
2199   // No custom lowering required for local address space
2200   if (!isFlatGlobalAddrSpace(AS))
2201     return Op;
2202 
2203   // Non-local address space requires custom lowering for atomic compare
2204   // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
2205   SDLoc DL(Op);
2206   SDValue ChainIn = Op.getOperand(0);
2207   SDValue Addr = Op.getOperand(1);
2208   SDValue Old = Op.getOperand(2);
2209   SDValue New = Op.getOperand(3);
2210   EVT VT = Op.getValueType();
2211   MVT SimpleVT = VT.getSimpleVT();
2212   MVT VecType = MVT::getVectorVT(SimpleVT, 2);
2213 
2214   SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
2215   SDValue Ops[] = { ChainIn, Addr, NewOld };
2216   SDVTList VTList = DAG.getVTList(VT, MVT::Other);
2217   return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL,
2218                                  VTList, Ops, VT, AtomicNode->getMemOperand());
2219 }
2220 
2221 //===----------------------------------------------------------------------===//
2222 // Custom DAG optimizations
2223 //===----------------------------------------------------------------------===//
2224 
// Combine unsigned-byte -> f32 conversions into CVT_F32_UBYTE* nodes, and
// rewrite a vNi8 extload feeding such a conversion as a single zext integer
// load to avoid expanding through vNi32.
SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
                                                     DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);
  EVT ScalarVT = VT.getScalarType();
  if (ScalarVT != MVT::f32)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  SDValue Src = N->getOperand(0);
  EVT SrcVT = Src.getValueType();

  // TODO: We could try to match extracting the higher bytes, which would be
  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
  // about in practice.
  if (DCI.isAfterLegalizeVectorOps() && SrcVT == MVT::i32) {
    // Only the low byte may be nonzero, so converting byte 0 is enough.
    if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
      SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src);
      DCI.AddToWorklist(Cvt.getNode());
      return Cvt;
    }
  }

  // We are primarily trying to catch operations on illegal vector types
  // before they are expanded.
  // For scalars, we can use the more flexible method of checking masked bits
  // after legalization.
  if (!DCI.isBeforeLegalize() ||
      !SrcVT.isVector() ||
      SrcVT.getVectorElementType() != MVT::i8) {
    return SDValue();
  }

  assert(DCI.isBeforeLegalize() && "Unexpected legal type");

  // Weird sized vectors are a pain to handle, but we know 3 is really the same
  // size as 4.
  unsigned NElts = SrcVT.getVectorNumElements();
  if (!SrcVT.isSimple() && NElts != 3)
    return SDValue();

  // Handle v4i8 -> v4f32 extload. Replace the v4i8 with a legal i32 load to
  // prevent a mess from expanding to v4i32 and repacking.
  if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
    EVT LoadVT = getEquivalentMemType(*DAG.getContext(), SrcVT);
    EVT RegVT = getEquivalentLoadRegType(*DAG.getContext(), SrcVT);
    EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32, NElts);
    LoadSDNode *Load = cast<LoadSDNode>(Src);

    unsigned AS = Load->getAddressSpace();
    unsigned Align = Load->getAlignment();
    Type *Ty = LoadVT.getTypeForEVT(*DAG.getContext());
    unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(Ty);

    // Don't try to replace the load if we have to expand it due to alignment
    // problems. Otherwise we will end up scalarizing the load, and trying to
    // repack into the vector for no real reason.
    if (Align < ABIAlignment &&
        !allowsMisalignedMemoryAccesses(LoadVT, AS, Align, nullptr)) {
      return SDValue();
    }

    SDValue NewLoad = DAG.getExtLoad(ISD::ZEXTLOAD, DL, RegVT,
                                     Load->getChain(),
                                     Load->getBasePtr(),
                                     LoadVT,
                                     Load->getMemOperand());

    // Make sure successors of the original load stay after it by updating
    // them to use the new Chain.
    DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), NewLoad.getValue(1));

    SmallVector<SDValue, 4> Elts;
    if (RegVT.isVector())
      DAG.ExtractVectorElements(NewLoad, Elts);
    else
      Elts.push_back(NewLoad);

    SmallVector<SDValue, 4> Ops;

    // Convert each byte of every loaded element with the matching
    // CVT_F32_UBYTE0..3 node.
    unsigned EltIdx = 0;
    for (SDValue Elt : Elts) {
      unsigned ComponentsInElt = std::min(4u, NElts - 4 * EltIdx);
      for (unsigned I = 0; I < ComponentsInElt; ++I) {
        unsigned Opc = AMDGPUISD::CVT_F32_UBYTE0 + I;
        SDValue Cvt = DAG.getNode(Opc, DL, MVT::f32, Elt);
        DCI.AddToWorklist(Cvt.getNode());
        Ops.push_back(Cvt);
      }

      ++EltIdx;
    }

    assert(Ops.size() == NElts);

    return DAG.getBuildVector(FloatVT, DL, Ops);
  }

  return SDValue();
}
2327 
2328 /// \brief Return true if the given offset Size in bytes can be folded into
2329 /// the immediate offsets of a memory instruction for the given address space.
2330 static bool canFoldOffset(unsigned OffsetSize, unsigned AS,
2331                           const AMDGPUSubtarget &STI) {
2332   switch (AS) {
2333   case AMDGPUAS::GLOBAL_ADDRESS: {
2334     // MUBUF instructions a 12-bit offset in bytes.
2335     return isUInt<12>(OffsetSize);
2336   }
2337   case AMDGPUAS::CONSTANT_ADDRESS: {
2338     // SMRD instructions have an 8-bit offset in dwords on SI and
2339     // a 20-bit offset in bytes on VI.
2340     if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
2341       return isUInt<20>(OffsetSize);
2342     else
2343       return (OffsetSize % 4 == 0) && isUInt<8>(OffsetSize / 4);
2344   }
2345   case AMDGPUAS::LOCAL_ADDRESS:
2346   case AMDGPUAS::REGION_ADDRESS: {
2347     // The single offset versions have a 16-bit offset in bytes.
2348     return isUInt<16>(OffsetSize);
2349   }
2350   case AMDGPUAS::PRIVATE_ADDRESS:
2351   // Indirect register addressing does not use any offsets.
2352   default:
2353     return 0;
2354   }
2355 }
2356 
2357 // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
2358 
2359 // This is a variant of
2360 // (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
2361 //
2362 // The normal DAG combiner will do this, but only if the add has one use since
2363 // that would increase the number of instructions.
2364 //
2365 // This prevents us from seeing a constant offset that can be folded into a
2366 // memory instruction's addressing mode. If we know the resulting add offset of
2367 // a pointer can be folded into an addressing offset, we can replace the pointer
2368 // operand with the add of new constant offset. This eliminates one of the uses,
2369 // and may allow the remaining use to also be simplified.
2370 //
2371 SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
2372                                                unsigned AddrSpace,
2373                                                DAGCombinerInfo &DCI) const {
2374   SDValue N0 = N->getOperand(0);
2375   SDValue N1 = N->getOperand(1);
2376 
2377   if (N0.getOpcode() != ISD::ADD)
2378     return SDValue();
2379 
2380   const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
2381   if (!CN1)
2382     return SDValue();
2383 
2384   const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
2385   if (!CAdd)
2386     return SDValue();
2387 
2388   // If the resulting offset is too large, we can't fold it into the addressing
2389   // mode offset.
2390   APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
2391   if (!canFoldOffset(Offset.getZExtValue(), AddrSpace, *Subtarget))
2392     return SDValue();
2393 
2394   SelectionDAG &DAG = DCI.DAG;
2395   SDLoc SL(N);
2396   EVT VT = N->getValueType(0);
2397 
2398   SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
2399   SDValue COffset = DAG.getConstant(Offset, SL, MVT::i32);
2400 
2401   return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset);
2402 }
2403 
SDValue SITargetLowering::performAndCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  if (DCI.isBeforeLegalize())
    return SDValue();

  // Give the generic AMDGPU and-combines a chance first.
  if (SDValue Base = AMDGPUTargetLowering::performAndCombine(N, DCI))
    return Base;

  SelectionDAG &DAG = DCI.DAG;

  // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
  // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  if (LHS.getOpcode() == ISD::SETCC &&
      RHS.getOpcode() == ISD::SETCC) {
    ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
    ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();

    // The RHS compare must be against fabs of the same value the LHS
    // compares.
    SDValue X = LHS.getOperand(0);
    SDValue Y = RHS.getOperand(0);
    if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X)
      return SDValue();

    if (LCC == ISD::SETO) {
      // (fcmp ord x, x) is the not-NaN test; both operands must be x.
      if (X != LHS.getOperand(1))
        return SDValue();

      if (RCC == ISD::SETUNE) {
        // Require an unordered-not-equal compare against +infinity.
        const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
        if (!C1 || !C1->isInfinity() || C1->isNegative())
          return SDValue();

        // Test mask covering every class except NaNs and infinities.
        const uint32_t Mask = SIInstrFlags::N_NORMAL |
                              SIInstrFlags::N_SUBNORMAL |
                              SIInstrFlags::N_ZERO |
                              SIInstrFlags::P_ZERO |
                              SIInstrFlags::P_SUBNORMAL |
                              SIInstrFlags::P_NORMAL;

        static_assert(((~(SIInstrFlags::S_NAN |
                          SIInstrFlags::Q_NAN |
                          SIInstrFlags::N_INFINITY |
                          SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask,
                      "mask not equal");

        SDLoc DL(N);
        return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
                           X, DAG.getConstant(Mask, DL, MVT::i32));
      }
    }
  }

  return SDValue();
}
2460 
SDValue SITargetLowering::performOrCombine(SDNode *N,
                                           DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  EVT VT = N->getValueType(0);
  if (VT == MVT::i64) {
    // TODO: This could be a generic combine with a predicate for extracting the
    // high half of an integer being free.

    // (or i64:x, (zero_extend i32:y)) ->
    //   i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))

    // Canonicalize so that a zero_extend operand, if any, ends up on the RHS.
    if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
        RHS.getOpcode() != ISD::ZERO_EXTEND)
      std::swap(LHS, RHS);

    if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
      SDValue ExtSrc = RHS.getOperand(0);
      EVT SrcVT = ExtSrc.getValueType();
      if (SrcVT == MVT::i32) {
        SDLoc SL(N);
        // The zero-extended operand only affects the low 32 bits; the high
        // half of x passes through unchanged.
        SDValue LowLHS, HiBits;
        std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG);
        SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);

        DCI.AddToWorklist(LowOr.getNode());
        DCI.AddToWorklist(HiBits.getNode());

        SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
                                  LowOr, HiBits);
        return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
      }
    }
  }

  // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
  if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
      RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
    // Both classifications must test the same source value.
    SDValue Src = LHS.getOperand(0);
    if (Src != RHS.getOperand(0))
      return SDValue();

    const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
    const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
    if (!CLHS || !CRHS)
      return SDValue();

    // Only 10 bits are used.
    static const uint32_t MaxMask = 0x3ff;

    uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
    SDLoc DL(N);
    return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
                       Src, DAG.getConstant(NewMask, DL, MVT::i32));
  }

  return SDValue();
}
2520 
2521 SDValue SITargetLowering::performClassCombine(SDNode *N,
2522                                               DAGCombinerInfo &DCI) const {
2523   SelectionDAG &DAG = DCI.DAG;
2524   SDValue Mask = N->getOperand(1);
2525 
2526   // fp_class x, 0 -> false
2527   if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) {
2528     if (CMask->isNullValue())
2529       return DAG.getConstant(0, SDLoc(N), MVT::i1);
2530   }
2531 
2532   return SDValue();
2533 }
2534 
2535 // Constant fold canonicalize.
2536 SDValue SITargetLowering::performFCanonicalizeCombine(
2537   SDNode *N,
2538   DAGCombinerInfo &DCI) const {
2539   ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
2540   if (!CFP)
2541     return SDValue();
2542 
2543   SelectionDAG &DAG = DCI.DAG;
2544   const APFloat &C = CFP->getValueAPF();
2545 
2546   // Flush denormals to 0 if not enabled.
2547   if (C.isDenormal()) {
2548     EVT VT = N->getValueType(0);
2549     if (VT == MVT::f32 && !Subtarget->hasFP32Denormals())
2550       return DAG.getConstantFP(0.0, SDLoc(N), VT);
2551 
2552     if (VT == MVT::f64 && !Subtarget->hasFP64Denormals())
2553       return DAG.getConstantFP(0.0, SDLoc(N), VT);
2554   }
2555 
2556   if (C.isNaN()) {
2557     EVT VT = N->getValueType(0);
2558     APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
2559     if (C.isSignaling()) {
2560       // Quiet a signaling NaN.
2561       return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT);
2562     }
2563 
2564     // Make sure it is the canonical NaN bitpattern.
2565     //
2566     // TODO: Can we use -1 as the canonical NaN value since it's an inline
2567     // immediate?
2568     if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
2569       return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT);
2570   }
2571 
2572   return SDValue(CFP, 0);
2573 }
2574 
2575 static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
2576   switch (Opc) {
2577   case ISD::FMAXNUM:
2578     return AMDGPUISD::FMAX3;
2579   case ISD::SMAX:
2580     return AMDGPUISD::SMAX3;
2581   case ISD::UMAX:
2582     return AMDGPUISD::UMAX3;
2583   case ISD::FMINNUM:
2584     return AMDGPUISD::FMIN3;
2585   case ISD::SMIN:
2586     return AMDGPUISD::SMIN3;
2587   case ISD::UMIN:
2588     return AMDGPUISD::UMIN3;
2589   default:
2590     llvm_unreachable("Not a min/max opcode");
2591   }
2592 }
2593 
2594 static SDValue performIntMed3ImmCombine(SelectionDAG &DAG,
2595                                         SDLoc SL,
2596                                         SDValue Op0,
2597                                         SDValue Op1,
2598                                         bool Signed) {
2599   ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1);
2600   if (!K1)
2601     return SDValue();
2602 
2603   ConstantSDNode *K0 = dyn_cast<ConstantSDNode>(Op0.getOperand(1));
2604   if (!K0)
2605     return SDValue();
2606 
2607 
2608   if (Signed) {
2609     if (K0->getAPIntValue().sge(K1->getAPIntValue()))
2610       return SDValue();
2611   } else {
2612     if (K0->getAPIntValue().uge(K1->getAPIntValue()))
2613       return SDValue();
2614   }
2615 
2616   EVT VT = K0->getValueType(0);
2617   return DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, VT,
2618                      Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
2619 }
2620 
2621 static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {
2622   if (!DAG.getTargetLoweringInfo().hasFloatingPointExceptions())
2623     return true;
2624 
2625   return DAG.isKnownNeverNaN(Op);
2626 }
2627 
// Try to fold fminnum(fmaxnum(x, K0), K1) with constant FP bounds into a
// single fmed3 node.
static SDValue performFPMed3ImmCombine(SelectionDAG &DAG,
                                       SDLoc SL,
                                       SDValue Op0,
                                       SDValue Op1) {
  // The outer min bound and the inner max bound must both be FP constants.
  ConstantFPSDNode *K1 = dyn_cast<ConstantFPSDNode>(Op1);
  if (!K1)
    return SDValue();

  ConstantFPSDNode *K0 = dyn_cast<ConstantFPSDNode>(Op0.getOperand(1));
  if (!K0)
    return SDValue();

  // Ordered >= (although NaN inputs should have folded away by now).
  APFloat::cmpResult Cmp = K0->getValueAPF().compare(K1->getValueAPF());
  if (Cmp == APFloat::cmpGreaterThan)
    return SDValue();

  // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
  // signaling NaN gives a quiet NaN. The quiet NaN input to the min would then
  // give the other result, which is different from med3 with a NaN input.
  SDValue Var = Op0.getOperand(0);
  if (!isKnownNeverSNan(DAG, Var))
    return SDValue();

  return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
                     Var, SDValue(K0, 0), SDValue(K1, 0));
}
2655 
// Fold nested min/max pairs into the three-operand min3/max3 nodes, and
// clamp-shaped min-of-max patterns into med3.
SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;

  unsigned Opc = N->getOpcode();
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  // Only do this if the inner op has one use since this will just increases
  // register pressure for no benefit.

  // NOTE(review): the min3/max3 folds are skipped for the legacy min/max
  // opcodes; presumably there are no legacy 3-operand forms — confirm.
  if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY) {
    // max(max(a, b), c) -> max3(a, b, c)
    // min(min(a, b), c) -> min3(a, b, c)
    if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
      SDLoc DL(N);
      return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
                         DL,
                         N->getValueType(0),
                         Op0.getOperand(0),
                         Op0.getOperand(1),
                         Op1);
    }

    // Try commuted.
    // max(a, max(b, c)) -> max3(a, b, c)
    // min(a, min(b, c)) -> min3(a, b, c)
    if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
      SDLoc DL(N);
      return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
                         DL,
                         N->getValueType(0),
                         Op0,
                         Op1.getOperand(0),
                         Op1.getOperand(1));
    }
  }

  // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
  if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
    if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, true))
      return Med3;
  }

  if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
    if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, false))
      return Med3;
  }

  // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
  if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
       (Opc == AMDGPUISD::FMIN_LEGACY &&
        Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
      N->getValueType(0) == MVT::f32 && Op0.hasOneUse()) {
    if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
      return Res;
  }

  return SDValue();
}
2716 
2717 SDValue SITargetLowering::performSetCCCombine(SDNode *N,
2718                                               DAGCombinerInfo &DCI) const {
2719   SelectionDAG &DAG = DCI.DAG;
2720   SDLoc SL(N);
2721 
2722   SDValue LHS = N->getOperand(0);
2723   SDValue RHS = N->getOperand(1);
2724   EVT VT = LHS.getValueType();
2725 
2726   if (VT != MVT::f32 && VT != MVT::f64)
2727     return SDValue();
2728 
2729   // Match isinf pattern
2730   // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
2731   ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
2732   if (CC == ISD::SETOEQ && LHS.getOpcode() == ISD::FABS) {
2733     const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
2734     if (!CRHS)
2735       return SDValue();
2736 
2737     const APFloat &APF = CRHS->getValueAPF();
2738     if (APF.isInfinity() && !APF.isNegative()) {
2739       unsigned Mask = SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
2740       return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
2741                          DAG.getConstant(Mask, SL, MVT::i32));
2742     }
2743   }
2744 
2745   return SDValue();
2746 }
2747 
// Top-level target combine dispatch: route each opcode to its specific
// combine, falling back to the generic AMDGPU combines for everything else.
SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
                                            DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  switch (N->getOpcode()) {
  default:
    return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
  case ISD::SETCC:
    return performSetCCCombine(N, DCI);
  case ISD::FMAXNUM:
  case ISD::FMINNUM:
  case ISD::SMAX:
  case ISD::SMIN:
  case ISD::UMAX:
  case ISD::UMIN:
  case AMDGPUISD::FMIN_LEGACY:
  case AMDGPUISD::FMAX_LEGACY: {
    // min/max folding only runs post-legalization, when optimizing, and not
    // for f64 results.
    if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG &&
        N->getValueType(0) != MVT::f64 &&
        getTargetMachine().getOptLevel() > CodeGenOpt::None)
      return performMinMaxCombine(N, DCI);
    break;
  }

  case AMDGPUISD::CVT_F32_UBYTE0:
  case AMDGPUISD::CVT_F32_UBYTE1:
  case AMDGPUISD::CVT_F32_UBYTE2:
  case AMDGPUISD::CVT_F32_UBYTE3: {
    unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;

    // Only one byte of the source is read; simplify the rest away.
    SDValue Src = N->getOperand(0);
    APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);

    APInt KnownZero, KnownOne;
    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                          !DCI.isBeforeLegalizeOps());
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    if (TLO.ShrinkDemandedConstant(Src, Demanded) ||
        TLI.SimplifyDemandedBits(Src, Demanded, KnownZero, KnownOne, TLO)) {
      DCI.CommitTargetLoweringOpt(TLO);
    }

    break;
  }

  case ISD::UINT_TO_FP: {
    return performUCharToFloatCombine(N, DCI);
  }
  case ISD::FADD: {
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
      break;

    EVT VT = N->getValueType(0);
    if (VT != MVT::f32)
      break;

    // Only do this if we are not trying to support denormals. v_mad_f32 does
    // not support denormals ever.
    if (Subtarget->hasFP32Denormals())
      break;

    SDValue LHS = N->getOperand(0);
    SDValue RHS = N->getOperand(1);

    // These should really be instruction patterns, but writing patterns with
    // source modiifiers is a pain.

    // fadd (fadd (a, a), b) -> mad 2.0, a, b
    if (LHS.getOpcode() == ISD::FADD) {
      SDValue A = LHS.getOperand(0);
      if (A == LHS.getOperand(1)) {
        const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32);
        return DAG.getNode(ISD::FMAD, DL, VT, Two, A, RHS);
      }
    }

    // fadd (b, fadd (a, a)) -> mad 2.0, a, b
    if (RHS.getOpcode() == ISD::FADD) {
      SDValue A = RHS.getOperand(0);
      if (A == RHS.getOperand(1)) {
        const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32);
        return DAG.getNode(ISD::FMAD, DL, VT, Two, A, LHS);
      }
    }

    return SDValue();
  }
  case ISD::FSUB: {
    if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
      break;

    EVT VT = N->getValueType(0);

    // Try to get the fneg to fold into the source modifier. This undoes generic
    // DAG combines and folds them into the mad.
    //
    // Only do this if we are not trying to support denormals. v_mad_f32 does
    // not support denormals ever.
    if (VT == MVT::f32 &&
        !Subtarget->hasFP32Denormals()) {
      SDValue LHS = N->getOperand(0);
      SDValue RHS = N->getOperand(1);
      if (LHS.getOpcode() == ISD::FADD) {
        // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)

        SDValue A = LHS.getOperand(0);
        if (A == LHS.getOperand(1)) {
          const SDValue Two = DAG.getConstantFP(2.0, DL, MVT::f32);
          SDValue NegRHS = DAG.getNode(ISD::FNEG, DL, VT, RHS);

          return DAG.getNode(ISD::FMAD, DL, VT, Two, A, NegRHS);
        }
      }

      if (RHS.getOpcode() == ISD::FADD) {
        // (fsub c, (fadd a, a)) -> mad -2.0, a, c

        SDValue A = RHS.getOperand(0);
        if (A == RHS.getOperand(1)) {
          const SDValue NegTwo = DAG.getConstantFP(-2.0, DL, MVT::f32);
          return DAG.getNode(ISD::FMAD, DL, VT, NegTwo, A, LHS);
        }
      }

      return SDValue();
    }

    break;
  }
  case ISD::LOAD:
  case ISD::STORE:
  case ISD::ATOMIC_LOAD:
  case ISD::ATOMIC_STORE:
  case ISD::ATOMIC_CMP_SWAP:
  case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
  case ISD::ATOMIC_SWAP:
  case ISD::ATOMIC_LOAD_ADD:
  case ISD::ATOMIC_LOAD_SUB:
  case ISD::ATOMIC_LOAD_AND:
  case ISD::ATOMIC_LOAD_OR:
  case ISD::ATOMIC_LOAD_XOR:
  case ISD::ATOMIC_LOAD_NAND:
  case ISD::ATOMIC_LOAD_MIN:
  case ISD::ATOMIC_LOAD_MAX:
  case ISD::ATOMIC_LOAD_UMIN:
  case ISD::ATOMIC_LOAD_UMAX:
  case AMDGPUISD::ATOMIC_INC:
  case AMDGPUISD::ATOMIC_DEC: { // TODO: Target mem intrinsics.
    if (DCI.isBeforeLegalize())
      break;

    MemSDNode *MemNode = cast<MemSDNode>(N);
    SDValue Ptr = MemNode->getBasePtr();

    // TODO: We could also do this for multiplies.
    unsigned AS = MemNode->getAddressSpace();
    if (Ptr.getOpcode() == ISD::SHL && AS != AMDGPUAS::PRIVATE_ADDRESS) {
      SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), AS, DCI);
      if (NewPtr) {
        SmallVector<SDValue, 8> NewOps(MemNode->op_begin(), MemNode->op_end());

        // For stores the pointer is operand 2 (after chain and value); for
        // everything else it is operand 1.
        NewOps[N->getOpcode() == ISD::STORE ? 2 : 1] = NewPtr;
        return SDValue(DAG.UpdateNodeOperands(MemNode, NewOps), 0);
      }
    }
    break;
  }
  case ISD::AND:
    return performAndCombine(N, DCI);
  case ISD::OR:
    return performOrCombine(N, DCI);
  case AMDGPUISD::FP_CLASS:
    return performClassCombine(N, DCI);
  case ISD::FCANONICALIZE:
    return performFCanonicalizeCombine(N, DCI);
  }
  return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
}
2927 
/// \brief Analyze the possible immediate value Op
///
/// Returns -1 if it isn't an immediate, 0 if it's an inline immediate
/// and the immediate value if it's a literal immediate
int32_t SITargetLowering::analyzeImmediate(const SDNode *N) const {

  const SIInstrInfo *TII =
      static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());

  if (const ConstantSDNode *Node = dyn_cast<ConstantSDNode>(N)) {
    // Inline constants cost nothing to encode.
    if (TII->isInlineConstant(Node->getAPIntValue()))
      return 0;

    // Otherwise it must fit in a 32-bit literal.
    uint64_t Val = Node->getZExtValue();
    return isUInt<32>(Val) ? Val : -1;
  }

  if (const ConstantFPSDNode *Node = dyn_cast<ConstantFPSDNode>(N)) {
    if (TII->isInlineConstant(Node->getValueAPF().bitcastToAPInt()))
      return 0;

    // Only an f32 constant can be used as a 32-bit literal.
    if (Node->getValueType(0) == MVT::f32)
      return FloatToBits(Node->getValueAPF().convertToFloat());

    return -1;
  }

  return -1;
}
2957 
2958 /// \brief Helper function for adjustWritemask
2959 static unsigned SubIdx2Lane(unsigned Idx) {
2960   switch (Idx) {
2961   default: return 0;
2962   case AMDGPU::sub0: return 0;
2963   case AMDGPU::sub1: return 1;
2964   case AMDGPU::sub2: return 2;
2965   case AMDGPU::sub3: return 3;
2966   }
2967 }
2968 
/// \brief Adjust the writemask of MIMG instructions
///
/// Shrinks the dmask of an image load to cover only the components whose
/// results are actually extracted by users, and renumbers those users'
/// subregister indices to match the packed result.
void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
                                       SelectionDAG &DAG) const {
  SDNode *Users[4] = { };
  unsigned Lane = 0;
  // NOTE(review): the dmask operand position is inferred from the operand
  // count; presumably this distinguishes the two MIMG operand layouts —
  // confirm against the instruction definitions.
  unsigned DmaskIdx = (Node->getNumOperands() - Node->getNumValues() == 9) ? 2 : 3;
  unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
  unsigned NewDmask = 0;

  // Try to figure out the used register components
  for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
       I != E; ++I) {

    // Abort if we can't understand the usage
    if (!I->isMachineOpcode() ||
        I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
      return;

    // Lane means which subreg of %VGPRa_VGPRb_VGPRc_VGPRd is used.
    // Note that subregs are packed, i.e. Lane==0 is the first bit set
    // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
    // set, etc.
    Lane = SubIdx2Lane(I->getConstantOperandVal(1));

    // Set which texture component corresponds to the lane.
    // Walk the set bits of the old dmask; the (Lane+1)-th set bit is the
    // component this lane reads.
    unsigned Comp;
    for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) {
      assert(Dmask);
      Comp = countTrailingZeros(Dmask);
      Dmask &= ~(1 << Comp);
    }

    // Abort if we have more than one user per component
    if (Users[Lane])
      return;

    Users[Lane] = *I;
    NewDmask |= 1 << Comp;
  }

  // Abort if there's no change
  if (NewDmask == OldDmask)
    return;

  // Adjust the writemask in the node
  std::vector<SDValue> Ops;
  Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
  Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
  Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
  Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops);

  // If we only got one lane, replace it with a copy
  // (if NewDmask has only one bit set...)
  if (NewDmask && (NewDmask & (NewDmask-1)) == 0) {
    SDValue RC = DAG.getTargetConstant(AMDGPU::VGPR_32RegClassID, SDLoc(),
                                       MVT::i32);
    SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
                                      SDLoc(), Users[Lane]->getValueType(0),
                                      SDValue(Node, 0), RC);
    DAG.ReplaceAllUsesWith(Users[Lane], Copy);
    return;
  }

  // Update the users of the node with the new indices
  for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) {

    SDNode *User = Users[i];
    if (!User)
      continue;

    SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
    DAG.UpdateNodeOperands(User, User->getOperand(0), Op);

    // Advance to the next packed subregister only for lanes that had a user.
    switch (Idx) {
    default: break;
    case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
    case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
    case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
    }
  }
}
3050 
3051 static bool isFrameIndexOp(SDValue Op) {
3052   if (Op.getOpcode() == ISD::AssertZext)
3053     Op = Op.getOperand(0);
3054 
3055   return isa<FrameIndexSDNode>(Op);
3056 }
3057 
3058 /// \brief Legalize target independent instructions (e.g. INSERT_SUBREG)
3059 /// with frame index operands.
3060 /// LLVM assumes that inputs are to these instructions are registers.
3061 void SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
3062                                                      SelectionDAG &DAG) const {
3063 
3064   SmallVector<SDValue, 8> Ops;
3065   for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
3066     if (!isFrameIndexOp(Node->getOperand(i))) {
3067       Ops.push_back(Node->getOperand(i));
3068       continue;
3069     }
3070 
3071     SDLoc DL(Node);
3072     Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
3073                                      Node->getOperand(i).getValueType(),
3074                                      Node->getOperand(i)), 0));
3075   }
3076 
3077   DAG.UpdateNodeOperands(Node, Ops);
3078 }
3079 
3080 /// \brief Fold the instructions after selecting them.
3081 SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
3082                                           SelectionDAG &DAG) const {
3083   const SIInstrInfo *TII =
3084       static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
3085   unsigned Opcode = Node->getMachineOpcode();
3086 
3087   if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore())
3088     adjustWritemask(Node, DAG);
3089 
3090   if (Opcode == AMDGPU::INSERT_SUBREG ||
3091       Opcode == AMDGPU::REG_SEQUENCE) {
3092     legalizeTargetIndependentNode(Node, DAG);
3093     return Node;
3094   }
3095   return Node;
3096 }
3097 
/// \brief Assign the register class depending on the number of
/// bits set in the writemask
///
/// Also legalizes VOP3 operands and rewrites atomics whose result is unused
/// into their no-return variants.
void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
                                                     SDNode *Node) const {
  const SIInstrInfo *TII =
      static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());

  MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();

  if (TII->isVOP3(MI->getOpcode())) {
    // Make sure constant bus requirements are respected.
    TII->legalizeOperandsVOP3(MRI, MI);
    return;
  }

  if (TII->isMIMG(*MI)) {
    unsigned VReg = MI->getOperand(0).getReg();
    // NOTE(review): dmask position inferred from operand count; presumably
    // this matches the two MIMG operand layouts — confirm.
    unsigned DmaskIdx = MI->getNumOperands() == 12 ? 3 : 4;
    unsigned Writemask = MI->getOperand(DmaskIdx).getImm();
    // Count the components actually written.
    unsigned BitsSet = 0;
    for (unsigned i = 0; i < 4; ++i)
      BitsSet += Writemask & (1 << i) ? 1 : 0;

    // Pick the smallest register class that can hold the written components;
    // 4 components keep the original class (handled by the default return).
    const TargetRegisterClass *RC;
    switch (BitsSet) {
    default: return;
    case 1:  RC = &AMDGPU::VGPR_32RegClass; break;
    case 2:  RC = &AMDGPU::VReg_64RegClass; break;
    case 3:  RC = &AMDGPU::VReg_96RegClass; break;
    }

    unsigned NewOpcode = TII->getMaskedMIMGOp(MI->getOpcode(), BitsSet);
    MI->setDesc(TII->get(NewOpcode));
    MRI.setRegClass(VReg, RC);
    return;
  }

  // Replace unused atomics with the no return version.
  int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI->getOpcode());
  if (NoRetAtomicOp != -1) {
    if (!Node->hasAnyUseOfValue(0)) {
      MI->setDesc(TII->get(NoRetAtomicOp));
      MI->RemoveOperand(0);
      return;
    }

    // For mubuf_atomic_cmpswap, we need to have tablegen use an extract_subreg
    // instruction, because the return type of these instructions is a vec2 of
    // the memory type, so it can be tied to the input operand.
    // This means these instructions always have a use, so we need to add a
    // special case to check if the atomic has only one extract_subreg use,
    // which itself has no uses.
    if ((Node->hasNUsesOfValue(1, 0) &&
         Node->use_begin()->isMachineOpcode() &&
         Node->use_begin()->getMachineOpcode() == AMDGPU::EXTRACT_SUBREG &&
         !Node->use_begin()->hasAnyUseOfValue(0))) {
      unsigned Def = MI->getOperand(0).getReg();

      // Change this into a noret atomic.
      MI->setDesc(TII->get(NoRetAtomicOp));
      MI->RemoveOperand(0);

      // If we only remove the def operand from the atomic instruction, the
      // extract_subreg will be left with a use of a vreg without a def.
      // So we need to insert an implicit_def to avoid machine verifier
      // errors.
      BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
              TII->get(AMDGPU::IMPLICIT_DEF), Def);
    }
    return;
  }
}
3170 
3171 static SDValue buildSMovImm32(SelectionDAG &DAG, SDLoc DL, uint64_t Val) {
3172   SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
3173   return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
3174 }
3175 
3176 MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
3177                                                 SDLoc DL,
3178                                                 SDValue Ptr) const {
3179   const SIInstrInfo *TII =
3180     static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
3181 
3182   // Build the half of the subregister with the constants before building the
3183   // full 128-bit register. If we are building multiple resource descriptors,
3184   // this will allow CSEing of the 2-component register.
3185   const SDValue Ops0[] = {
3186     DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
3187     buildSMovImm32(DAG, DL, 0),
3188     DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
3189     buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
3190     DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
3191   };
3192 
3193   SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
3194                                                 MVT::v2i32, Ops0), 0);
3195 
3196   // Combine the constants and the pointer.
3197   const SDValue Ops1[] = {
3198     DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
3199     Ptr,
3200     DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
3201     SubRegHi,
3202     DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
3203   };
3204 
3205   return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
3206 }
3207 
3208 /// \brief Return a resource descriptor with the 'Add TID' bit enabled
3209 ///        The TID (Thread ID) is multiplied by the stride value (bits [61:48]
3210 ///        of the resource descriptor) to create an offset, which is added to
3211 ///        the resource pointer.
3212 MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG,
3213                                            SDLoc DL,
3214                                            SDValue Ptr,
3215                                            uint32_t RsrcDword1,
3216                                            uint64_t RsrcDword2And3) const {
3217   SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
3218   SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
3219   if (RsrcDword1) {
3220     PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
3221                                      DAG.getConstant(RsrcDword1, DL, MVT::i32)),
3222                     0);
3223   }
3224 
3225   SDValue DataLo = buildSMovImm32(DAG, DL,
3226                                   RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
3227   SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
3228 
3229   const SDValue Ops[] = {
3230     DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
3231     PtrLo,
3232     DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
3233     PtrHi,
3234     DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
3235     DataLo,
3236     DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
3237     DataHi,
3238     DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)
3239   };
3240 
3241   return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
3242 }
3243 
3244 SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
3245                                                const TargetRegisterClass *RC,
3246                                                unsigned Reg, EVT VT) const {
3247   SDValue VReg = AMDGPUTargetLowering::CreateLiveInRegister(DAG, RC, Reg, VT);
3248 
3249   return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(DAG.getEntryNode()),
3250                             cast<RegisterSDNode>(VReg)->getReg(), VT);
3251 }
3252 
3253 //===----------------------------------------------------------------------===//
3254 //                         SI Inline Assembly Support
3255 //===----------------------------------------------------------------------===//
3256 
3257 std::pair<unsigned, const TargetRegisterClass *>
3258 SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
3259                                                StringRef Constraint,
3260                                                MVT VT) const {
3261 
3262   if (Constraint.size() == 1) {
3263     switch (Constraint[0]) {
3264     case 's':
3265     case 'r':
3266       switch (VT.getSizeInBits()) {
3267       default:
3268         return std::make_pair(0U, nullptr);
3269       case 32:
3270         return std::make_pair(0U, &AMDGPU::SGPR_32RegClass);
3271       case 64:
3272         return std::make_pair(0U, &AMDGPU::SGPR_64RegClass);
3273       case 128:
3274         return std::make_pair(0U, &AMDGPU::SReg_128RegClass);
3275       case 256:
3276         return std::make_pair(0U, &AMDGPU::SReg_256RegClass);
3277       }
3278 
3279     case 'v':
3280       switch (VT.getSizeInBits()) {
3281       default:
3282         return std::make_pair(0U, nullptr);
3283       case 32:
3284         return std::make_pair(0U, &AMDGPU::VGPR_32RegClass);
3285       case 64:
3286         return std::make_pair(0U, &AMDGPU::VReg_64RegClass);
3287       case 96:
3288         return std::make_pair(0U, &AMDGPU::VReg_96RegClass);
3289       case 128:
3290         return std::make_pair(0U, &AMDGPU::VReg_128RegClass);
3291       case 256:
3292         return std::make_pair(0U, &AMDGPU::VReg_256RegClass);
3293       case 512:
3294         return std::make_pair(0U, &AMDGPU::VReg_512RegClass);
3295       }
3296     }
3297   }
3298 
3299   if (Constraint.size() > 1) {
3300     const TargetRegisterClass *RC = nullptr;
3301     if (Constraint[1] == 'v') {
3302       RC = &AMDGPU::VGPR_32RegClass;
3303     } else if (Constraint[1] == 's') {
3304       RC = &AMDGPU::SGPR_32RegClass;
3305     }
3306 
3307     if (RC) {
3308       uint32_t Idx;
3309       bool Failed = Constraint.substr(2).getAsInteger(10, Idx);
3310       if (!Failed && Idx < RC->getNumRegs())
3311         return std::make_pair(RC->getRegister(Idx), RC);
3312     }
3313   }
3314   return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
3315 }
3316 
3317 SITargetLowering::ConstraintType
3318 SITargetLowering::getConstraintType(StringRef Constraint) const {
3319   if (Constraint.size() == 1) {
3320     switch (Constraint[0]) {
3321     default: break;
3322     case 's':
3323     case 'v':
3324       return C_RegisterClass;
3325     }
3326   }
3327   return TargetLowering::getConstraintType(Constraint);
3328 }
3329