1 //===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// \brief This is the parent TargetLowering class for hardware code gen
12 /// targets.
13 //
14 //===----------------------------------------------------------------------===//
15 
16 #include "AMDGPUISelLowering.h"
17 #include "AMDGPU.h"
18 #include "AMDGPUCallLowering.h"
19 #include "AMDGPUFrameLowering.h"
20 #include "AMDGPUIntrinsicInfo.h"
21 #include "AMDGPURegisterInfo.h"
22 #include "AMDGPUSubtarget.h"
23 #include "R600MachineFunctionInfo.h"
24 #include "SIMachineFunctionInfo.h"
25 #include "llvm/CodeGen/CallingConvLower.h"
26 #include "llvm/CodeGen/MachineFunction.h"
27 #include "llvm/CodeGen/MachineRegisterInfo.h"
28 #include "llvm/CodeGen/SelectionDAG.h"
29 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
30 #include "llvm/IR/DataLayout.h"
31 #include "llvm/IR/DiagnosticInfo.h"
32 #include "SIInstrInfo.h"
33 using namespace llvm;
34 
35 static bool allocateKernArg(unsigned ValNo, MVT ValVT, MVT LocVT,
36                             CCValAssign::LocInfo LocInfo,
37                             ISD::ArgFlagsTy ArgFlags, CCState &State) {
38   MachineFunction &MF = State.getMachineFunction();
39   AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
40 
41   uint64_t Offset = MFI->allocateKernArg(LocVT.getStoreSize(),
42                                          ArgFlags.getOrigAlign());
43   State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo));
44   return true;
45 }
46 
47 #include "AMDGPUGenCallingConv.inc"
48 
49 // Find a larger type to do a load / store of a vector with.
50 EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
51   unsigned StoreSize = VT.getStoreSizeInBits();
52   if (StoreSize <= 32)
53     return EVT::getIntegerVT(Ctx, StoreSize);
54 
55   assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
56   return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
57 }
58 
59 AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
60                                            const AMDGPUSubtarget &STI)
61     : TargetLowering(TM), Subtarget(&STI) {
62   // Lower floating point store/load to integer store/load to reduce the number
63   // of patterns in tablegen.
64   setOperationAction(ISD::LOAD, MVT::f32, Promote);
65   AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
66 
67   setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
68   AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
69 
70   setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
71   AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
72 
73   setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
74   AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
75 
76   setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
77   AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
78 
79   setOperationAction(ISD::LOAD, MVT::i64, Promote);
80   AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
81 
82   setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
83   AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
84 
85   setOperationAction(ISD::LOAD, MVT::f64, Promote);
86   AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
87 
88   setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
89   AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
90 
91   // There are no 64-bit extloads. These should be done as a 32-bit extload and
92   // an extension to 64-bit.
93   for (MVT VT : MVT::integer_valuetypes()) {
94     setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand);
95     setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand);
96     setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand);
97   }
98 
99   for (MVT VT : MVT::integer_valuetypes()) {
100     if (VT == MVT::i64)
101       continue;
102 
103     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
104     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal);
105     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal);
106     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand);
107 
108     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
109     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal);
110     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal);
111     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand);
112 
113     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
114     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal);
115     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal);
116     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand);
117   }
118 
119   for (MVT VT : MVT::integer_vector_valuetypes()) {
120     setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand);
121     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand);
122     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand);
123     setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Expand);
124     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Expand);
125     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i8, Expand);
126     setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand);
127     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand);
128     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand);
129     setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand);
130     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand);
131     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand);
132   }
133 
134   setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
135   setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
136   setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
137   setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
138 
139   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
140   setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
141   setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
142   setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
143 
144   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
145   setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
146   setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
147   setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
148 
149   setOperationAction(ISD::STORE, MVT::f32, Promote);
150   AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
151 
152   setOperationAction(ISD::STORE, MVT::v2f32, Promote);
153   AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
154 
155   setOperationAction(ISD::STORE, MVT::v4f32, Promote);
156   AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
157 
158   setOperationAction(ISD::STORE, MVT::v8f32, Promote);
159   AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
160 
161   setOperationAction(ISD::STORE, MVT::v16f32, Promote);
162   AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
163 
164   setOperationAction(ISD::STORE, MVT::i64, Promote);
165   AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
166 
167   setOperationAction(ISD::STORE, MVT::v2i64, Promote);
168   AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
169 
170   setOperationAction(ISD::STORE, MVT::f64, Promote);
171   AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
172 
173   setOperationAction(ISD::STORE, MVT::v2f64, Promote);
174   AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
175 
176   setTruncStoreAction(MVT::i64, MVT::i1, Expand);
177   setTruncStoreAction(MVT::i64, MVT::i8, Expand);
178   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
179   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
180 
181   setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
182   setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
183   setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
184   setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
185 
186   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
187   setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
188   setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
189   setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
190 
191   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
192   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
193 
194   setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
195   setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
196 
197   setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
198   setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
199 
200   setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
201   setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
202 
203 
204   setOperationAction(ISD::Constant, MVT::i32, Legal);
205   setOperationAction(ISD::Constant, MVT::i64, Legal);
206   setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
207   setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
208 
209   setOperationAction(ISD::BR_JT, MVT::Other, Expand);
210   setOperationAction(ISD::BRIND, MVT::Other, Expand);
211 
212   // This is totally unsupported, just custom lower to produce an error.
213   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
214 
215   // We need to custom lower some of the intrinsics
216   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
217   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
218 
219   // Library functions.  These default to Expand, but we have instructions
220   // for them.
221   setOperationAction(ISD::FCEIL,  MVT::f32, Legal);
222   setOperationAction(ISD::FEXP2,  MVT::f32, Legal);
223   setOperationAction(ISD::FPOW,   MVT::f32, Legal);
224   setOperationAction(ISD::FLOG2,  MVT::f32, Legal);
225   setOperationAction(ISD::FABS,   MVT::f32, Legal);
226   setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
227   setOperationAction(ISD::FRINT,  MVT::f32, Legal);
228   setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
229   setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
230   setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
231 
232   setOperationAction(ISD::FROUND, MVT::f32, Custom);
233   setOperationAction(ISD::FROUND, MVT::f64, Custom);
234 
235   setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
236   setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);
237 
238   setOperationAction(ISD::FREM, MVT::f32, Custom);
239   setOperationAction(ISD::FREM, MVT::f64, Custom);
240 
241   // v_mad_f32 does not support denormals according to some sources.
242   if (!Subtarget->hasFP32Denormals())
243     setOperationAction(ISD::FMAD, MVT::f32, Legal);
244 
245   // Expand to fneg + fadd.
246   setOperationAction(ISD::FSUB, MVT::f64, Expand);
247 
248   setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
249   setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom);
250   setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
251   setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
252   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom);
253   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);
254   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom);
255   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom);
256   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
257   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);
258 
259   if (Subtarget->getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
260     setOperationAction(ISD::FCEIL, MVT::f64, Custom);
261     setOperationAction(ISD::FTRUNC, MVT::f64, Custom);
262     setOperationAction(ISD::FRINT, MVT::f64, Custom);
263     setOperationAction(ISD::FFLOOR, MVT::f64, Custom);
264   }
265 
266   if (!Subtarget->hasBFI()) {
267     // fcopysign can be done in a single instruction with BFI.
268     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
269     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
270   }
271 
272   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
273   setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom);
274   setOperationAction(ISD::FP_TO_FP16, MVT::f32, Custom);
275 
276   const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
277   for (MVT VT : ScalarIntVTs) {
278     // These should use [SU]DIVREM, so set them to expand
279     setOperationAction(ISD::SDIV, VT, Expand);
280     setOperationAction(ISD::UDIV, VT, Expand);
281     setOperationAction(ISD::SREM, VT, Expand);
282     setOperationAction(ISD::UREM, VT, Expand);
283 
284     // GPU does not have divrem function for signed or unsigned.
285     setOperationAction(ISD::SDIVREM, VT, Custom);
286     setOperationAction(ISD::UDIVREM, VT, Custom);
287 
288     // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
289     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
290     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
291 
292     setOperationAction(ISD::BSWAP, VT, Expand);
293     setOperationAction(ISD::CTTZ, VT, Expand);
294     setOperationAction(ISD::CTLZ, VT, Expand);
295   }
296 
297   if (!Subtarget->hasBCNT(32))
298     setOperationAction(ISD::CTPOP, MVT::i32, Expand);
299 
300   if (!Subtarget->hasBCNT(64))
301     setOperationAction(ISD::CTPOP, MVT::i64, Expand);
302 
303   // The hardware supports 32-bit ROTR, but not ROTL.
304   setOperationAction(ISD::ROTL, MVT::i32, Expand);
305   setOperationAction(ISD::ROTL, MVT::i64, Expand);
306   setOperationAction(ISD::ROTR, MVT::i64, Expand);
307 
308   setOperationAction(ISD::MUL, MVT::i64, Expand);
309   setOperationAction(ISD::MULHU, MVT::i64, Expand);
310   setOperationAction(ISD::MULHS, MVT::i64, Expand);
311   setOperationAction(ISD::UDIV, MVT::i32, Expand);
312   setOperationAction(ISD::UREM, MVT::i32, Expand);
313   setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
314   setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
315   setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
316   setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
317   setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
318 
319   setOperationAction(ISD::SMIN, MVT::i32, Legal);
320   setOperationAction(ISD::UMIN, MVT::i32, Legal);
321   setOperationAction(ISD::SMAX, MVT::i32, Legal);
322   setOperationAction(ISD::UMAX, MVT::i32, Legal);
323 
324   if (Subtarget->hasFFBH())
325     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
326 
327   if (Subtarget->hasFFBL())
328     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Legal);
329 
330   setOperationAction(ISD::CTLZ, MVT::i64, Custom);
331   setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
332 
333   // We only really have 32-bit BFE instructions (and 16-bit on VI).
334   //
335   // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
336   // effort to match them now. We want this to be false for i64 cases when the
337   // extraction isn't restricted to the upper or lower half. Ideally we would
338   // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
339   // span the midpoint are probably relatively rare, so don't worry about them
340   // for now.
341   if (Subtarget->hasBFE())
342     setHasExtractBitsInsn(true);
343 
344   static const MVT::SimpleValueType VectorIntTypes[] = {
345     MVT::v2i32, MVT::v4i32
346   };
347 
348   for (MVT VT : VectorIntTypes) {
349     // Expand the following operations for the current type by default.
350     setOperationAction(ISD::ADD,  VT, Expand);
351     setOperationAction(ISD::AND,  VT, Expand);
352     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
353     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
354     setOperationAction(ISD::MUL,  VT, Expand);
355     setOperationAction(ISD::MULHU, VT, Expand);
356     setOperationAction(ISD::MULHS, VT, Expand);
357     setOperationAction(ISD::OR,   VT, Expand);
358     setOperationAction(ISD::SHL,  VT, Expand);
359     setOperationAction(ISD::SRA,  VT, Expand);
360     setOperationAction(ISD::SRL,  VT, Expand);
361     setOperationAction(ISD::ROTL, VT, Expand);
362     setOperationAction(ISD::ROTR, VT, Expand);
363     setOperationAction(ISD::SUB,  VT, Expand);
364     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
365     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
366     setOperationAction(ISD::SDIV, VT, Expand);
367     setOperationAction(ISD::UDIV, VT, Expand);
368     setOperationAction(ISD::SREM, VT, Expand);
369     setOperationAction(ISD::UREM, VT, Expand);
370     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
371     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
372     setOperationAction(ISD::SDIVREM, VT, Custom);
373     setOperationAction(ISD::UDIVREM, VT, Expand);
374     setOperationAction(ISD::ADDC, VT, Expand);
375     setOperationAction(ISD::SUBC, VT, Expand);
376     setOperationAction(ISD::ADDE, VT, Expand);
377     setOperationAction(ISD::SUBE, VT, Expand);
378     setOperationAction(ISD::SELECT, VT, Expand);
379     setOperationAction(ISD::VSELECT, VT, Expand);
380     setOperationAction(ISD::SELECT_CC, VT, Expand);
381     setOperationAction(ISD::XOR,  VT, Expand);
382     setOperationAction(ISD::BSWAP, VT, Expand);
383     setOperationAction(ISD::CTPOP, VT, Expand);
384     setOperationAction(ISD::CTTZ, VT, Expand);
385     setOperationAction(ISD::CTLZ, VT, Expand);
386     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
387   }
388 
389   static const MVT::SimpleValueType FloatVectorTypes[] = {
390     MVT::v2f32, MVT::v4f32
391   };
392 
393   for (MVT VT : FloatVectorTypes) {
394     setOperationAction(ISD::FABS, VT, Expand);
395     setOperationAction(ISD::FMINNUM, VT, Expand);
396     setOperationAction(ISD::FMAXNUM, VT, Expand);
397     setOperationAction(ISD::FADD, VT, Expand);
398     setOperationAction(ISD::FCEIL, VT, Expand);
399     setOperationAction(ISD::FCOS, VT, Expand);
400     setOperationAction(ISD::FDIV, VT, Expand);
401     setOperationAction(ISD::FEXP2, VT, Expand);
402     setOperationAction(ISD::FLOG2, VT, Expand);
403     setOperationAction(ISD::FREM, VT, Expand);
404     setOperationAction(ISD::FPOW, VT, Expand);
405     setOperationAction(ISD::FFLOOR, VT, Expand);
406     setOperationAction(ISD::FTRUNC, VT, Expand);
407     setOperationAction(ISD::FMUL, VT, Expand);
408     setOperationAction(ISD::FMA, VT, Expand);
409     setOperationAction(ISD::FRINT, VT, Expand);
410     setOperationAction(ISD::FNEARBYINT, VT, Expand);
411     setOperationAction(ISD::FSQRT, VT, Expand);
412     setOperationAction(ISD::FSIN, VT, Expand);
413     setOperationAction(ISD::FSUB, VT, Expand);
414     setOperationAction(ISD::FNEG, VT, Expand);
415     setOperationAction(ISD::VSELECT, VT, Expand);
416     setOperationAction(ISD::SELECT_CC, VT, Expand);
417     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
418     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
419   }
420 
421   // This causes using an unrolled select operation rather than expansion with
422   // bit operations. This is in general better, but the alternative using BFI
423   // instructions may be better if the select sources are SGPRs.
424   setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
425   AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
426 
427   setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
428   AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
429 
430   // There are no libcalls of any kind.
431   for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
432     setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
433 
434   setBooleanContents(ZeroOrNegativeOneBooleanContent);
435   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
436 
437   setSchedulingPreference(Sched::RegPressure);
438   setJumpIsExpensive(true);
439 
440   // FIXME: This is only partially true. If we have to do vector compares, any
441   // SGPR pair can be a condition register. If we have a uniform condition, we
442   // are better off doing SALU operations, where there is only one SCC. For now,
443   // we don't have a way of knowing during instruction selection if a condition
444   // will be uniform and we always use vector compares. Assume we are using
445   // vector compares until that is fixed.
446   setHasMultipleConditionRegisters(true);
447 
448   // SI at least has hardware support for floating point exceptions, but no way
449   // of using or handling them is implemented. They are also optional in OpenCL
450   // (Section 7.3)
451   setHasFloatingPointExceptions(Subtarget->hasFPExceptions());
452 
453   PredictableSelectIsExpensive = false;
454 
455   // We want to find all load dependencies for long chains of stores to enable
456   // merging into very wide vectors. The problem is with vectors with > 4
457   // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
458   // vectors are a legal type, even though we have to split the loads
459   // usually. When we can more precisely specify load legality per address
460   // space, we should be able to make FindBetterChain/MergeConsecutiveStores
461   // smarter so that they can figure out what to do in 2 iterations without all
462   // N > 4 stores on the same chain.
463   GatherAllAliasesMaxDepth = 16;
464 
465   // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
466   // about these during lowering.
467   MaxStoresPerMemcpy  = 0xffffffff;
468   MaxStoresPerMemmove = 0xffffffff;
469   MaxStoresPerMemset  = 0xffffffff;
470 
471   setTargetDAGCombine(ISD::BITCAST);
472   setTargetDAGCombine(ISD::SHL);
473   setTargetDAGCombine(ISD::SRA);
474   setTargetDAGCombine(ISD::SRL);
475   setTargetDAGCombine(ISD::MUL);
476   setTargetDAGCombine(ISD::MULHU);
477   setTargetDAGCombine(ISD::MULHS);
478   setTargetDAGCombine(ISD::SELECT);
479   setTargetDAGCombine(ISD::SELECT_CC);
480   setTargetDAGCombine(ISD::STORE);
481   setTargetDAGCombine(ISD::FADD);
482   setTargetDAGCombine(ISD::FSUB);
483   setTargetDAGCombine(ISD::FNEG);
484   setTargetDAGCombine(ISD::FABS);
485 }
486 
487 //===----------------------------------------------------------------------===//
488 // Target Information
489 //===----------------------------------------------------------------------===//
490 
491 LLVM_READNONE
492 static bool fnegFoldsIntoOp(unsigned Opc) {
493   switch (Opc) {
494   case ISD::FADD:
495   case ISD::FSUB:
496   case ISD::FMUL:
497   case ISD::FMA:
498   case ISD::FMAD:
499   case ISD::FMINNUM:
500   case ISD::FMAXNUM:
501   case ISD::FSIN:
502   case ISD::FTRUNC:
503   case ISD::FRINT:
504   case ISD::FNEARBYINT:
505   case AMDGPUISD::RCP:
506   case AMDGPUISD::RCP_LEGACY:
507   case AMDGPUISD::SIN_HW:
508   case AMDGPUISD::FMUL_LEGACY:
509   case AMDGPUISD::FMIN_LEGACY:
510   case AMDGPUISD::FMAX_LEGACY:
511     return true;
512   default:
513     return false;
514   }
515 }
516 
517 /// \p returns true if the operation will definitely need to use a 64-bit
518 /// encoding, and thus will use a VOP3 encoding regardless of the source
519 /// modifiers.
520 LLVM_READONLY
521 static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
522   return N->getNumOperands() > 2 || VT == MVT::f64;
523 }
524 
525 // Most FP instructions support source modifiers, but this could be refined
526 // slightly.
527 LLVM_READONLY
528 static bool hasSourceMods(const SDNode *N) {
529   if (isa<MemSDNode>(N))
530     return false;
531 
532   switch (N->getOpcode()) {
533   case ISD::CopyToReg:
534   case ISD::SELECT:
535   case ISD::FDIV:
536   case ISD::FREM:
537   case ISD::INLINEASM:
538   case AMDGPUISD::INTERP_P1:
539   case AMDGPUISD::INTERP_P2:
540   case AMDGPUISD::DIV_SCALE:
541     return false;
542   default:
543     return true;
544   }
545 }
546 
547 static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold = 4) {
548   // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
549   // it is truly free to use a source modifier in all cases. If there are
550   // multiple users but for each one will necessitate using VOP3, there will be
551   // a code size increase. Try to avoid increasing code size unless we know it
552   // will save on the instruction count.
553   unsigned NumMayIncreaseSize = 0;
554   MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
555 
556   // XXX - Should this limit number of uses to check?
557   for (const SDNode *U : N->uses()) {
558     if (!hasSourceMods(U))
559       return false;
560 
561     if (!opMustUseVOP3Encoding(U, VT)) {
562       if (++NumMayIncreaseSize > CostThreshold)
563         return false;
564     }
565   }
566 
567   return true;
568 }
569 
570 MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
571   return MVT::i32;
572 }
573 
574 bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
575   return true;
576 }
577 
578 // The backend supports 32 and 64 bit floating point immediates.
579 // FIXME: Why are we reporting vectors of FP immediates as legal?
580 bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
581   EVT ScalarVT = VT.getScalarType();
582   return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
583          (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
584 }
585 
586 // We don't want to shrink f64 / f32 constants.
587 bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
588   EVT ScalarVT = VT.getScalarType();
589   return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
590 }
591 
592 bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
593                                                  ISD::LoadExtType,
594                                                  EVT NewVT) const {
595 
596   unsigned NewSize = NewVT.getStoreSizeInBits();
597 
598   // If we are reducing to a 32-bit load, this is always better.
599   if (NewSize == 32)
600     return true;
601 
602   EVT OldVT = N->getValueType(0);
603   unsigned OldSize = OldVT.getStoreSizeInBits();
604 
605   // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
606   // extloads, so doing one requires using a buffer_load. In cases where we
607   // still couldn't use a scalar load, using the wider load shouldn't really
608   // hurt anything.
609 
610   // If the old size already had to be an extload, there's no harm in continuing
611   // to reduce the width.
612   return (OldSize < 32);
613 }
614 
615 bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy,
616                                                    EVT CastTy) const {
617 
618   assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
619 
620   if (LoadTy.getScalarType() == MVT::i32)
621     return false;
622 
623   unsigned LScalarSize = LoadTy.getScalarSizeInBits();
624   unsigned CastScalarSize = CastTy.getScalarSizeInBits();
625 
626   return (LScalarSize < CastScalarSize) ||
627          (CastScalarSize >= 32);
628 }
629 
630 // SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
631 // profitable with the expansion for 64-bit since it's generally good to
632 // speculate things.
633 // FIXME: These should really have the size as a parameter.
634 bool AMDGPUTargetLowering::isCheapToSpeculateCttz() const {
635   return true;
636 }
637 
638 bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const {
639   return true;
640 }
641 
642 //===---------------------------------------------------------------------===//
643 // Target Properties
644 //===---------------------------------------------------------------------===//
645 
646 bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
647   assert(VT.isFloatingPoint());
648 
649   // Packed operations do not have a fabs modifier.
650   return VT == MVT::f32 || VT == MVT::f64 ||
651          (Subtarget->has16BitInsts() && VT == MVT::f16);
652 }
653 
654 bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
655   assert(VT.isFloatingPoint());
656   return VT == MVT::f32 || VT == MVT::f64 ||
657          (Subtarget->has16BitInsts() && VT == MVT::f16) ||
658          (Subtarget->hasVOP3PInsts() && VT == MVT::v2f16);
659 }
660 
661 bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT,
662                                                          unsigned NumElem,
663                                                          unsigned AS) const {
664   return true;
665 }
666 
667 bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
668   // There are few operations which truly have vector input operands. Any vector
669   // operation is going to involve operations on each component, and a
670   // build_vector will be a copy per element, so it always makes sense to use a
671   // build_vector input in place of the extracted element to avoid a copy into a
672   // super register.
673   //
674   // We should probably only do this if all users are extracts only, but this
675   // should be the common case.
676   return true;
677 }
678 
679 bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
680   // Truncate is just accessing a subregister.
681 
682   unsigned SrcSize = Source.getSizeInBits();
683   unsigned DestSize = Dest.getSizeInBits();
684 
685   return DestSize < SrcSize && DestSize % 32 == 0 ;
686 }
687 
688 bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
689   // Truncate is just accessing a subregister.
690 
691   unsigned SrcSize = Source->getScalarSizeInBits();
692   unsigned DestSize = Dest->getScalarSizeInBits();
693 
694   if (DestSize== 16 && Subtarget->has16BitInsts())
695     return SrcSize >= 32;
696 
697   return DestSize < SrcSize && DestSize % 32 == 0;
698 }
699 
700 bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
701   unsigned SrcSize = Src->getScalarSizeInBits();
702   unsigned DestSize = Dest->getScalarSizeInBits();
703 
704   if (SrcSize == 16 && Subtarget->has16BitInsts())
705     return DestSize >= 32;
706 
707   return SrcSize == 32 && DestSize == 64;
708 }
709 
710 bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
711   // Any register load of a 64-bit value really requires 2 32-bit moves. For all
712   // practical purposes, the extra mov 0 to load a 64-bit is free.  As used,
713   // this will enable reducing 64-bit operations the 32-bit, which is always
714   // good.
715 
716   if (Src == MVT::i16)
717     return Dest == MVT::i32 ||Dest == MVT::i64 ;
718 
719   return Src == MVT::i32 && Dest == MVT::i64;
720 }
721 
722 bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
723   return isZExtFree(Val.getValueType(), VT2);
724 }
725 
726 bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
727   // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
728   // limited number of native 64-bit operations. Shrinking an operation to fit
729   // in a single 32-bit register should always be helpful. As currently used,
730   // this is much less general than the name suggests, and is only used in
731   // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
732   // not profitable, and may actually be harmful.
733   return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
734 }
735 
736 //===---------------------------------------------------------------------===//
737 // TargetLowering Callbacks
738 //===---------------------------------------------------------------------===//
739 
740 CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
741                                                   bool IsVarArg) const {
742   return CC_AMDGPU;
743 }
744 
745 /// The SelectionDAGBuilder will automatically promote function arguments
746 /// with illegal types.  However, this does not work for the AMDGPU targets
747 /// since the function arguments are stored in memory as these illegal types.
748 /// In order to handle this properly we need to get the original types sizes
749 /// from the LLVM IR Function and fixup the ISD:InputArg values before
750 /// passing them to AnalyzeFormalArguments()
751 
752 /// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
753 /// input values across multiple registers.  Each item in the Ins array
754 /// represents a single value that will be stored in regsters.  Ins[x].VT is
755 /// the value type of the value that will be stored in the register, so
756 /// whatever SDNode we lower the argument to needs to be this type.
757 ///
758 /// In order to correctly lower the arguments we need to know the size of each
759 /// argument.  Since Ins[x].VT gives us the size of the register that will
760 /// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
761 /// for the orignal function argument so that we can deduce the correct memory
762 /// type to use for Ins[x].  In most cases the correct memory type will be
763 /// Ins[x].ArgVT.  However, this will not always be the case.  If, for example,
764 /// we have a kernel argument of type v8i8, this argument will be split into
765 /// 8 parts and each part will be represented by its own item in the Ins array.
766 /// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
767 /// the argument before it was split.  From this, we deduce that the memory type
768 /// for each individual part is i8.  We pass the memory type as LocVT to the
769 /// calling convention analysis function and the register type (Ins[x].VT) as
770 /// the ValVT.
771 void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(CCState &State,
772                              const SmallVectorImpl<ISD::InputArg> &Ins) const {
773   for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
774     const ISD::InputArg &In = Ins[i];
775     EVT MemVT;
776 
777     unsigned NumRegs = getNumRegisters(State.getContext(), In.ArgVT);
778 
779     if (!Subtarget->isAmdHsaOS() &&
780         (In.ArgVT == MVT::i16 || In.ArgVT == MVT::i8 || In.ArgVT == MVT::f16)) {
781       // The ABI says the caller will extend these values to 32-bits.
782       MemVT = In.ArgVT.isInteger() ? MVT::i32 : MVT::f32;
783     } else if (NumRegs == 1) {
784       // This argument is not split, so the IR type is the memory type.
785       assert(!In.Flags.isSplit());
786       if (In.ArgVT.isExtended()) {
787         // We have an extended type, like i24, so we should just use the register type
788         MemVT = In.VT;
789       } else {
790         MemVT = In.ArgVT;
791       }
792     } else if (In.ArgVT.isVector() && In.VT.isVector() &&
793                In.ArgVT.getScalarType() == In.VT.getScalarType()) {
794       assert(In.ArgVT.getVectorNumElements() > In.VT.getVectorNumElements());
795       // We have a vector value which has been split into a vector with
796       // the same scalar type, but fewer elements.  This should handle
797       // all the floating-point vector types.
798       MemVT = In.VT;
799     } else if (In.ArgVT.isVector() &&
800                In.ArgVT.getVectorNumElements() == NumRegs) {
801       // This arg has been split so that each element is stored in a separate
802       // register.
803       MemVT = In.ArgVT.getScalarType();
804     } else if (In.ArgVT.isExtended()) {
805       // We have an extended type, like i65.
806       MemVT = In.VT;
807     } else {
808       unsigned MemoryBits = In.ArgVT.getStoreSizeInBits() / NumRegs;
809       assert(In.ArgVT.getStoreSizeInBits() % NumRegs == 0);
810       if (In.VT.isInteger()) {
811         MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
812       } else if (In.VT.isVector()) {
813         assert(!In.VT.getScalarType().isFloatingPoint());
814         unsigned NumElements = In.VT.getVectorNumElements();
815         assert(MemoryBits % NumElements == 0);
816         // This vector type has been split into another vector type with
817         // a different elements size.
818         EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
819                                          MemoryBits / NumElements);
820         MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
821       } else {
822         llvm_unreachable("cannot deduce memory type.");
823       }
824     }
825 
826     // Convert one element vectors to scalar.
827     if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
828       MemVT = MemVT.getScalarType();
829 
830     if (MemVT.isExtended()) {
831       // This should really only happen if we have vec3 arguments
832       assert(MemVT.isVector() && MemVT.getVectorNumElements() == 3);
833       MemVT = MemVT.getPow2VectorType(State.getContext());
834     }
835 
836     assert(MemVT.isSimple());
837     allocateKernArg(i, In.VT, MemVT.getSimpleVT(), CCValAssign::Full, In.Flags,
838                     State);
839   }
840 }
841 
842 void AMDGPUTargetLowering::AnalyzeFormalArguments(CCState &State,
843                               const SmallVectorImpl<ISD::InputArg> &Ins) const {
844   State.AnalyzeFormalArguments(Ins, CC_AMDGPU);
845 }
846 
847 void AMDGPUTargetLowering::AnalyzeReturn(CCState &State,
848                            const SmallVectorImpl<ISD::OutputArg> &Outs) const {
849 
850   State.AnalyzeReturn(Outs, RetCC_SI);
851 }
852 
853 SDValue
854 AMDGPUTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
855                                   bool isVarArg,
856                                   const SmallVectorImpl<ISD::OutputArg> &Outs,
857                                   const SmallVectorImpl<SDValue> &OutVals,
858                                   const SDLoc &DL, SelectionDAG &DAG) const {
859   return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
860 }
861 
862 //===---------------------------------------------------------------------===//
863 // Target specific lowering
864 //===---------------------------------------------------------------------===//
865 
866 SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
867                                         SmallVectorImpl<SDValue> &InVals) const {
868   SDValue Callee = CLI.Callee;
869   SelectionDAG &DAG = CLI.DAG;
870 
871   const Function &Fn = *DAG.getMachineFunction().getFunction();
872 
873   StringRef FuncName("<unknown>");
874 
875   if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
876     FuncName = G->getSymbol();
877   else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
878     FuncName = G->getGlobal()->getName();
879 
880   DiagnosticInfoUnsupported NoCalls(
881       Fn, "unsupported call to function " + FuncName, CLI.DL.getDebugLoc());
882   DAG.getContext()->diagnose(NoCalls);
883 
884   if (!CLI.IsTailCall) {
885     for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
886       InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
887   }
888 
889   return DAG.getEntryNode();
890 }
891 
892 SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
893                                                       SelectionDAG &DAG) const {
894   const Function &Fn = *DAG.getMachineFunction().getFunction();
895 
896   DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
897                                             SDLoc(Op).getDebugLoc());
898   DAG.getContext()->diagnose(NoDynamicAlloca);
899   auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
900   return DAG.getMergeValues(Ops, SDLoc());
901 }
902 
903 SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
904                                              SelectionDAG &DAG) const {
905   switch (Op.getOpcode()) {
906   default:
907     Op->print(errs(), &DAG);
908     llvm_unreachable("Custom lowering code for this"
909                      "instruction is not implemented yet!");
910     break;
911   case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
912   case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
913   case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
914   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
915   case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
916   case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
917   case ISD::FREM: return LowerFREM(Op, DAG);
918   case ISD::FCEIL: return LowerFCEIL(Op, DAG);
919   case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
920   case ISD::FRINT: return LowerFRINT(Op, DAG);
921   case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
922   case ISD::FROUND: return LowerFROUND(Op, DAG);
923   case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
924   case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
925   case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
926   case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
927   case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
928   case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
929   case ISD::CTLZ:
930   case ISD::CTLZ_ZERO_UNDEF:
931     return LowerCTLZ(Op, DAG);
932   case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
933   }
934   return Op;
935 }
936 
937 void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
938                                               SmallVectorImpl<SDValue> &Results,
939                                               SelectionDAG &DAG) const {
940   switch (N->getOpcode()) {
941   case ISD::SIGN_EXTEND_INREG:
942     // Different parts of legalization seem to interpret which type of
943     // sign_extend_inreg is the one to check for custom lowering. The extended
944     // from type is what really matters, but some places check for custom
945     // lowering of the result type. This results in trying to use
946     // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
947     // nothing here and let the illegal result integer be handled normally.
948     return;
949   default:
950     return;
951   }
952 }
953 
954 static bool hasDefinedInitializer(const GlobalValue *GV) {
955   const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
956   if (!GVar || !GVar->hasInitializer())
957     return false;
958 
959   return !isa<UndefValue>(GVar->getInitializer());
960 }
961 
962 SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
963                                                  SDValue Op,
964                                                  SelectionDAG &DAG) const {
965 
966   const DataLayout &DL = DAG.getDataLayout();
967   GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
968   const GlobalValue *GV = G->getGlobal();
969 
970   switch (G->getAddressSpace()) {
971   case AMDGPUAS::LOCAL_ADDRESS: {
972     // XXX: What does the value of G->getOffset() mean?
973     assert(G->getOffset() == 0 &&
974          "Do not know what to do with an non-zero offset");
975 
976     // TODO: We could emit code to handle the initialization somewhere.
977     if (hasDefinedInitializer(GV))
978       break;
979 
980     unsigned Offset = MFI->allocateLDSGlobal(DL, *GV);
981     return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
982   }
983   }
984 
985   const Function &Fn = *DAG.getMachineFunction().getFunction();
986   DiagnosticInfoUnsupported BadInit(
987       Fn, "unsupported initializer for address space", SDLoc(Op).getDebugLoc());
988   DAG.getContext()->diagnose(BadInit);
989   return SDValue();
990 }
991 
992 SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
993                                                   SelectionDAG &DAG) const {
994   SmallVector<SDValue, 8> Args;
995 
996   for (const SDUse &U : Op->ops())
997     DAG.ExtractVectorElements(U.get(), Args);
998 
999   return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
1000 }
1001 
1002 SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
1003                                                      SelectionDAG &DAG) const {
1004 
1005   SmallVector<SDValue, 8> Args;
1006   unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
1007   EVT VT = Op.getValueType();
1008   DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
1009                             VT.getVectorNumElements());
1010 
1011   return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
1012 }
1013 
1014 SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
1015     SelectionDAG &DAG) const {
1016   unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
1017   SDLoc DL(Op);
1018   EVT VT = Op.getValueType();
1019 
1020   switch (IntrinsicID) {
1021   default: return Op;
1022   case AMDGPUIntrinsic::AMDGPU_bfe_i32:
1023     return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
1024                        Op.getOperand(1),
1025                        Op.getOperand(2),
1026                        Op.getOperand(3));
1027 
1028   case AMDGPUIntrinsic::AMDGPU_bfe_u32:
1029     return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
1030                        Op.getOperand(1),
1031                        Op.getOperand(2),
1032                        Op.getOperand(3));
1033   }
1034 }
1035 
1036 /// \brief Generate Min/Max node
1037 SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
1038                                                    SDValue LHS, SDValue RHS,
1039                                                    SDValue True, SDValue False,
1040                                                    SDValue CC,
1041                                                    DAGCombinerInfo &DCI) const {
1042   if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
1043     return SDValue();
1044 
1045   SelectionDAG &DAG = DCI.DAG;
1046   ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1047   switch (CCOpcode) {
1048   case ISD::SETOEQ:
1049   case ISD::SETONE:
1050   case ISD::SETUNE:
1051   case ISD::SETNE:
1052   case ISD::SETUEQ:
1053   case ISD::SETEQ:
1054   case ISD::SETFALSE:
1055   case ISD::SETFALSE2:
1056   case ISD::SETTRUE:
1057   case ISD::SETTRUE2:
1058   case ISD::SETUO:
1059   case ISD::SETO:
1060     break;
1061   case ISD::SETULE:
1062   case ISD::SETULT: {
1063     if (LHS == True)
1064       return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1065     return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1066   }
1067   case ISD::SETOLE:
1068   case ISD::SETOLT:
1069   case ISD::SETLE:
1070   case ISD::SETLT: {
1071     // Ordered. Assume ordered for undefined.
1072 
1073     // Only do this after legalization to avoid interfering with other combines
1074     // which might occur.
1075     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1076         !DCI.isCalledByLegalizer())
1077       return SDValue();
1078 
1079     // We need to permute the operands to get the correct NaN behavior. The
1080     // selected operand is the second one based on the failing compare with NaN,
1081     // so permute it based on the compare type the hardware uses.
1082     if (LHS == True)
1083       return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1084     return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1085   }
1086   case ISD::SETUGE:
1087   case ISD::SETUGT: {
1088     if (LHS == True)
1089       return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1090     return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1091   }
1092   case ISD::SETGT:
1093   case ISD::SETGE:
1094   case ISD::SETOGE:
1095   case ISD::SETOGT: {
1096     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1097         !DCI.isCalledByLegalizer())
1098       return SDValue();
1099 
1100     if (LHS == True)
1101       return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1102     return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1103   }
1104   case ISD::SETCC_INVALID:
1105     llvm_unreachable("Invalid setcc condcode!");
1106   }
1107   return SDValue();
1108 }
1109 
1110 std::pair<SDValue, SDValue>
1111 AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
1112   SDLoc SL(Op);
1113 
1114   SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1115 
1116   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1117   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1118 
1119   SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1120   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1121 
1122   return std::make_pair(Lo, Hi);
1123 }
1124 
1125 SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
1126   SDLoc SL(Op);
1127 
1128   SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1129   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1130   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1131 }
1132 
1133 SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
1134   SDLoc SL(Op);
1135 
1136   SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1137   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1138   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1139 }
1140 
1141 SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
1142                                               SelectionDAG &DAG) const {
1143   LoadSDNode *Load = cast<LoadSDNode>(Op);
1144   EVT VT = Op.getValueType();
1145 
1146 
1147   // If this is a 2 element vector, we really want to scalarize and not create
1148   // weird 1 element vectors.
1149   if (VT.getVectorNumElements() == 2)
1150     return scalarizeVectorLoad(Load, DAG);
1151 
1152   SDValue BasePtr = Load->getBasePtr();
1153   EVT PtrVT = BasePtr.getValueType();
1154   EVT MemVT = Load->getMemoryVT();
1155   SDLoc SL(Op);
1156 
1157   const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1158 
1159   EVT LoVT, HiVT;
1160   EVT LoMemVT, HiMemVT;
1161   SDValue Lo, Hi;
1162 
1163   std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
1164   std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT);
1165   std::tie(Lo, Hi) = DAG.SplitVector(Op, SL, LoVT, HiVT);
1166 
1167   unsigned Size = LoMemVT.getStoreSize();
1168   unsigned BaseAlign = Load->getAlignment();
1169   unsigned HiAlign = MinAlign(BaseAlign, Size);
1170 
1171   SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
1172                                   Load->getChain(), BasePtr, SrcValue, LoMemVT,
1173                                   BaseAlign, Load->getMemOperand()->getFlags());
1174   SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
1175                               DAG.getConstant(Size, SL, PtrVT));
1176   SDValue HiLoad =
1177       DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
1178                      HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
1179                      HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
1180 
1181   SDValue Ops[] = {
1182     DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad),
1183     DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
1184                 LoLoad.getValue(1), HiLoad.getValue(1))
1185   };
1186 
1187   return DAG.getMergeValues(Ops, SL);
1188 }
1189 
1190 SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
1191                                                SelectionDAG &DAG) const {
1192   StoreSDNode *Store = cast<StoreSDNode>(Op);
1193   SDValue Val = Store->getValue();
1194   EVT VT = Val.getValueType();
1195 
1196   // If this is a 2 element vector, we really want to scalarize and not create
1197   // weird 1 element vectors.
1198   if (VT.getVectorNumElements() == 2)
1199     return scalarizeVectorStore(Store, DAG);
1200 
1201   EVT MemVT = Store->getMemoryVT();
1202   SDValue Chain = Store->getChain();
1203   SDValue BasePtr = Store->getBasePtr();
1204   SDLoc SL(Op);
1205 
1206   EVT LoVT, HiVT;
1207   EVT LoMemVT, HiMemVT;
1208   SDValue Lo, Hi;
1209 
1210   std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
1211   std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT);
1212   std::tie(Lo, Hi) = DAG.SplitVector(Val, SL, LoVT, HiVT);
1213 
1214   EVT PtrVT = BasePtr.getValueType();
1215   SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
1216                               DAG.getConstant(LoMemVT.getStoreSize(), SL,
1217                                               PtrVT));
1218 
1219   const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1220   unsigned BaseAlign = Store->getAlignment();
1221   unsigned Size = LoMemVT.getStoreSize();
1222   unsigned HiAlign = MinAlign(BaseAlign, Size);
1223 
1224   SDValue LoStore =
1225       DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
1226                         Store->getMemOperand()->getFlags());
1227   SDValue HiStore =
1228       DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
1229                         HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
1230 
1231   return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
1232 }
1233 
1234 // This is a shortcut for integer division because we have fast i32<->f32
1235 // conversions, and fast f32 reciprocal instructions. The fractional part of a
1236 // float is enough to accurately represent up to a 24-bit signed integer.
1237 SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
1238                                             bool Sign) const {
1239   SDLoc DL(Op);
1240   EVT VT = Op.getValueType();
1241   SDValue LHS = Op.getOperand(0);
1242   SDValue RHS = Op.getOperand(1);
1243   MVT IntVT = MVT::i32;
1244   MVT FltVT = MVT::f32;
1245 
1246   unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
1247   if (LHSSignBits < 9)
1248     return SDValue();
1249 
1250   unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
1251   if (RHSSignBits < 9)
1252     return SDValue();
1253 
1254   unsigned BitSize = VT.getSizeInBits();
1255   unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1256   unsigned DivBits = BitSize - SignBits;
1257   if (Sign)
1258     ++DivBits;
1259 
1260   ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
1261   ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
1262 
1263   SDValue jq = DAG.getConstant(1, DL, IntVT);
1264 
1265   if (Sign) {
1266     // char|short jq = ia ^ ib;
1267     jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
1268 
1269     // jq = jq >> (bitsize - 2)
1270     jq = DAG.getNode(ISD::SRA, DL, VT, jq,
1271                      DAG.getConstant(BitSize - 2, DL, VT));
1272 
1273     // jq = jq | 0x1
1274     jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
1275   }
1276 
1277   // int ia = (int)LHS;
1278   SDValue ia = LHS;
1279 
1280   // int ib, (int)RHS;
1281   SDValue ib = RHS;
1282 
1283   // float fa = (float)ia;
1284   SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
1285 
1286   // float fb = (float)ib;
1287   SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
1288 
1289   SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
1290                            fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
1291 
1292   // fq = trunc(fq);
1293   fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
1294 
1295   // float fqneg = -fq;
1296   SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
1297 
1298   // float fr = mad(fqneg, fb, fa);
1299   unsigned OpCode = Subtarget->hasFP32Denormals() ?
1300                     (unsigned)AMDGPUISD::FMAD_FTZ :
1301                     (unsigned)ISD::FMAD;
1302   SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
1303 
1304   // int iq = (int)fq;
1305   SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
1306 
1307   // fr = fabs(fr);
1308   fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
1309 
1310   // fb = fabs(fb);
1311   fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
1312 
1313   EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
1314 
1315   // int cv = fr >= fb;
1316   SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
1317 
1318   // jq = (cv ? jq : 0);
1319   jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
1320 
1321   // dst = iq + jq;
1322   SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
1323 
1324   // Rem needs compensation, it's easier to recompute it
1325   SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
1326   Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
1327 
1328   // Truncate to number of bits this divide really is.
1329   if (Sign) {
1330     SDValue InRegSize
1331       = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
1332     Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
1333     Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
1334   } else {
1335     SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
1336     Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
1337     Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
1338   }
1339 
1340   return DAG.getMergeValues({ Div, Rem }, DL);
1341 }
1342 
1343 void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
1344                                       SelectionDAG &DAG,
1345                                       SmallVectorImpl<SDValue> &Results) const {
1346   assert(Op.getValueType() == MVT::i64);
1347 
1348   SDLoc DL(Op);
1349   EVT VT = Op.getValueType();
1350   EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
1351 
1352   SDValue one = DAG.getConstant(1, DL, HalfVT);
1353   SDValue zero = DAG.getConstant(0, DL, HalfVT);
1354 
1355   //HiLo split
1356   SDValue LHS = Op.getOperand(0);
1357   SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero);
1358   SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one);
1359 
1360   SDValue RHS = Op.getOperand(1);
1361   SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero);
1362   SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one);
1363 
1364   if (VT == MVT::i64 &&
1365     DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
1366     DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
1367 
1368     SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
1369                               LHS_Lo, RHS_Lo);
1370 
1371     SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), zero});
1372     SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), zero});
1373 
1374     Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
1375     Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
1376     return;
1377   }
1378 
1379   // Get Speculative values
1380   SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
1381   SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
1382 
1383   SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ);
1384   SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, zero});
1385   REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
1386 
1387   SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ);
1388   SDValue DIV_Lo = zero;
1389 
1390   const unsigned halfBitWidth = HalfVT.getSizeInBits();
1391 
1392   for (unsigned i = 0; i < halfBitWidth; ++i) {
1393     const unsigned bitPos = halfBitWidth - i - 1;
1394     SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
1395     // Get value of high bit
1396     SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
1397     HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one);
1398     HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
1399 
1400     // Shift
1401     REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
1402     // Add LHS high bit
1403     REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
1404 
1405     SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
1406     SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETUGE);
1407 
1408     DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
1409 
1410     // Update REM
1411     SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
1412     REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
1413   }
1414 
1415   SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
1416   DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
1417   Results.push_back(DIV);
1418   Results.push_back(REM);
1419 }
1420 
1421 SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
1422                                            SelectionDAG &DAG) const {
1423   SDLoc DL(Op);
1424   EVT VT = Op.getValueType();
1425 
1426   if (VT == MVT::i64) {
1427     SmallVector<SDValue, 2> Results;
1428     LowerUDIVREM64(Op, DAG, Results);
1429     return DAG.getMergeValues(Results, DL);
1430   }
1431 
1432   if (VT == MVT::i32) {
1433     if (SDValue Res = LowerDIVREM24(Op, DAG, false))
1434       return Res;
1435   }
1436 
1437   SDValue Num = Op.getOperand(0);
1438   SDValue Den = Op.getOperand(1);
1439 
1440   // RCP =  URECIP(Den) = 2^32 / Den + e
1441   // e is rounding error.
1442   SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den);
1443 
1444   // RCP_LO = mul(RCP, Den) */
1445   SDValue RCP_LO = DAG.getNode(ISD::MUL, DL, VT, RCP, Den);
1446 
1447   // RCP_HI = mulhu (RCP, Den) */
1448   SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den);
1449 
1450   // NEG_RCP_LO = -RCP_LO
1451   SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
1452                                                      RCP_LO);
1453 
1454   // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
1455   SDValue ABS_RCP_LO = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT),
1456                                            NEG_RCP_LO, RCP_LO,
1457                                            ISD::SETEQ);
1458   // Calculate the rounding error from the URECIP instruction
1459   // E = mulhu(ABS_RCP_LO, RCP)
1460   SDValue E = DAG.getNode(ISD::MULHU, DL, VT, ABS_RCP_LO, RCP);
1461 
1462   // RCP_A_E = RCP + E
1463   SDValue RCP_A_E = DAG.getNode(ISD::ADD, DL, VT, RCP, E);
1464 
1465   // RCP_S_E = RCP - E
1466   SDValue RCP_S_E = DAG.getNode(ISD::SUB, DL, VT, RCP, E);
1467 
1468   // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E)
1469   SDValue Tmp0 = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT),
1470                                      RCP_A_E, RCP_S_E,
1471                                      ISD::SETEQ);
1472   // Quotient = mulhu(Tmp0, Num)
1473   SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num);
1474 
1475   // Num_S_Remainder = Quotient * Den
1476   SDValue Num_S_Remainder = DAG.getNode(ISD::MUL, DL, VT, Quotient, Den);
1477 
1478   // Remainder = Num - Num_S_Remainder
1479   SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder);
1480 
1481   // Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
1482   SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den,
1483                                                  DAG.getConstant(-1, DL, VT),
1484                                                  DAG.getConstant(0, DL, VT),
1485                                                  ISD::SETUGE);
1486   // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0)
1487   SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Num,
1488                                                   Num_S_Remainder,
1489                                                   DAG.getConstant(-1, DL, VT),
1490                                                   DAG.getConstant(0, DL, VT),
1491                                                   ISD::SETUGE);
1492   // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
1493   SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den,
1494                                                Remainder_GE_Zero);
1495 
1496   // Calculate Division result:
1497 
1498   // Quotient_A_One = Quotient + 1
1499   SDValue Quotient_A_One = DAG.getNode(ISD::ADD, DL, VT, Quotient,
1500                                        DAG.getConstant(1, DL, VT));
1501 
1502   // Quotient_S_One = Quotient - 1
1503   SDValue Quotient_S_One = DAG.getNode(ISD::SUB, DL, VT, Quotient,
1504                                        DAG.getConstant(1, DL, VT));
1505 
1506   // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
1507   SDValue Div = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT),
1508                                      Quotient, Quotient_A_One, ISD::SETEQ);
1509 
1510   // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
1511   Div = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT),
1512                             Quotient_S_One, Div, ISD::SETEQ);
1513 
1514   // Calculate Rem result:
1515 
1516   // Remainder_S_Den = Remainder - Den
1517   SDValue Remainder_S_Den = DAG.getNode(ISD::SUB, DL, VT, Remainder, Den);
1518 
1519   // Remainder_A_Den = Remainder + Den
1520   SDValue Remainder_A_Den = DAG.getNode(ISD::ADD, DL, VT, Remainder, Den);
1521 
1522   // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
1523   SDValue Rem = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT),
1524                                     Remainder, Remainder_S_Den, ISD::SETEQ);
1525 
1526   // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
1527   Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT),
1528                             Remainder_A_Den, Rem, ISD::SETEQ);
1529   SDValue Ops[2] = {
1530     Div,
1531     Rem
1532   };
1533   return DAG.getMergeValues(Ops, DL);
1534 }
1535 
1536 SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
1537                                            SelectionDAG &DAG) const {
1538   SDLoc DL(Op);
1539   EVT VT = Op.getValueType();
1540 
1541   SDValue LHS = Op.getOperand(0);
1542   SDValue RHS = Op.getOperand(1);
1543 
1544   SDValue Zero = DAG.getConstant(0, DL, VT);
1545   SDValue NegOne = DAG.getConstant(-1, DL, VT);
1546 
1547   if (VT == MVT::i32) {
1548     if (SDValue Res = LowerDIVREM24(Op, DAG, true))
1549       return Res;
1550   }
1551 
1552   if (VT == MVT::i64 &&
1553       DAG.ComputeNumSignBits(LHS) > 32 &&
1554       DAG.ComputeNumSignBits(RHS) > 32) {
1555     EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
1556 
1557     //HiLo split
1558     SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
1559     SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
1560     SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
1561                                  LHS_Lo, RHS_Lo);
1562     SDValue Res[2] = {
1563       DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
1564       DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
1565     };
1566     return DAG.getMergeValues(Res, DL);
1567   }
1568 
1569   SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
1570   SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
1571   SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
1572   SDValue RSign = LHSign; // Remainder sign is the same as LHS
1573 
1574   LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
1575   RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
1576 
1577   LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
1578   RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
1579 
1580   SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
1581   SDValue Rem = Div.getValue(1);
1582 
1583   Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
1584   Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
1585 
1586   Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
1587   Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
1588 
1589   SDValue Res[2] = {
1590     Div,
1591     Rem
1592   };
1593   return DAG.getMergeValues(Res, DL);
1594 }
1595 
1596 // (frem x, y) -> (fsub x, (fmul (ftrunc (fdiv x, y)), y))
1597 SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
1598   SDLoc SL(Op);
1599   EVT VT = Op.getValueType();
1600   SDValue X = Op.getOperand(0);
1601   SDValue Y = Op.getOperand(1);
1602 
1603   // TODO: Should this propagate fast-math-flags?
1604 
1605   SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y);
1606   SDValue Floor = DAG.getNode(ISD::FTRUNC, SL, VT, Div);
1607   SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Floor, Y);
1608 
1609   return DAG.getNode(ISD::FSUB, SL, VT, X, Mul);
1610 }
1611 
1612 SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
1613   SDLoc SL(Op);
1614   SDValue Src = Op.getOperand(0);
1615 
1616   // result = trunc(src)
1617   // if (src > 0.0 && src != result)
1618   //   result += 1.0
1619 
1620   SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
1621 
1622   const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
1623   const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
1624 
1625   EVT SetCCVT =
1626       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
1627 
1628   SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
1629   SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
1630   SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
1631 
1632   SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
1633   // TODO: Should this propagate fast-math-flags?
1634   return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
1635 }
1636 
1637 static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
1638                                   SelectionDAG &DAG) {
1639   const unsigned FractBits = 52;
1640   const unsigned ExpBits = 11;
1641 
1642   SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
1643                                 Hi,
1644                                 DAG.getConstant(FractBits - 32, SL, MVT::i32),
1645                                 DAG.getConstant(ExpBits, SL, MVT::i32));
1646   SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
1647                             DAG.getConstant(1023, SL, MVT::i32));
1648 
1649   return Exp;
1650 }
1651 
1652 SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
1653   SDLoc SL(Op);
1654   SDValue Src = Op.getOperand(0);
1655 
1656   assert(Op.getValueType() == MVT::f64);
1657 
1658   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1659   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1660 
1661   SDValue VecSrc = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
1662 
1663   // Extract the upper half, since this is where we will find the sign and
1664   // exponent.
1665   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecSrc, One);
1666 
1667   SDValue Exp = extractF64Exponent(Hi, SL, DAG);
1668 
1669   const unsigned FractBits = 52;
1670 
1671   // Extract the sign bit.
1672   const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
1673   SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
1674 
1675   // Extend back to to 64-bits.
1676   SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
1677   SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
1678 
1679   SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
1680   const SDValue FractMask
1681     = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
1682 
1683   SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
1684   SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
1685   SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
1686 
1687   EVT SetCCVT =
1688       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
1689 
1690   const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
1691 
1692   SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
1693   SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
1694 
1695   SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
1696   SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
1697 
1698   return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
1699 }
1700 
1701 SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
1702   SDLoc SL(Op);
1703   SDValue Src = Op.getOperand(0);
1704 
1705   assert(Op.getValueType() == MVT::f64);
1706 
1707   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1708   SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
1709   SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
1710 
1711   // TODO: Should this propagate fast-math-flags?
1712 
1713   SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
1714   SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
1715 
1716   SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
1717 
1718   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1719   SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
1720 
1721   EVT SetCCVT =
1722       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
1723   SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
1724 
1725   return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
1726 }
1727 
1728 SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const {
1729   // FNEARBYINT and FRINT are the same, except in their handling of FP
1730   // exceptions. Those aren't really meaningful for us, and OpenCL only has
1731   // rint, so just treat them as equivalent.
1732   return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0));
1733 }
1734 
1735 // XXX - May require not supporting f32 denormals?
1736 SDValue AMDGPUTargetLowering::LowerFROUND32(SDValue Op, SelectionDAG &DAG) const {
1737   SDLoc SL(Op);
1738   SDValue X = Op.getOperand(0);
1739 
1740   SDValue T = DAG.getNode(ISD::FTRUNC, SL, MVT::f32, X);
1741 
1742   // TODO: Should this propagate fast-math-flags?
1743 
1744   SDValue Diff = DAG.getNode(ISD::FSUB, SL, MVT::f32, X, T);
1745 
1746   SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, MVT::f32, Diff);
1747 
1748   const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f32);
1749   const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
1750   const SDValue Half = DAG.getConstantFP(0.5, SL, MVT::f32);
1751 
1752   SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f32, One, X);
1753 
1754   EVT SetCCVT =
1755       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
1756 
1757   SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
1758 
1759   SDValue Sel = DAG.getNode(ISD::SELECT, SL, MVT::f32, Cmp, SignOne, Zero);
1760 
1761   return DAG.getNode(ISD::FADD, SL, MVT::f32, T, Sel);
1762 }
1763 
1764 SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const {
1765   SDLoc SL(Op);
1766   SDValue X = Op.getOperand(0);
1767 
1768   SDValue L = DAG.getNode(ISD::BITCAST, SL, MVT::i64, X);
1769 
1770   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1771   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1772   const SDValue NegOne = DAG.getConstant(-1, SL, MVT::i32);
1773   const SDValue FiftyOne = DAG.getConstant(51, SL, MVT::i32);
1774   EVT SetCCVT =
1775       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
1776 
1777   SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
1778 
1779   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, One);
1780 
1781   SDValue Exp = extractF64Exponent(Hi, SL, DAG);
1782 
1783   const SDValue Mask = DAG.getConstant(INT64_C(0x000fffffffffffff), SL,
1784                                        MVT::i64);
1785 
1786   SDValue M = DAG.getNode(ISD::SRA, SL, MVT::i64, Mask, Exp);
1787   SDValue D = DAG.getNode(ISD::SRA, SL, MVT::i64,
1788                           DAG.getConstant(INT64_C(0x0008000000000000), SL,
1789                                           MVT::i64),
1790                           Exp);
1791 
1792   SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, L, M);
1793   SDValue Tmp1 = DAG.getSetCC(SL, SetCCVT,
1794                               DAG.getConstant(0, SL, MVT::i64), Tmp0,
1795                               ISD::SETNE);
1796 
1797   SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, Tmp1,
1798                              D, DAG.getConstant(0, SL, MVT::i64));
1799   SDValue K = DAG.getNode(ISD::ADD, SL, MVT::i64, L, Tmp2);
1800 
1801   K = DAG.getNode(ISD::AND, SL, MVT::i64, K, DAG.getNOT(SL, M, MVT::i64));
1802   K = DAG.getNode(ISD::BITCAST, SL, MVT::f64, K);
1803 
1804   SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
1805   SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
1806   SDValue ExpEqNegOne = DAG.getSetCC(SL, SetCCVT, NegOne, Exp, ISD::SETEQ);
1807 
1808   SDValue Mag = DAG.getNode(ISD::SELECT, SL, MVT::f64,
1809                             ExpEqNegOne,
1810                             DAG.getConstantFP(1.0, SL, MVT::f64),
1811                             DAG.getConstantFP(0.0, SL, MVT::f64));
1812 
1813   SDValue S = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, Mag, X);
1814 
1815   K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpLt0, S, K);
1816   K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpGt51, X, K);
1817 
1818   return K;
1819 }
1820 
1821 SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
1822   EVT VT = Op.getValueType();
1823 
1824   if (VT == MVT::f32)
1825     return LowerFROUND32(Op, DAG);
1826 
1827   if (VT == MVT::f64)
1828     return LowerFROUND64(Op, DAG);
1829 
1830   llvm_unreachable("unhandled type");
1831 }
1832 
1833 SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
1834   SDLoc SL(Op);
1835   SDValue Src = Op.getOperand(0);
1836 
1837   // result = trunc(src);
1838   // if (src < 0.0 && src != result)
1839   //   result += -1.0.
1840 
1841   SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
1842 
1843   const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
1844   const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
1845 
1846   EVT SetCCVT =
1847       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
1848 
1849   SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
1850   SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
1851   SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
1852 
1853   SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
1854   // TODO: Should this propagate fast-math-flags?
1855   return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
1856 }
1857 
1858 SDValue AMDGPUTargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const {
1859   SDLoc SL(Op);
1860   SDValue Src = Op.getOperand(0);
1861   bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF;
1862 
1863   if (ZeroUndef && Src.getValueType() == MVT::i32)
1864     return DAG.getNode(AMDGPUISD::FFBH_U32, SL, MVT::i32, Src);
1865 
1866   SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
1867 
1868   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1869   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1870 
1871   SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1872   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1873 
1874   EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
1875                                    *DAG.getContext(), MVT::i32);
1876 
1877   SDValue Hi0 = DAG.getSetCC(SL, SetCCVT, Hi, Zero, ISD::SETEQ);
1878 
1879   SDValue CtlzLo = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i32, Lo);
1880   SDValue CtlzHi = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i32, Hi);
1881 
1882   const SDValue Bits32 = DAG.getConstant(32, SL, MVT::i32);
1883   SDValue Add = DAG.getNode(ISD::ADD, SL, MVT::i32, CtlzLo, Bits32);
1884 
1885   // ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x))
1886   SDValue NewCtlz = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0, Add, CtlzHi);
1887 
1888   if (!ZeroUndef) {
1889     // Test if the full 64-bit input is zero.
1890 
1891     // FIXME: DAG combines turn what should be an s_and_b64 into a v_or_b32,
1892     // which we probably don't want.
1893     SDValue Lo0 = DAG.getSetCC(SL, SetCCVT, Lo, Zero, ISD::SETEQ);
1894     SDValue SrcIsZero = DAG.getNode(ISD::AND, SL, SetCCVT, Lo0, Hi0);
1895 
1896     // TODO: If i64 setcc is half rate, it can result in 1 fewer instruction
1897     // with the same cycles, otherwise it is slower.
1898     // SDValue SrcIsZero = DAG.getSetCC(SL, SetCCVT, Src,
1899     // DAG.getConstant(0, SL, MVT::i64), ISD::SETEQ);
1900 
1901     const SDValue Bits32 = DAG.getConstant(64, SL, MVT::i32);
1902 
1903     // The instruction returns -1 for 0 input, but the defined intrinsic
1904     // behavior is to return the number of bits.
1905     NewCtlz = DAG.getNode(ISD::SELECT, SL, MVT::i32,
1906                           SrcIsZero, Bits32, NewCtlz);
1907   }
1908 
1909   return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewCtlz);
1910 }
1911 
1912 SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
1913                                                bool Signed) const {
1914   // Unsigned
1915   // cul2f(ulong u)
1916   //{
1917   //  uint lz = clz(u);
1918   //  uint e = (u != 0) ? 127U + 63U - lz : 0;
1919   //  u = (u << lz) & 0x7fffffffffffffffUL;
1920   //  ulong t = u & 0xffffffffffUL;
1921   //  uint v = (e << 23) | (uint)(u >> 40);
1922   //  uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U);
1923   //  return as_float(v + r);
1924   //}
1925   // Signed
1926   // cl2f(long l)
1927   //{
1928   //  long s = l >> 63;
1929   //  float r = cul2f((l + s) ^ s);
1930   //  return s ? -r : r;
1931   //}
1932 
1933   SDLoc SL(Op);
1934   SDValue Src = Op.getOperand(0);
1935   SDValue L = Src;
1936 
1937   SDValue S;
1938   if (Signed) {
1939     const SDValue SignBit = DAG.getConstant(63, SL, MVT::i64);
1940     S = DAG.getNode(ISD::SRA, SL, MVT::i64, L, SignBit);
1941 
1942     SDValue LPlusS = DAG.getNode(ISD::ADD, SL, MVT::i64, L, S);
1943     L = DAG.getNode(ISD::XOR, SL, MVT::i64, LPlusS, S);
1944   }
1945 
1946   EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
1947                                    *DAG.getContext(), MVT::f32);
1948 
1949 
1950   SDValue ZeroI32 = DAG.getConstant(0, SL, MVT::i32);
1951   SDValue ZeroI64 = DAG.getConstant(0, SL, MVT::i64);
1952   SDValue LZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i64, L);
1953   LZ = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LZ);
1954 
1955   SDValue K = DAG.getConstant(127U + 63U, SL, MVT::i32);
1956   SDValue E = DAG.getSelect(SL, MVT::i32,
1957     DAG.getSetCC(SL, SetCCVT, L, ZeroI64, ISD::SETNE),
1958     DAG.getNode(ISD::SUB, SL, MVT::i32, K, LZ),
1959     ZeroI32);
1960 
1961   SDValue U = DAG.getNode(ISD::AND, SL, MVT::i64,
1962     DAG.getNode(ISD::SHL, SL, MVT::i64, L, LZ),
1963     DAG.getConstant((-1ULL) >> 1, SL, MVT::i64));
1964 
1965   SDValue T = DAG.getNode(ISD::AND, SL, MVT::i64, U,
1966                           DAG.getConstant(0xffffffffffULL, SL, MVT::i64));
1967 
1968   SDValue UShl = DAG.getNode(ISD::SRL, SL, MVT::i64,
1969                              U, DAG.getConstant(40, SL, MVT::i64));
1970 
1971   SDValue V = DAG.getNode(ISD::OR, SL, MVT::i32,
1972     DAG.getNode(ISD::SHL, SL, MVT::i32, E, DAG.getConstant(23, SL, MVT::i32)),
1973     DAG.getNode(ISD::TRUNCATE, SL, MVT::i32,  UShl));
1974 
1975   SDValue C = DAG.getConstant(0x8000000000ULL, SL, MVT::i64);
1976   SDValue RCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETUGT);
1977   SDValue TCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETEQ);
1978 
1979   SDValue One = DAG.getConstant(1, SL, MVT::i32);
1980 
1981   SDValue VTrunc1 = DAG.getNode(ISD::AND, SL, MVT::i32, V, One);
1982 
1983   SDValue R = DAG.getSelect(SL, MVT::i32,
1984     RCmp,
1985     One,
1986     DAG.getSelect(SL, MVT::i32, TCmp, VTrunc1, ZeroI32));
1987   R = DAG.getNode(ISD::ADD, SL, MVT::i32, V, R);
1988   R = DAG.getNode(ISD::BITCAST, SL, MVT::f32, R);
1989 
1990   if (!Signed)
1991     return R;
1992 
1993   SDValue RNeg = DAG.getNode(ISD::FNEG, SL, MVT::f32, R);
1994   return DAG.getSelect(SL, MVT::f32, DAG.getSExtOrTrunc(S, SL, SetCCVT), RNeg, R);
1995 }
1996 
1997 SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
1998                                                bool Signed) const {
1999   SDLoc SL(Op);
2000   SDValue Src = Op.getOperand(0);
2001 
2002   SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
2003 
2004   SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC,
2005                            DAG.getConstant(0, SL, MVT::i32));
2006   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC,
2007                            DAG.getConstant(1, SL, MVT::i32));
2008 
2009   SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
2010                               SL, MVT::f64, Hi);
2011 
2012   SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
2013 
2014   SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi,
2015                               DAG.getConstant(32, SL, MVT::i32));
2016   // TODO: Should this propagate fast-math-flags?
2017   return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
2018 }
2019 
2020 SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
2021                                                SelectionDAG &DAG) const {
2022   assert(Op.getOperand(0).getValueType() == MVT::i64 &&
2023          "operation should be legal");
2024 
2025   // TODO: Factor out code common with LowerSINT_TO_FP.
2026 
2027   EVT DestVT = Op.getValueType();
2028   if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
2029     SDLoc DL(Op);
2030     SDValue Src = Op.getOperand(0);
2031 
2032     SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
2033     SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
2034     SDValue FPRound =
2035         DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
2036 
2037     return FPRound;
2038   }
2039 
2040   if (DestVT == MVT::f32)
2041     return LowerINT_TO_FP32(Op, DAG, false);
2042 
2043   assert(DestVT == MVT::f64);
2044   return LowerINT_TO_FP64(Op, DAG, false);
2045 }
2046 
2047 SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
2048                                               SelectionDAG &DAG) const {
2049   assert(Op.getOperand(0).getValueType() == MVT::i64 &&
2050          "operation should be legal");
2051 
2052   // TODO: Factor out code common with LowerUINT_TO_FP.
2053 
2054   EVT DestVT = Op.getValueType();
2055   if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
2056     SDLoc DL(Op);
2057     SDValue Src = Op.getOperand(0);
2058 
2059     SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
2060     SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
2061     SDValue FPRound =
2062         DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
2063 
2064     return FPRound;
2065   }
2066 
2067   if (DestVT == MVT::f32)
2068     return LowerINT_TO_FP32(Op, DAG, true);
2069 
2070   assert(DestVT == MVT::f64);
2071   return LowerINT_TO_FP64(Op, DAG, true);
2072 }
2073 
2074 SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG,
2075                                                bool Signed) const {
2076   SDLoc SL(Op);
2077 
2078   SDValue Src = Op.getOperand(0);
2079 
2080   SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2081 
2082   SDValue K0 = DAG.getConstantFP(BitsToDouble(UINT64_C(0x3df0000000000000)), SL,
2083                                  MVT::f64);
2084   SDValue K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(0xc1f0000000000000)), SL,
2085                                  MVT::f64);
2086   // TODO: Should this propagate fast-math-flags?
2087   SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, Trunc, K0);
2088 
2089   SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, MVT::f64, Mul);
2090 
2091 
2092   SDValue Fma = DAG.getNode(ISD::FMA, SL, MVT::f64, FloorMul, K1, Trunc);
2093 
2094   SDValue Hi = DAG.getNode(Signed ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, SL,
2095                            MVT::i32, FloorMul);
2096   SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
2097 
2098   SDValue Result = DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi});
2099 
2100   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Result);
2101 }
2102 
2103 SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
2104   SDLoc DL(Op);
2105   SDValue N0 = Op.getOperand(0);
2106 
2107   // Convert to target node to get known bits
2108   if (N0.getValueType() == MVT::f32)
2109     return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
2110 
2111   if (getTargetMachine().Options.UnsafeFPMath) {
2112     // There is a generic expand for FP_TO_FP16 with unsafe fast math.
2113     return SDValue();
2114   }
2115 
2116   assert(N0.getSimpleValueType() == MVT::f64);
2117 
2118   // f64 -> f16 conversion using round-to-nearest-even rounding mode.
2119   const unsigned ExpMask = 0x7ff;
2120   const unsigned ExpBiasf64 = 1023;
2121   const unsigned ExpBiasf16 = 15;
2122   SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
2123   SDValue One = DAG.getConstant(1, DL, MVT::i32);
2124   SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
2125   SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
2126                            DAG.getConstant(32, DL, MVT::i64));
2127   UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
2128   U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
2129   SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2130                           DAG.getConstant(20, DL, MVT::i64));
2131   E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
2132                   DAG.getConstant(ExpMask, DL, MVT::i32));
2133   // Subtract the fp64 exponent bias (1023) to get the real exponent and
2134   // add the f16 bias (15) to get the biased exponent for the f16 format.
2135   E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
2136                   DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
2137 
2138   SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2139                           DAG.getConstant(8, DL, MVT::i32));
2140   M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
2141                   DAG.getConstant(0xffe, DL, MVT::i32));
2142 
2143   SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
2144                                   DAG.getConstant(0x1ff, DL, MVT::i32));
2145   MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
2146 
2147   SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
2148   M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
2149 
2150   // (M != 0 ? 0x0200 : 0) | 0x7c00;
2151   SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
2152       DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
2153                       Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
2154 
2155   // N = M | (E << 12);
2156   SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2157       DAG.getNode(ISD::SHL, DL, MVT::i32, E,
2158                   DAG.getConstant(12, DL, MVT::i32)));
2159 
2160   // B = clamp(1-E, 0, 13);
2161   SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
2162                                   One, E);
2163   SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
2164   B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
2165                   DAG.getConstant(13, DL, MVT::i32));
2166 
2167   SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2168                                    DAG.getConstant(0x1000, DL, MVT::i32));
2169 
2170   SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
2171   SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
2172   SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
2173   D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
2174 
2175   SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
2176   SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
2177                               DAG.getConstant(0x7, DL, MVT::i32));
2178   V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
2179                   DAG.getConstant(2, DL, MVT::i32));
2180   SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
2181                                One, Zero, ISD::SETEQ);
2182   SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
2183                                One, Zero, ISD::SETGT);
2184   V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
2185   V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
2186 
2187   V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
2188                       DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
2189   V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
2190                       I, V, ISD::SETEQ);
2191 
2192   // Extract the sign bit.
2193   SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2194                             DAG.getConstant(16, DL, MVT::i32));
2195   Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
2196                      DAG.getConstant(0x8000, DL, MVT::i32));
2197 
2198   V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
2199   return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
2200 }
2201 
2202 SDValue AMDGPUTargetLowering::LowerFP_TO_SINT(SDValue Op,
2203                                               SelectionDAG &DAG) const {
2204   SDValue Src = Op.getOperand(0);
2205 
2206   // TODO: Factor out code common with LowerFP_TO_UINT.
2207 
2208   EVT SrcVT = Src.getValueType();
2209   if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) {
2210     SDLoc DL(Op);
2211 
2212     SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
2213     SDValue FpToInt32 =
2214         DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend);
2215 
2216     return FpToInt32;
2217   }
2218 
2219   if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
2220     return LowerFP64_TO_INT(Op, DAG, true);
2221 
2222   return SDValue();
2223 }
2224 
2225 SDValue AMDGPUTargetLowering::LowerFP_TO_UINT(SDValue Op,
2226                                               SelectionDAG &DAG) const {
2227   SDValue Src = Op.getOperand(0);
2228 
2229   // TODO: Factor out code common with LowerFP_TO_SINT.
2230 
2231   EVT SrcVT = Src.getValueType();
2232   if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) {
2233     SDLoc DL(Op);
2234 
2235     SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
2236     SDValue FpToInt32 =
2237         DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend);
2238 
2239     return FpToInt32;
2240   }
2241 
2242   if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
2243     return LowerFP64_TO_INT(Op, DAG, false);
2244 
2245   return SDValue();
2246 }
2247 
2248 SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
2249                                                      SelectionDAG &DAG) const {
2250   EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
2251   MVT VT = Op.getSimpleValueType();
2252   MVT ScalarVT = VT.getScalarType();
2253 
2254   assert(VT.isVector());
2255 
2256   SDValue Src = Op.getOperand(0);
2257   SDLoc DL(Op);
2258 
2259   // TODO: Don't scalarize on Evergreen?
2260   unsigned NElts = VT.getVectorNumElements();
2261   SmallVector<SDValue, 8> Args;
2262   DAG.ExtractVectorElements(Src, Args, 0, NElts);
2263 
2264   SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
2265   for (unsigned I = 0; I < NElts; ++I)
2266     Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
2267 
2268   return DAG.getBuildVector(VT, DL, Args);
2269 }
2270 
2271 //===----------------------------------------------------------------------===//
2272 // Custom DAG optimizations
2273 //===----------------------------------------------------------------------===//
2274 
2275 static bool isU24(SDValue Op, SelectionDAG &DAG) {
2276   APInt KnownZero, KnownOne;
2277   EVT VT = Op.getValueType();
2278   DAG.computeKnownBits(Op, KnownZero, KnownOne);
2279 
2280   return (VT.getSizeInBits() - KnownZero.countLeadingOnes()) <= 24;
2281 }
2282 
2283 static bool isI24(SDValue Op, SelectionDAG &DAG) {
2284   EVT VT = Op.getValueType();
2285 
2286   // In order for this to be a signed 24-bit value, bit 23, must
2287   // be a sign bit.
2288   return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
2289                                      // as unsigned 24-bit values.
2290          (VT.getSizeInBits() - DAG.ComputeNumSignBits(Op)) < 24;
2291 }
2292 
2293 static bool simplifyI24(SDNode *Node24, unsigned OpIdx,
2294                         TargetLowering::DAGCombinerInfo &DCI) {
2295 
2296   SelectionDAG &DAG = DCI.DAG;
2297   SDValue Op = Node24->getOperand(OpIdx);
2298   EVT VT = Op.getValueType();
2299 
2300   APInt Demanded = APInt::getLowBitsSet(VT.getSizeInBits(), 24);
2301   APInt KnownZero, KnownOne;
2302   TargetLowering::TargetLoweringOpt TLO(DAG, true, true);
2303   if (TLO.SimplifyDemandedBits(Node24, OpIdx, Demanded, DCI))
2304     return true;
2305 
2306   return false;
2307 }
2308 
2309 template <typename IntTy>
2310 static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
2311                                uint32_t Width, const SDLoc &DL) {
2312   if (Width + Offset < 32) {
2313     uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
2314     IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
2315     return DAG.getConstant(Result, DL, MVT::i32);
2316   }
2317 
2318   return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
2319 }
2320 
2321 static bool hasVolatileUser(SDNode *Val) {
2322   for (SDNode *U : Val->uses()) {
2323     if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
2324       if (M->isVolatile())
2325         return true;
2326     }
2327   }
2328 
2329   return false;
2330 }
2331 
2332 bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
2333   // i32 vectors are the canonical memory type.
2334   if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
2335     return false;
2336 
2337   if (!VT.isByteSized())
2338     return false;
2339 
2340   unsigned Size = VT.getStoreSize();
2341 
2342   if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
2343     return false;
2344 
2345   if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
2346     return false;
2347 
2348   return true;
2349 }
2350 
2351 // Replace load of an illegal type with a store of a bitcast to a friendlier
2352 // type.
2353 SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
2354                                                  DAGCombinerInfo &DCI) const {
2355   if (!DCI.isBeforeLegalize())
2356     return SDValue();
2357 
2358   LoadSDNode *LN = cast<LoadSDNode>(N);
2359   if (LN->isVolatile() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
2360     return SDValue();
2361 
2362   SDLoc SL(N);
2363   SelectionDAG &DAG = DCI.DAG;
2364   EVT VT = LN->getMemoryVT();
2365 
2366   unsigned Size = VT.getStoreSize();
2367   unsigned Align = LN->getAlignment();
2368   if (Align < Size && isTypeLegal(VT)) {
2369     bool IsFast;
2370     unsigned AS = LN->getAddressSpace();
2371 
2372     // Expand unaligned loads earlier than legalization. Due to visitation order
2373     // problems during legalization, the emitted instructions to pack and unpack
2374     // the bytes again are not eliminated in the case of an unaligned copy.
2375     if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)) {
2376       if (VT.isVector())
2377         return scalarizeVectorLoad(LN, DAG);
2378 
2379       SDValue Ops[2];
2380       std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
2381       return DAG.getMergeValues(Ops, SDLoc(N));
2382     }
2383 
2384     if (!IsFast)
2385       return SDValue();
2386   }
2387 
2388   if (!shouldCombineMemoryType(VT))
2389     return SDValue();
2390 
2391   EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
2392 
2393   SDValue NewLoad
2394     = DAG.getLoad(NewVT, SL, LN->getChain(),
2395                   LN->getBasePtr(), LN->getMemOperand());
2396 
2397   SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
2398   DCI.CombineTo(N, BC, NewLoad.getValue(1));
2399   return SDValue(N, 0);
2400 }
2401 
2402 // Replace store of an illegal type with a store of a bitcast to a friendlier
2403 // type.
2404 SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
2405                                                   DAGCombinerInfo &DCI) const {
2406   if (!DCI.isBeforeLegalize())
2407     return SDValue();
2408 
2409   StoreSDNode *SN = cast<StoreSDNode>(N);
2410   if (SN->isVolatile() || !ISD::isNormalStore(SN))
2411     return SDValue();
2412 
2413   EVT VT = SN->getMemoryVT();
2414   unsigned Size = VT.getStoreSize();
2415 
2416   SDLoc SL(N);
2417   SelectionDAG &DAG = DCI.DAG;
2418   unsigned Align = SN->getAlignment();
2419   if (Align < Size && isTypeLegal(VT)) {
2420     bool IsFast;
2421     unsigned AS = SN->getAddressSpace();
2422 
2423     // Expand unaligned stores earlier than legalization. Due to visitation
2424     // order problems during legalization, the emitted instructions to pack and
2425     // unpack the bytes again are not eliminated in the case of an unaligned
2426     // copy.
2427     if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)) {
2428       if (VT.isVector())
2429         return scalarizeVectorStore(SN, DAG);
2430 
2431       return expandUnalignedStore(SN, DAG);
2432     }
2433 
2434     if (!IsFast)
2435       return SDValue();
2436   }
2437 
2438   if (!shouldCombineMemoryType(VT))
2439     return SDValue();
2440 
2441   EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
2442   SDValue Val = SN->getValue();
2443 
2444   //DCI.AddToWorklist(Val.getNode());
2445 
2446   bool OtherUses = !Val.hasOneUse();
2447   SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
2448   if (OtherUses) {
2449     SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
2450     DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
2451   }
2452 
2453   return DAG.getStore(SN->getChain(), SL, CastVal,
2454                       SN->getBasePtr(), SN->getMemOperand());
2455 }
2456 
2457 SDValue AMDGPUTargetLowering::performClampCombine(SDNode *N,
2458                                                   DAGCombinerInfo &DCI) const {
2459   ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
2460   if (!CSrc)
2461     return SDValue();
2462 
2463   const APFloat &F = CSrc->getValueAPF();
2464   APFloat Zero = APFloat::getZero(F.getSemantics());
2465   APFloat::cmpResult Cmp0 = F.compare(Zero);
2466   if (Cmp0 == APFloat::cmpLessThan ||
2467       (Cmp0 == APFloat::cmpUnordered && Subtarget->enableDX10Clamp())) {
2468     return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
2469   }
2470 
2471   APFloat One(F.getSemantics(), "1.0");
2472   APFloat::cmpResult Cmp1 = F.compare(One);
2473   if (Cmp1 == APFloat::cmpGreaterThan)
2474     return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
2475 
2476   return SDValue(CSrc, 0);
2477 }
2478 
2479 /// Split the 64-bit value \p LHS into two 32-bit components, and perform the
2480 /// binary operation \p Opc to it with the corresponding constant operands.
2481 SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
2482   DAGCombinerInfo &DCI, const SDLoc &SL,
2483   unsigned Opc, SDValue LHS,
2484   uint32_t ValLo, uint32_t ValHi) const {
2485   SelectionDAG &DAG = DCI.DAG;
2486   SDValue Lo, Hi;
2487   std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
2488 
2489   SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
2490   SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
2491 
2492   SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
2493   SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
2494 
2495   // Re-visit the ands. It's possible we eliminated one of them and it could
2496   // simplify the vector.
2497   DCI.AddToWorklist(Lo.getNode());
2498   DCI.AddToWorklist(Hi.getNode());
2499 
2500   SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
2501   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
2502 }
2503 
2504 SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
2505                                                 DAGCombinerInfo &DCI) const {
2506   if (N->getValueType(0) != MVT::i64)
2507     return SDValue();
2508 
2509   // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
2510 
2511   // On some subtargets, 64-bit shift is a quarter rate instruction. In the
2512   // common case, splitting this into a move and a 32-bit shift is faster and
2513   // the same code size.
2514   const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
2515   if (!RHS)
2516     return SDValue();
2517 
2518   unsigned RHSVal = RHS->getZExtValue();
2519   if (RHSVal < 32)
2520     return SDValue();
2521 
2522   SDValue LHS = N->getOperand(0);
2523 
2524   SDLoc SL(N);
2525   SelectionDAG &DAG = DCI.DAG;
2526 
2527   SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
2528 
2529   SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
2530   SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
2531 
2532   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2533 
2534   SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
2535   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
2536 }
2537 
2538 SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
2539                                                 DAGCombinerInfo &DCI) const {
2540   if (N->getValueType(0) != MVT::i64)
2541     return SDValue();
2542 
2543   const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
2544   if (!RHS)
2545     return SDValue();
2546 
2547   SelectionDAG &DAG = DCI.DAG;
2548   SDLoc SL(N);
2549   unsigned RHSVal = RHS->getZExtValue();
2550 
2551   // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
2552   if (RHSVal == 32) {
2553     SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
2554     SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
2555                                    DAG.getConstant(31, SL, MVT::i32));
2556 
2557     SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
2558     return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
2559   }
2560 
2561   // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
2562   if (RHSVal == 63) {
2563     SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
2564     SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
2565                                    DAG.getConstant(31, SL, MVT::i32));
2566     SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
2567     return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
2568   }
2569 
2570   return SDValue();
2571 }
2572 
2573 SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
2574                                                 DAGCombinerInfo &DCI) const {
2575   if (N->getValueType(0) != MVT::i64)
2576     return SDValue();
2577 
2578   const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
2579   if (!RHS)
2580     return SDValue();
2581 
2582   unsigned ShiftAmt = RHS->getZExtValue();
2583   if (ShiftAmt < 32)
2584     return SDValue();
2585 
2586   // srl i64:x, C for C >= 32
2587   // =>
2588   //   build_pair (srl hi_32(x), C - 32), 0
2589 
2590   SelectionDAG &DAG = DCI.DAG;
2591   SDLoc SL(N);
2592 
2593   SDValue One = DAG.getConstant(1, SL, MVT::i32);
2594   SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2595 
2596   SDValue VecOp = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, N->getOperand(0));
2597   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
2598                            VecOp, One);
2599 
2600   SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
2601   SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
2602 
2603   SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
2604 
2605   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
2606 }
2607 
2608 // We need to specifically handle i64 mul here to avoid unnecessary conversion
2609 // instructions. If we only match on the legalized i64 mul expansion,
2610 // SimplifyDemandedBits will be unable to remove them because there will be
2611 // multiple uses due to the separate mul + mulh[su].
2612 static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
2613                         SDValue N0, SDValue N1, unsigned Size, bool Signed) {
2614   if (Size <= 32) {
2615     unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
2616     return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
2617   }
2618 
2619   // Because we want to eliminate extension instructions before the
2620   // operation, we need to create a single user here (i.e. not the separate
2621   // mul_lo + mul_hi) so that SimplifyDemandedBits will deal with it.
2622 
2623   unsigned MulOpc = Signed ? AMDGPUISD::MUL_LOHI_I24 : AMDGPUISD::MUL_LOHI_U24;
2624 
2625   SDValue Mul = DAG.getNode(MulOpc, SL,
2626                             DAG.getVTList(MVT::i32, MVT::i32), N0, N1);
2627 
2628   return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64,
2629                      Mul.getValue(0), Mul.getValue(1));
2630 }
2631 
2632 SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
2633                                                 DAGCombinerInfo &DCI) const {
2634   EVT VT = N->getValueType(0);
2635 
2636   unsigned Size = VT.getSizeInBits();
2637   if (VT.isVector() || Size > 64)
2638     return SDValue();
2639 
2640   // There are i16 integer mul/mad.
2641   if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
2642     return SDValue();
2643 
2644   SelectionDAG &DAG = DCI.DAG;
2645   SDLoc DL(N);
2646 
2647   SDValue N0 = N->getOperand(0);
2648   SDValue N1 = N->getOperand(1);
2649   SDValue Mul;
2650 
2651   if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
2652     N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
2653     N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
2654     Mul = getMul24(DAG, DL, N0, N1, Size, false);
2655   } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
2656     N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
2657     N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
2658     Mul = getMul24(DAG, DL, N0, N1, Size, true);
2659   } else {
2660     return SDValue();
2661   }
2662 
2663   // We need to use sext even for MUL_U24, because MUL_U24 is used
2664   // for signed multiply of 8 and 16-bit types.
2665   return DAG.getSExtOrTrunc(Mul, DL, VT);
2666 }
2667 
2668 SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
2669                                                   DAGCombinerInfo &DCI) const {
2670   EVT VT = N->getValueType(0);
2671 
2672   if (!Subtarget->hasMulI24() || VT.isVector())
2673     return SDValue();
2674 
2675   SelectionDAG &DAG = DCI.DAG;
2676   SDLoc DL(N);
2677 
2678   SDValue N0 = N->getOperand(0);
2679   SDValue N1 = N->getOperand(1);
2680 
2681   if (!isI24(N0, DAG) || !isI24(N1, DAG))
2682     return SDValue();
2683 
2684   N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
2685   N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
2686 
2687   SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
2688   DCI.AddToWorklist(Mulhi.getNode());
2689   return DAG.getSExtOrTrunc(Mulhi, DL, VT);
2690 }
2691 
2692 SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
2693                                                   DAGCombinerInfo &DCI) const {
2694   EVT VT = N->getValueType(0);
2695 
2696   if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
2697     return SDValue();
2698 
2699   SelectionDAG &DAG = DCI.DAG;
2700   SDLoc DL(N);
2701 
2702   SDValue N0 = N->getOperand(0);
2703   SDValue N1 = N->getOperand(1);
2704 
2705   if (!isU24(N0, DAG) || !isU24(N1, DAG))
2706     return SDValue();
2707 
2708   N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
2709   N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
2710 
2711   SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
2712   DCI.AddToWorklist(Mulhi.getNode());
2713   return DAG.getZExtOrTrunc(Mulhi, DL, VT);
2714 }
2715 
2716 SDValue AMDGPUTargetLowering::performMulLoHi24Combine(
2717   SDNode *N, DAGCombinerInfo &DCI) const {
2718   SelectionDAG &DAG = DCI.DAG;
2719 
2720   // Simplify demanded bits before splitting into multiple users.
2721   if (simplifyI24(N, 0, DCI) || simplifyI24(N, 1, DCI))
2722     return SDValue();
2723 
2724   SDValue N0 = N->getOperand(0);
2725   SDValue N1 = N->getOperand(1);
2726 
2727   bool Signed = (N->getOpcode() == AMDGPUISD::MUL_LOHI_I24);
2728 
2729   unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
2730   unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
2731 
2732   SDLoc SL(N);
2733 
2734   SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
2735   SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
2736   return DAG.getMergeValues({ MulLo, MulHi }, SL);
2737 }
2738 
2739 static bool isNegativeOne(SDValue Val) {
2740   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val))
2741     return C->isAllOnesValue();
2742   return false;
2743 }
2744 
2745 static bool isCtlzOpc(unsigned Opc) {
2746   return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
2747 }
2748 
2749 SDValue AMDGPUTargetLowering::getFFBH_U32(SelectionDAG &DAG,
2750                                           SDValue Op,
2751                                           const SDLoc &DL) const {
2752   EVT VT = Op.getValueType();
2753   EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
2754   if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
2755                               LegalVT != MVT::i16))
2756     return SDValue();
2757 
2758   if (VT != MVT::i32)
2759     Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
2760 
2761   SDValue FFBH = DAG.getNode(AMDGPUISD::FFBH_U32, DL, MVT::i32, Op);
2762   if (VT != MVT::i32)
2763     FFBH = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBH);
2764 
2765   return FFBH;
2766 }
2767 
2768 // The native instructions return -1 on 0 input. Optimize out a select that
2769 // produces -1 on 0.
2770 //
2771 // TODO: If zero is not undef, we could also do this if the output is compared
2772 // against the bitwidth.
2773 //
2774 // TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
2775 SDValue AMDGPUTargetLowering::performCtlzCombine(const SDLoc &SL, SDValue Cond,
2776                                                  SDValue LHS, SDValue RHS,
2777                                                  DAGCombinerInfo &DCI) const {
2778   ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
2779   if (!CmpRhs || !CmpRhs->isNullValue())
2780     return SDValue();
2781 
2782   SelectionDAG &DAG = DCI.DAG;
2783   ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
2784   SDValue CmpLHS = Cond.getOperand(0);
2785 
2786   // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
2787   if (CCOpcode == ISD::SETEQ &&
2788       isCtlzOpc(RHS.getOpcode()) &&
2789       RHS.getOperand(0) == CmpLHS &&
2790       isNegativeOne(LHS)) {
2791     return getFFBH_U32(DAG, CmpLHS, SL);
2792   }
2793 
2794   // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
2795   if (CCOpcode == ISD::SETNE &&
2796       isCtlzOpc(LHS.getOpcode()) &&
2797       LHS.getOperand(0) == CmpLHS &&
2798       isNegativeOne(RHS)) {
2799     return getFFBH_U32(DAG, CmpLHS, SL);
2800   }
2801 
2802   return SDValue();
2803 }
2804 
2805 static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
2806                                          unsigned Op,
2807                                          const SDLoc &SL,
2808                                          SDValue Cond,
2809                                          SDValue N1,
2810                                          SDValue N2) {
2811   SelectionDAG &DAG = DCI.DAG;
2812   EVT VT = N1.getValueType();
2813 
2814   SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
2815                                   N1.getOperand(0), N2.getOperand(0));
2816   DCI.AddToWorklist(NewSelect.getNode());
2817   return DAG.getNode(Op, SL, VT, NewSelect);
2818 }
2819 
2820 // Pull a free FP operation out of a select so it may fold into uses.
2821 //
2822 // select c, (fneg x), (fneg y) -> fneg (select c, x, y)
2823 // select c, (fneg x), k -> fneg (select c, x, (fneg k))
2824 //
2825 // select c, (fabs x), (fabs y) -> fabs (select c, x, y)
2826 // select c, (fabs x), +k -> fabs (select c, x, k)
2827 static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
2828                                     SDValue N) {
2829   SelectionDAG &DAG = DCI.DAG;
2830   SDValue Cond = N.getOperand(0);
2831   SDValue LHS = N.getOperand(1);
2832   SDValue RHS = N.getOperand(2);
2833 
2834   EVT VT = N.getValueType();
2835   if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
2836       (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
2837     return distributeOpThroughSelect(DCI, LHS.getOpcode(),
2838                                      SDLoc(N), Cond, LHS, RHS);
2839   }
2840 
2841   bool Inv = false;
2842   if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
2843     std::swap(LHS, RHS);
2844     Inv = true;
2845   }
2846 
2847   // TODO: Support vector constants.
2848   ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
2849   if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS) {
2850     SDLoc SL(N);
2851     // If one side is an fneg/fabs and the other is a constant, we can push the
2852     // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
2853     SDValue NewLHS = LHS.getOperand(0);
2854     SDValue NewRHS = RHS;
2855 
2856     // Careful: if the neg can be folded up, don't try to pull it back down.
2857     bool ShouldFoldNeg = true;
2858 
2859     if (NewLHS.hasOneUse()) {
2860       unsigned Opc = NewLHS.getOpcode();
2861       if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(Opc))
2862         ShouldFoldNeg = false;
2863       if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
2864         ShouldFoldNeg = false;
2865     }
2866 
2867     if (ShouldFoldNeg) {
2868       if (LHS.getOpcode() == ISD::FNEG)
2869         NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
2870       else if (CRHS->isNegative())
2871         return SDValue();
2872 
2873       if (Inv)
2874         std::swap(NewLHS, NewRHS);
2875 
2876       SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
2877                                       Cond, NewLHS, NewRHS);
2878       DCI.AddToWorklist(NewSelect.getNode());
2879       return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
2880     }
2881   }
2882 
2883   return SDValue();
2884 }
2885 
2886 
2887 SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
2888                                                    DAGCombinerInfo &DCI) const {
2889   if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
2890     return Folded;
2891 
2892   SDValue Cond = N->getOperand(0);
2893   if (Cond.getOpcode() != ISD::SETCC)
2894     return SDValue();
2895 
2896   EVT VT = N->getValueType(0);
2897   SDValue LHS = Cond.getOperand(0);
2898   SDValue RHS = Cond.getOperand(1);
2899   SDValue CC = Cond.getOperand(2);
2900 
2901   SDValue True = N->getOperand(1);
2902   SDValue False = N->getOperand(2);
2903 
2904   if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
2905     SelectionDAG &DAG = DCI.DAG;
2906     if ((DAG.isConstantValueOfAnyType(True) ||
2907          DAG.isConstantValueOfAnyType(True)) &&
2908         (!DAG.isConstantValueOfAnyType(False) &&
2909          !DAG.isConstantValueOfAnyType(False))) {
2910       // Swap cmp + select pair to move constant to false input.
2911       // This will allow using VOPC cndmasks more often.
2912       // select (setcc x, y), k, x -> select (setcc y, x) x, x
2913 
2914       SDLoc SL(N);
2915       ISD::CondCode NewCC = getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
2916                                             LHS.getValueType().isInteger());
2917 
2918       SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
2919       return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
2920     }
2921 
2922     if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
2923       SDValue MinMax
2924         = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
2925       // Revisit this node so we can catch min3/max3/med3 patterns.
2926       //DCI.AddToWorklist(MinMax.getNode());
2927       return MinMax;
2928     }
2929   }
2930 
2931   // There's no reason to not do this if the condition has other uses.
2932   return performCtlzCombine(SDLoc(N), Cond, True, False, DCI);
2933 }
2934 
2935 static bool isConstantFPZero(SDValue N) {
2936   if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
2937     return C->isZero() && !C->isNegative();
2938   return false;
2939 }
2940 
2941 static unsigned inverseMinMax(unsigned Opc) {
2942   switch (Opc) {
2943   case ISD::FMAXNUM:
2944     return ISD::FMINNUM;
2945   case ISD::FMINNUM:
2946     return ISD::FMAXNUM;
2947   case AMDGPUISD::FMAX_LEGACY:
2948     return AMDGPUISD::FMIN_LEGACY;
2949   case AMDGPUISD::FMIN_LEGACY:
2950     return  AMDGPUISD::FMAX_LEGACY;
2951   default:
2952     llvm_unreachable("invalid min/max opcode");
2953   }
2954 }
2955 
2956 SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
2957                                                  DAGCombinerInfo &DCI) const {
2958   SelectionDAG &DAG = DCI.DAG;
2959   SDValue N0 = N->getOperand(0);
2960   EVT VT = N->getValueType(0);
2961 
2962   unsigned Opc = N0.getOpcode();
2963 
2964   // If the input has multiple uses and we can either fold the negate down, or
2965   // the other uses cannot, give up. This both prevents unprofitable
2966   // transformations and infinite loops: we won't repeatedly try to fold around
2967   // a negate that has no 'good' form.
2968   if (N0.hasOneUse()) {
2969     // This may be able to fold into the source, but at a code size cost. Don't
2970     // fold if the fold into the user is free.
2971     if (allUsesHaveSourceMods(N, 0))
2972       return SDValue();
2973   } else {
2974     if (fnegFoldsIntoOp(Opc) &&
2975         (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode())))
2976       return SDValue();
2977   }
2978 
2979   SDLoc SL(N);
2980   switch (Opc) {
2981   case ISD::FADD: {
2982     if (!mayIgnoreSignedZero(N0))
2983       return SDValue();
2984 
2985     // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
2986     SDValue LHS = N0.getOperand(0);
2987     SDValue RHS = N0.getOperand(1);
2988 
2989     if (LHS.getOpcode() != ISD::FNEG)
2990       LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
2991     else
2992       LHS = LHS.getOperand(0);
2993 
2994     if (RHS.getOpcode() != ISD::FNEG)
2995       RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
2996     else
2997       RHS = RHS.getOperand(0);
2998 
2999     SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
3000     if (!N0.hasOneUse())
3001       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3002     return Res;
3003   }
3004   case ISD::FMUL:
3005   case AMDGPUISD::FMUL_LEGACY: {
3006     // (fneg (fmul x, y)) -> (fmul x, (fneg y))
3007     // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
3008     SDValue LHS = N0.getOperand(0);
3009     SDValue RHS = N0.getOperand(1);
3010 
3011     if (LHS.getOpcode() == ISD::FNEG)
3012       LHS = LHS.getOperand(0);
3013     else if (RHS.getOpcode() == ISD::FNEG)
3014       RHS = RHS.getOperand(0);
3015     else
3016       RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3017 
3018     SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
3019     if (!N0.hasOneUse())
3020       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3021     return Res;
3022   }
3023   case ISD::FMA:
3024   case ISD::FMAD: {
3025     if (!mayIgnoreSignedZero(N0))
3026       return SDValue();
3027 
3028     // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
3029     SDValue LHS = N0.getOperand(0);
3030     SDValue MHS = N0.getOperand(1);
3031     SDValue RHS = N0.getOperand(2);
3032 
3033     if (LHS.getOpcode() == ISD::FNEG)
3034       LHS = LHS.getOperand(0);
3035     else if (MHS.getOpcode() == ISD::FNEG)
3036       MHS = MHS.getOperand(0);
3037     else
3038       MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
3039 
3040     if (RHS.getOpcode() != ISD::FNEG)
3041       RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3042     else
3043       RHS = RHS.getOperand(0);
3044 
3045     SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
3046     if (!N0.hasOneUse())
3047       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3048     return Res;
3049   }
3050   case ISD::FMAXNUM:
3051   case ISD::FMINNUM:
3052   case AMDGPUISD::FMAX_LEGACY:
3053   case AMDGPUISD::FMIN_LEGACY: {
3054     // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
3055     // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
3056     // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
3057     // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
3058 
3059     SDValue LHS = N0.getOperand(0);
3060     SDValue RHS = N0.getOperand(1);
3061 
3062     // 0 doesn't have a negated inline immediate.
3063     // TODO: Shouldn't fold 1/2pi either, and should be generalized to other
3064     // operations.
3065     if (isConstantFPZero(RHS))
3066       return SDValue();
3067 
3068     SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
3069     SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3070     unsigned Opposite = inverseMinMax(Opc);
3071 
3072     SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
3073     if (!N0.hasOneUse())
3074       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3075     return Res;
3076   }
3077   case ISD::FP_EXTEND:
3078   case ISD::FTRUNC:
3079   case ISD::FRINT:
3080   case ISD::FNEARBYINT: // XXX - Should fround be handled?
3081   case ISD::FSIN:
3082   case AMDGPUISD::RCP:
3083   case AMDGPUISD::RCP_LEGACY:
3084   case AMDGPUISD::SIN_HW: {
3085     SDValue CvtSrc = N0.getOperand(0);
3086     if (CvtSrc.getOpcode() == ISD::FNEG) {
3087       // (fneg (fp_extend (fneg x))) -> (fp_extend x)
3088       // (fneg (rcp (fneg x))) -> (rcp x)
3089       return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
3090     }
3091 
3092     if (!N0.hasOneUse())
3093       return SDValue();
3094 
3095     // (fneg (fp_extend x)) -> (fp_extend (fneg x))
3096     // (fneg (rcp x)) -> (rcp (fneg x))
3097     SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
3098     return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
3099   }
3100   case ISD::FP_ROUND: {
3101     SDValue CvtSrc = N0.getOperand(0);
3102 
3103     if (CvtSrc.getOpcode() == ISD::FNEG) {
3104       // (fneg (fp_round (fneg x))) -> (fp_round x)
3105       return DAG.getNode(ISD::FP_ROUND, SL, VT,
3106                          CvtSrc.getOperand(0), N0.getOperand(1));
3107     }
3108 
3109     if (!N0.hasOneUse())
3110       return SDValue();
3111 
3112     // (fneg (fp_round x)) -> (fp_round (fneg x))
3113     SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
3114     return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
3115   }
3116   case ISD::FP16_TO_FP: {
3117     // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
3118     // f16, but legalization of f16 fneg ends up pulling it out of the source.
3119     // Put the fneg back as a legal source operation that can be matched later.
3120     SDLoc SL(N);
3121 
3122     SDValue Src = N0.getOperand(0);
3123     EVT SrcVT = Src.getValueType();
3124 
3125     // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
3126     SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
3127                                   DAG.getConstant(0x8000, SL, SrcVT));
3128     return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
3129   }
3130   default:
3131     return SDValue();
3132   }
3133 }
3134 
3135 SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
3136                                                  DAGCombinerInfo &DCI) const {
3137   SelectionDAG &DAG = DCI.DAG;
3138   SDValue N0 = N->getOperand(0);
3139 
3140   if (!N0.hasOneUse())
3141     return SDValue();
3142 
3143   switch (N0.getOpcode()) {
3144   case ISD::FP16_TO_FP: {
3145     assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
3146     SDLoc SL(N);
3147     SDValue Src = N0.getOperand(0);
3148     EVT SrcVT = Src.getValueType();
3149 
3150     // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
3151     SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
3152                                   DAG.getConstant(0x7fff, SL, SrcVT));
3153     return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
3154   }
3155   default:
3156     return SDValue();
3157   }
3158 }
3159 
3160 SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
3161                                                 DAGCombinerInfo &DCI) const {
3162   SelectionDAG &DAG = DCI.DAG;
3163   SDLoc DL(N);
3164 
3165   switch(N->getOpcode()) {
3166   default:
3167     break;
3168   case ISD::BITCAST: {
3169     EVT DestVT = N->getValueType(0);
3170 
3171     // Push casts through vector builds. This helps avoid emitting a large
3172     // number of copies when materializing floating point vector constants.
3173     //
3174     // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
3175     //   vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
3176     if (DestVT.isVector()) {
3177       SDValue Src = N->getOperand(0);
3178       if (Src.getOpcode() == ISD::BUILD_VECTOR) {
3179         EVT SrcVT = Src.getValueType();
3180         unsigned NElts = DestVT.getVectorNumElements();
3181 
3182         if (SrcVT.getVectorNumElements() == NElts) {
3183           EVT DestEltVT = DestVT.getVectorElementType();
3184 
3185           SmallVector<SDValue, 8> CastedElts;
3186           SDLoc SL(N);
3187           for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
3188             SDValue Elt = Src.getOperand(I);
3189             CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
3190           }
3191 
3192           return DAG.getBuildVector(DestVT, SL, CastedElts);
3193         }
3194       }
3195     }
3196 
3197     if (DestVT.getSizeInBits() != 64 && !DestVT.isVector())
3198       break;
3199 
3200     // Fold bitcasts of constants.
3201     //
3202     // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
3203     // TODO: Generalize and move to DAGCombiner
3204     SDValue Src = N->getOperand(0);
3205     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
3206       assert(Src.getValueType() == MVT::i64);
3207       SDLoc SL(N);
3208       uint64_t CVal = C->getZExtValue();
3209       return DAG.getNode(ISD::BUILD_VECTOR, SL, DestVT,
3210                          DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
3211                          DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
3212     }
3213 
3214     if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
3215       const APInt &Val = C->getValueAPF().bitcastToAPInt();
3216       SDLoc SL(N);
3217       uint64_t CVal = Val.getZExtValue();
3218       SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
3219                                 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
3220                                 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
3221 
3222       return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
3223     }
3224 
3225     break;
3226   }
3227   case ISD::SHL: {
3228     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
3229       break;
3230 
3231     return performShlCombine(N, DCI);
3232   }
3233   case ISD::SRL: {
3234     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
3235       break;
3236 
3237     return performSrlCombine(N, DCI);
3238   }
3239   case ISD::SRA: {
3240     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
3241       break;
3242 
3243     return performSraCombine(N, DCI);
3244   }
3245   case ISD::MUL:
3246     return performMulCombine(N, DCI);
3247   case ISD::MULHS:
3248     return performMulhsCombine(N, DCI);
3249   case ISD::MULHU:
3250     return performMulhuCombine(N, DCI);
3251   case AMDGPUISD::MUL_I24:
3252   case AMDGPUISD::MUL_U24:
3253   case AMDGPUISD::MULHI_I24:
3254   case AMDGPUISD::MULHI_U24: {
3255     // If the first call to simplify is successfull, then N may end up being
3256     // deleted, so we shouldn't call simplifyI24 again.
3257     simplifyI24(N, 0, DCI) || simplifyI24(N, 1, DCI);
3258     return SDValue();
3259   }
3260   case AMDGPUISD::MUL_LOHI_I24:
3261   case AMDGPUISD::MUL_LOHI_U24:
3262     return performMulLoHi24Combine(N, DCI);
3263   case ISD::SELECT:
3264     return performSelectCombine(N, DCI);
3265   case ISD::FNEG:
3266     return performFNegCombine(N, DCI);
3267   case ISD::FABS:
3268     return performFAbsCombine(N, DCI);
3269   case AMDGPUISD::BFE_I32:
3270   case AMDGPUISD::BFE_U32: {
3271     assert(!N->getValueType(0).isVector() &&
3272            "Vector handling of BFE not implemented");
3273     ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
3274     if (!Width)
3275       break;
3276 
3277     uint32_t WidthVal = Width->getZExtValue() & 0x1f;
3278     if (WidthVal == 0)
3279       return DAG.getConstant(0, DL, MVT::i32);
3280 
3281     ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
3282     if (!Offset)
3283       break;
3284 
3285     SDValue BitsFrom = N->getOperand(0);
3286     uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
3287 
3288     bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
3289 
3290     if (OffsetVal == 0) {
3291       // This is already sign / zero extended, so try to fold away extra BFEs.
3292       unsigned SignBits =  Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
3293 
3294       unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
3295       if (OpSignBits >= SignBits)
3296         return BitsFrom;
3297 
3298       EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
3299       if (Signed) {
3300         // This is a sign_extend_inreg. Replace it to take advantage of existing
3301         // DAG Combines. If not eliminated, we will match back to BFE during
3302         // selection.
3303 
3304         // TODO: The sext_inreg of extended types ends, although we can could
3305         // handle them in a single BFE.
3306         return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
3307                            DAG.getValueType(SmallVT));
3308       }
3309 
3310       return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
3311     }
3312 
3313     if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
3314       if (Signed) {
3315         return constantFoldBFE<int32_t>(DAG,
3316                                         CVal->getSExtValue(),
3317                                         OffsetVal,
3318                                         WidthVal,
3319                                         DL);
3320       }
3321 
3322       return constantFoldBFE<uint32_t>(DAG,
3323                                        CVal->getZExtValue(),
3324                                        OffsetVal,
3325                                        WidthVal,
3326                                        DL);
3327     }
3328 
3329     if ((OffsetVal + WidthVal) >= 32) {
3330       SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
3331       return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
3332                          BitsFrom, ShiftVal);
3333     }
3334 
3335     if (BitsFrom.hasOneUse()) {
3336       APInt Demanded = APInt::getBitsSet(32,
3337                                          OffsetVal,
3338                                          OffsetVal + WidthVal);
3339 
3340       APInt KnownZero, KnownOne;
3341       TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
3342                                             !DCI.isBeforeLegalizeOps());
3343       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
3344       if (TLO.ShrinkDemandedConstant(BitsFrom, Demanded) ||
3345           TLI.SimplifyDemandedBits(BitsFrom, Demanded,
3346                                    KnownZero, KnownOne, TLO)) {
3347         DCI.CommitTargetLoweringOpt(TLO);
3348       }
3349     }
3350 
3351     break;
3352   }
3353   case ISD::LOAD:
3354     return performLoadCombine(N, DCI);
3355   case ISD::STORE:
3356     return performStoreCombine(N, DCI);
3357   case AMDGPUISD::CLAMP:
3358     return performClampCombine(N, DCI);
3359   case AMDGPUISD::RCP: {
3360     if (const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) {
3361       // XXX - Should this flush denormals?
3362       const APFloat &Val = CFP->getValueAPF();
3363       APFloat One(Val.getSemantics(), "1.0");
3364       return DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
3365     }
3366 
3367     break;
3368   }
3369   }
3370   return SDValue();
3371 }
3372 
3373 //===----------------------------------------------------------------------===//
3374 // Helper functions
3375 //===----------------------------------------------------------------------===//
3376 
3377 SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
3378                                                   const TargetRegisterClass *RC,
3379                                                    unsigned Reg, EVT VT) const {
3380   MachineFunction &MF = DAG.getMachineFunction();
3381   MachineRegisterInfo &MRI = MF.getRegInfo();
3382   unsigned VirtualRegister;
3383   if (!MRI.isLiveIn(Reg)) {
3384     VirtualRegister = MRI.createVirtualRegister(RC);
3385     MRI.addLiveIn(Reg, VirtualRegister);
3386   } else {
3387     VirtualRegister = MRI.getLiveInVirtReg(Reg);
3388   }
3389   return DAG.getRegister(VirtualRegister, VT);
3390 }
3391 
3392 uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
3393     const AMDGPUMachineFunction *MFI, const ImplicitParameter Param) const {
3394   unsigned Alignment = Subtarget->getAlignmentForImplicitArgPtr();
3395   uint64_t ArgOffset = alignTo(MFI->getABIArgOffset(), Alignment);
3396   switch (Param) {
3397   case GRID_DIM:
3398     return ArgOffset;
3399   case GRID_OFFSET:
3400     return ArgOffset + 4;
3401   }
3402   llvm_unreachable("unexpected implicit parameter type");
3403 }
3404 
3405 #define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
3406 
3407 const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
3408   switch ((AMDGPUISD::NodeType)Opcode) {
3409   case AMDGPUISD::FIRST_NUMBER: break;
3410   // AMDIL DAG nodes
3411   NODE_NAME_CASE(UMUL);
3412   NODE_NAME_CASE(BRANCH_COND);
3413 
3414   // AMDGPU DAG nodes
3415   NODE_NAME_CASE(IF)
3416   NODE_NAME_CASE(ELSE)
3417   NODE_NAME_CASE(LOOP)
3418   NODE_NAME_CASE(CALL)
3419   NODE_NAME_CASE(RET_FLAG)
3420   NODE_NAME_CASE(RETURN_TO_EPILOG)
3421   NODE_NAME_CASE(ENDPGM)
3422   NODE_NAME_CASE(DWORDADDR)
3423   NODE_NAME_CASE(FRACT)
3424   NODE_NAME_CASE(SETCC)
3425   NODE_NAME_CASE(SETREG)
3426   NODE_NAME_CASE(FMA_W_CHAIN)
3427   NODE_NAME_CASE(FMUL_W_CHAIN)
3428   NODE_NAME_CASE(CLAMP)
3429   NODE_NAME_CASE(COS_HW)
3430   NODE_NAME_CASE(SIN_HW)
3431   NODE_NAME_CASE(FMAX_LEGACY)
3432   NODE_NAME_CASE(FMIN_LEGACY)
3433   NODE_NAME_CASE(FMAX3)
3434   NODE_NAME_CASE(SMAX3)
3435   NODE_NAME_CASE(UMAX3)
3436   NODE_NAME_CASE(FMIN3)
3437   NODE_NAME_CASE(SMIN3)
3438   NODE_NAME_CASE(UMIN3)
3439   NODE_NAME_CASE(FMED3)
3440   NODE_NAME_CASE(SMED3)
3441   NODE_NAME_CASE(UMED3)
3442   NODE_NAME_CASE(URECIP)
3443   NODE_NAME_CASE(DIV_SCALE)
3444   NODE_NAME_CASE(DIV_FMAS)
3445   NODE_NAME_CASE(DIV_FIXUP)
3446   NODE_NAME_CASE(FMAD_FTZ)
3447   NODE_NAME_CASE(TRIG_PREOP)
3448   NODE_NAME_CASE(RCP)
3449   NODE_NAME_CASE(RSQ)
3450   NODE_NAME_CASE(RCP_LEGACY)
3451   NODE_NAME_CASE(RSQ_LEGACY)
3452   NODE_NAME_CASE(FMUL_LEGACY)
3453   NODE_NAME_CASE(RSQ_CLAMP)
3454   NODE_NAME_CASE(LDEXP)
3455   NODE_NAME_CASE(FP_CLASS)
3456   NODE_NAME_CASE(DOT4)
3457   NODE_NAME_CASE(CARRY)
3458   NODE_NAME_CASE(BORROW)
3459   NODE_NAME_CASE(BFE_U32)
3460   NODE_NAME_CASE(BFE_I32)
3461   NODE_NAME_CASE(BFI)
3462   NODE_NAME_CASE(BFM)
3463   NODE_NAME_CASE(FFBH_U32)
3464   NODE_NAME_CASE(FFBH_I32)
3465   NODE_NAME_CASE(MUL_U24)
3466   NODE_NAME_CASE(MUL_I24)
3467   NODE_NAME_CASE(MULHI_U24)
3468   NODE_NAME_CASE(MULHI_I24)
3469   NODE_NAME_CASE(MUL_LOHI_U24)
3470   NODE_NAME_CASE(MUL_LOHI_I24)
3471   NODE_NAME_CASE(MAD_U24)
3472   NODE_NAME_CASE(MAD_I24)
3473   NODE_NAME_CASE(TEXTURE_FETCH)
3474   NODE_NAME_CASE(EXPORT)
3475   NODE_NAME_CASE(EXPORT_DONE)
3476   NODE_NAME_CASE(R600_EXPORT)
3477   NODE_NAME_CASE(CONST_ADDRESS)
3478   NODE_NAME_CASE(REGISTER_LOAD)
3479   NODE_NAME_CASE(REGISTER_STORE)
3480   NODE_NAME_CASE(LOAD_INPUT)
3481   NODE_NAME_CASE(SAMPLE)
3482   NODE_NAME_CASE(SAMPLEB)
3483   NODE_NAME_CASE(SAMPLED)
3484   NODE_NAME_CASE(SAMPLEL)
3485   NODE_NAME_CASE(CVT_F32_UBYTE0)
3486   NODE_NAME_CASE(CVT_F32_UBYTE1)
3487   NODE_NAME_CASE(CVT_F32_UBYTE2)
3488   NODE_NAME_CASE(CVT_F32_UBYTE3)
3489   NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
3490   NODE_NAME_CASE(FP_TO_FP16)
3491   NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
3492   NODE_NAME_CASE(CONST_DATA_PTR)
3493   NODE_NAME_CASE(PC_ADD_REL_OFFSET)
3494   NODE_NAME_CASE(KILL)
3495   NODE_NAME_CASE(DUMMY_CHAIN)
3496   case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
3497   NODE_NAME_CASE(SENDMSG)
3498   NODE_NAME_CASE(SENDMSGHALT)
3499   NODE_NAME_CASE(INTERP_MOV)
3500   NODE_NAME_CASE(INTERP_P1)
3501   NODE_NAME_CASE(INTERP_P2)
3502   NODE_NAME_CASE(STORE_MSKOR)
3503   NODE_NAME_CASE(LOAD_CONSTANT)
3504   NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
3505   NODE_NAME_CASE(ATOMIC_CMP_SWAP)
3506   NODE_NAME_CASE(ATOMIC_INC)
3507   NODE_NAME_CASE(ATOMIC_DEC)
3508   NODE_NAME_CASE(BUFFER_LOAD)
3509   NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
3510   case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
3511   }
3512   return nullptr;
3513 }
3514 
3515 SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
3516                                               SelectionDAG &DAG, int Enabled,
3517                                               int &RefinementSteps,
3518                                               bool &UseOneConstNR,
3519                                               bool Reciprocal) const {
3520   EVT VT = Operand.getValueType();
3521 
3522   if (VT == MVT::f32) {
3523     RefinementSteps = 0;
3524     return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
3525   }
3526 
3527   // TODO: There is also f64 rsq instruction, but the documentation is less
3528   // clear on its precision.
3529 
3530   return SDValue();
3531 }
3532 
3533 SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
3534                                                SelectionDAG &DAG, int Enabled,
3535                                                int &RefinementSteps) const {
3536   EVT VT = Operand.getValueType();
3537 
3538   if (VT == MVT::f32) {
3539     // Reciprocal, < 1 ulp error.
3540     //
3541     // This reciprocal approximation converges to < 0.5 ulp error with one
3542     // newton rhapson performed with two fused multiple adds (FMAs).
3543 
3544     RefinementSteps = 0;
3545     return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
3546   }
3547 
3548   // TODO: There is also f64 rcp instruction, but the documentation is less
3549   // clear on its precision.
3550 
3551   return SDValue();
3552 }
3553 
3554 void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
3555   const SDValue Op,
3556   APInt &KnownZero,
3557   APInt &KnownOne,
3558   const SelectionDAG &DAG,
3559   unsigned Depth) const {
3560 
3561   KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); // Don't know anything.
3562 
3563   APInt KnownZero2;
3564   APInt KnownOne2;
3565   unsigned Opc = Op.getOpcode();
3566 
3567   switch (Opc) {
3568   default:
3569     break;
3570   case AMDGPUISD::CARRY:
3571   case AMDGPUISD::BORROW: {
3572     KnownZero = APInt::getHighBitsSet(32, 31);
3573     break;
3574   }
3575 
3576   case AMDGPUISD::BFE_I32:
3577   case AMDGPUISD::BFE_U32: {
3578     ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
3579     if (!CWidth)
3580       return;
3581 
3582     unsigned BitWidth = 32;
3583     uint32_t Width = CWidth->getZExtValue() & 0x1f;
3584 
3585     if (Opc == AMDGPUISD::BFE_U32)
3586       KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - Width);
3587 
3588     break;
3589   }
3590   case AMDGPUISD::FP_TO_FP16: {
3591     unsigned BitWidth = KnownZero.getBitWidth();
3592 
3593     // High bits are zero.
3594     KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
3595     break;
3596   }
3597   }
3598 }
3599 
3600 unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
3601   SDValue Op,
3602   const SelectionDAG &DAG,
3603   unsigned Depth) const {
3604   switch (Op.getOpcode()) {
3605   case AMDGPUISD::BFE_I32: {
3606     ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
3607     if (!Width)
3608       return 1;
3609 
3610     unsigned SignBits = 32 - Width->getZExtValue() + 1;
3611     if (!isNullConstant(Op.getOperand(1)))
3612       return SignBits;
3613 
3614     // TODO: Could probably figure something out with non-0 offsets.
3615     unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
3616     return std::max(SignBits, Op0SignBits);
3617   }
3618 
3619   case AMDGPUISD::BFE_U32: {
3620     ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
3621     return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
3622   }
3623 
3624   case AMDGPUISD::CARRY:
3625   case AMDGPUISD::BORROW:
3626     return 31;
3627 
3628   default:
3629     return 1;
3630   }
3631 }
3632