1 //===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This is the parent TargetLowering class for hardware code gen
11 /// targets.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "AMDGPUISelLowering.h"
16 #include "AMDGPU.h"
17 #include "AMDGPUCallLowering.h"
18 #include "AMDGPUFrameLowering.h"
19 #include "AMDGPUSubtarget.h"
20 #include "AMDGPUTargetMachine.h"
21 #include "Utils/AMDGPUBaseInfo.h"
22 #include "R600MachineFunctionInfo.h"
23 #include "SIInstrInfo.h"
24 #include "SIMachineFunctionInfo.h"
25 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
26 #include "llvm/CodeGen/Analysis.h"
27 #include "llvm/CodeGen/CallingConvLower.h"
28 #include "llvm/CodeGen/MachineFunction.h"
29 #include "llvm/CodeGen/MachineRegisterInfo.h"
30 #include "llvm/CodeGen/SelectionDAG.h"
31 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
32 #include "llvm/IR/DataLayout.h"
33 #include "llvm/IR/DiagnosticInfo.h"
34 #include "llvm/Support/KnownBits.h"
35 #include "llvm/Support/MathExtras.h"
36 using namespace llvm;
37 
38 #include "AMDGPUGenCallingConv.inc"
39 
40 static cl::opt<bool> AMDGPUBypassSlowDiv(
41   "amdgpu-bypass-slow-div",
42   cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
43   cl::init(true));
44 
45 // Find a larger type to do a load / store of a vector with.
46 EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
47   unsigned StoreSize = VT.getStoreSizeInBits();
48   if (StoreSize <= 32)
49     return EVT::getIntegerVT(Ctx, StoreSize);
50 
51   assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
52   return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
53 }
54 
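// Return the number of bits needed to hold the unsigned value of \p Op, i.e.
// the bit width of its type minus the number of known leading zero bits.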
55 unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
56   EVT VT = Op.getValueType();
57   KnownBits Known = DAG.computeKnownBits(Op);
58   return VT.getSizeInBits() - Known.countMinLeadingZeros();
59 }
60 
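// Return the width of \p Op minus the number of bits known to match the sign
// bit, i.e. how many low bits can actually vary for the signed value.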
61 unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
62   EVT VT = Op.getValueType();
63 
  // In order for this to be a signed 24-bit value, bit 23 must be a sign bit.
66   return VT.getSizeInBits() - DAG.ComputeNumSignBits(Op);
67 }
68 
69 AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
70                                            const AMDGPUSubtarget &STI)
71     : TargetLowering(TM), Subtarget(&STI) {
72   // Lower floating point store/load to integer store/load to reduce the number
73   // of patterns in tablegen.
74   setOperationAction(ISD::LOAD, MVT::f32, Promote);
75   AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
76 
77   setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
78   AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
79 
80   setOperationAction(ISD::LOAD, MVT::v3f32, Promote);
81   AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);
82 
83   setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
84   AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
85 
86   setOperationAction(ISD::LOAD, MVT::v5f32, Promote);
87   AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);
88 
89   setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
90   AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
91 
92   setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
93   AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
94 
95   setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
96   AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);
97 
98   setOperationAction(ISD::LOAD, MVT::i64, Promote);
99   AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
100 
101   setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
102   AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
103 
104   setOperationAction(ISD::LOAD, MVT::f64, Promote);
105   AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
106 
107   setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
108   AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
109 
110   setOperationAction(ISD::LOAD, MVT::v4i64, Promote);
111   AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);
112 
113   setOperationAction(ISD::LOAD, MVT::v4f64, Promote);
114   AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);
115 
116   setOperationAction(ISD::LOAD, MVT::v8i64, Promote);
117   AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32);
118 
119   setOperationAction(ISD::LOAD, MVT::v8f64, Promote);
120   AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32);
121 
122   setOperationAction(ISD::LOAD, MVT::v16i64, Promote);
123   AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32);
124 
125   setOperationAction(ISD::LOAD, MVT::v16f64, Promote);
126   AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32);
127 
128   // There are no 64-bit extloads. These should be done as a 32-bit extload and
129   // an extension to 64-bit.
130   for (MVT VT : MVT::integer_valuetypes()) {
131     setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand);
132     setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand);
133     setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand);
134   }
135 
136   for (MVT VT : MVT::integer_valuetypes()) {
137     if (VT == MVT::i64)
138       continue;
139 
140     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
141     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal);
142     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal);
143     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand);
144 
145     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
146     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal);
147     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal);
148     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand);
149 
150     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
151     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal);
152     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal);
153     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand);
154   }
155 
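  // There is no hardware support for vector extending loads of these small
  // element types; mark them Expand so the legalizer breaks them up.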
156   for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) {
157     setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand);
158     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand);
159     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand);
160     setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Expand);
161     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Expand);
162     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i8, Expand);
163     setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand);
164     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand);
165     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand);
166     setLoadExtAction(ISD::EXTLOAD, VT, MVT::v3i16, Expand);
167     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v3i16, Expand);
168     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v3i16, Expand);
169     setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand);
170     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand);
171     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand);
172   }
173 
174   setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
175   setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
176   setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
177   setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
178   setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
179   setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
180   setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
181 
182   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
183   setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
184   setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
185   setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
186   setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);
187 
188   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
189   setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
190   setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
191   setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
192   setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
193 
194   setOperationAction(ISD::STORE, MVT::f32, Promote);
195   AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
196 
197   setOperationAction(ISD::STORE, MVT::v2f32, Promote);
198   AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
199 
200   setOperationAction(ISD::STORE, MVT::v3f32, Promote);
201   AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);
202 
203   setOperationAction(ISD::STORE, MVT::v4f32, Promote);
204   AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
205 
206   setOperationAction(ISD::STORE, MVT::v5f32, Promote);
207   AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);
208 
209   setOperationAction(ISD::STORE, MVT::v8f32, Promote);
210   AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
211 
212   setOperationAction(ISD::STORE, MVT::v16f32, Promote);
213   AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
214 
215   setOperationAction(ISD::STORE, MVT::v32f32, Promote);
216   AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);
217 
218   setOperationAction(ISD::STORE, MVT::i64, Promote);
219   AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
220 
221   setOperationAction(ISD::STORE, MVT::v2i64, Promote);
222   AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
223 
224   setOperationAction(ISD::STORE, MVT::f64, Promote);
225   AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
226 
227   setOperationAction(ISD::STORE, MVT::v2f64, Promote);
228   AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
229 
230   setOperationAction(ISD::STORE, MVT::v4i64, Promote);
231   AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);
232 
233   setOperationAction(ISD::STORE, MVT::v4f64, Promote);
234   AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32);
235 
236   setOperationAction(ISD::STORE, MVT::v8i64, Promote);
237   AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32);
238 
239   setOperationAction(ISD::STORE, MVT::v8f64, Promote);
240   AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32);
241 
242   setOperationAction(ISD::STORE, MVT::v16i64, Promote);
243   AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32);
244 
245   setOperationAction(ISD::STORE, MVT::v16f64, Promote);
246   AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32);
247 
248   setTruncStoreAction(MVT::i64, MVT::i1, Expand);
249   setTruncStoreAction(MVT::i64, MVT::i8, Expand);
250   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
251   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
252 
253   setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
254   setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
255   setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
256   setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
257 
258   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
259   setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
260   setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
261   setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
262   setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
263   setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
264   setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);
265 
266   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
267   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
268 
269   setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
270   setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
271 
272   setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
273   setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
274   setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
275   setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
276 
277   setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
278   setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
279 
280   setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
281   setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
  setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
  setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
  setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);
287 
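  // Plain 32- and 64-bit integer and FP constants are legal and are
  // materialized directly by the instruction selector.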
288   setOperationAction(ISD::Constant, MVT::i32, Legal);
289   setOperationAction(ISD::Constant, MVT::i64, Legal);
290   setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
291   setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
292 
293   setOperationAction(ISD::BR_JT, MVT::Other, Expand);
294   setOperationAction(ISD::BRIND, MVT::Other, Expand);
295 
  // This is totally unsupported; just custom lower it to produce an error.
297   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
298 
299   // Library functions.  These default to Expand, but we have instructions
300   // for them.
301   setOperationAction(ISD::FCEIL,  MVT::f32, Legal);
302   setOperationAction(ISD::FEXP2,  MVT::f32, Legal);
303   setOperationAction(ISD::FPOW,   MVT::f32, Legal);
304   setOperationAction(ISD::FLOG2,  MVT::f32, Legal);
305   setOperationAction(ISD::FABS,   MVT::f32, Legal);
306   setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
307   setOperationAction(ISD::FRINT,  MVT::f32, Legal);
308   setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
309   setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
310   setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
311 
312   setOperationAction(ISD::FROUND, MVT::f32, Custom);
313   setOperationAction(ISD::FROUND, MVT::f64, Custom);
314 
315   setOperationAction(ISD::FLOG, MVT::f32, Custom);
316   setOperationAction(ISD::FLOG10, MVT::f32, Custom);
317   setOperationAction(ISD::FEXP, MVT::f32, Custom);
318 
319 
320   setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
321   setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);
322 
323   setOperationAction(ISD::FREM, MVT::f32, Custom);
324   setOperationAction(ISD::FREM, MVT::f64, Custom);
325 
326   // Expand to fneg + fadd.
327   setOperationAction(ISD::FSUB, MVT::f64, Expand);
328 
329   setOperationAction(ISD::CONCAT_VECTORS, MVT::v3i32, Custom);
330   setOperationAction(ISD::CONCAT_VECTORS, MVT::v3f32, Custom);
331   setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
332   setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom);
333   setOperationAction(ISD::CONCAT_VECTORS, MVT::v5i32, Custom);
334   setOperationAction(ISD::CONCAT_VECTORS, MVT::v5f32, Custom);
335   setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
336   setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
337   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom);
338   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);
339   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3f32, Custom);
340   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3i32, Custom);
341   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom);
342   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom);
343   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5f32, Custom);
344   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5i32, Custom);
345   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
346   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);
347   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f32, Custom);
348   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16i32, Custom);
349   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32f32, Custom);
350   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32i32, Custom);
351 
352   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
353   setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom);
354   setOperationAction(ISD::FP_TO_FP16, MVT::f32, Custom);
355 
356   const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
357   for (MVT VT : ScalarIntVTs) {
358     // These should use [SU]DIVREM, so set them to expand
359     setOperationAction(ISD::SDIV, VT, Expand);
360     setOperationAction(ISD::UDIV, VT, Expand);
361     setOperationAction(ISD::SREM, VT, Expand);
362     setOperationAction(ISD::UREM, VT, Expand);
363 
    // The GPU does not have a divrem instruction for signed or unsigned division.
365     setOperationAction(ISD::SDIVREM, VT, Custom);
366     setOperationAction(ISD::UDIVREM, VT, Custom);
367 
    // The GPU does not have [SU]MUL_LOHI as a single instruction.
369     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
370     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
371 
372     setOperationAction(ISD::BSWAP, VT, Expand);
373     setOperationAction(ISD::CTTZ, VT, Expand);
374     setOperationAction(ISD::CTLZ, VT, Expand);
375 
376     // AMDGPU uses ADDC/SUBC/ADDE/SUBE
377     setOperationAction(ISD::ADDC, VT, Legal);
378     setOperationAction(ISD::SUBC, VT, Legal);
379     setOperationAction(ISD::ADDE, VT, Legal);
380     setOperationAction(ISD::SUBE, VT, Legal);
381   }
382 
383   // The hardware supports 32-bit FSHR, but not FSHL.
384   setOperationAction(ISD::FSHR, MVT::i32, Legal);
385 
386   // The hardware supports 32-bit ROTR, but not ROTL.
387   setOperationAction(ISD::ROTL, MVT::i32, Expand);
388   setOperationAction(ISD::ROTL, MVT::i64, Expand);
389   setOperationAction(ISD::ROTR, MVT::i64, Expand);
390 
391   setOperationAction(ISD::MUL, MVT::i64, Expand);
392   setOperationAction(ISD::MULHU, MVT::i64, Expand);
393   setOperationAction(ISD::MULHS, MVT::i64, Expand);
394   setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
395   setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
396   setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
397   setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
398   setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
399 
400   setOperationAction(ISD::SMIN, MVT::i32, Legal);
401   setOperationAction(ISD::UMIN, MVT::i32, Legal);
402   setOperationAction(ISD::SMAX, MVT::i32, Legal);
403   setOperationAction(ISD::UMAX, MVT::i32, Legal);
404 
405   setOperationAction(ISD::CTTZ, MVT::i64, Custom);
406   setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Custom);
407   setOperationAction(ISD::CTLZ, MVT::i64, Custom);
408   setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
409 
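  // Operations on these 32-bit integer vector types have no direct vector ALU
  // support; expand most of them so they are broken into per-element
  // operations.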
410   static const MVT::SimpleValueType VectorIntTypes[] = {
411     MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32
412   };
413 
414   for (MVT VT : VectorIntTypes) {
415     // Expand the following operations for the current type by default.
416     setOperationAction(ISD::ADD,  VT, Expand);
417     setOperationAction(ISD::AND,  VT, Expand);
418     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
419     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
420     setOperationAction(ISD::MUL,  VT, Expand);
421     setOperationAction(ISD::MULHU, VT, Expand);
422     setOperationAction(ISD::MULHS, VT, Expand);
423     setOperationAction(ISD::OR,   VT, Expand);
424     setOperationAction(ISD::SHL,  VT, Expand);
425     setOperationAction(ISD::SRA,  VT, Expand);
426     setOperationAction(ISD::SRL,  VT, Expand);
427     setOperationAction(ISD::ROTL, VT, Expand);
428     setOperationAction(ISD::ROTR, VT, Expand);
429     setOperationAction(ISD::SUB,  VT, Expand);
430     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
431     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
432     setOperationAction(ISD::SDIV, VT, Expand);
433     setOperationAction(ISD::UDIV, VT, Expand);
434     setOperationAction(ISD::SREM, VT, Expand);
435     setOperationAction(ISD::UREM, VT, Expand);
436     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
437     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
438     setOperationAction(ISD::SDIVREM, VT, Custom);
439     setOperationAction(ISD::UDIVREM, VT, Expand);
440     setOperationAction(ISD::SELECT, VT, Expand);
441     setOperationAction(ISD::VSELECT, VT, Expand);
442     setOperationAction(ISD::SELECT_CC, VT, Expand);
443     setOperationAction(ISD::XOR,  VT, Expand);
444     setOperationAction(ISD::BSWAP, VT, Expand);
445     setOperationAction(ISD::CTPOP, VT, Expand);
446     setOperationAction(ISD::CTTZ, VT, Expand);
447     setOperationAction(ISD::CTLZ, VT, Expand);
448     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
449     setOperationAction(ISD::SETCC, VT, Expand);
450   }
451 
452   static const MVT::SimpleValueType FloatVectorTypes[] = {
453      MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32
454   };
455 
456   for (MVT VT : FloatVectorTypes) {
457     setOperationAction(ISD::FABS, VT, Expand);
458     setOperationAction(ISD::FMINNUM, VT, Expand);
459     setOperationAction(ISD::FMAXNUM, VT, Expand);
460     setOperationAction(ISD::FADD, VT, Expand);
461     setOperationAction(ISD::FCEIL, VT, Expand);
462     setOperationAction(ISD::FCOS, VT, Expand);
463     setOperationAction(ISD::FDIV, VT, Expand);
464     setOperationAction(ISD::FEXP2, VT, Expand);
465     setOperationAction(ISD::FEXP, VT, Expand);
466     setOperationAction(ISD::FLOG2, VT, Expand);
467     setOperationAction(ISD::FREM, VT, Expand);
468     setOperationAction(ISD::FLOG, VT, Expand);
469     setOperationAction(ISD::FLOG10, VT, Expand);
470     setOperationAction(ISD::FPOW, VT, Expand);
471     setOperationAction(ISD::FFLOOR, VT, Expand);
472     setOperationAction(ISD::FTRUNC, VT, Expand);
473     setOperationAction(ISD::FMUL, VT, Expand);
474     setOperationAction(ISD::FMA, VT, Expand);
475     setOperationAction(ISD::FRINT, VT, Expand);
476     setOperationAction(ISD::FNEARBYINT, VT, Expand);
477     setOperationAction(ISD::FSQRT, VT, Expand);
478     setOperationAction(ISD::FSIN, VT, Expand);
479     setOperationAction(ISD::FSUB, VT, Expand);
480     setOperationAction(ISD::FNEG, VT, Expand);
481     setOperationAction(ISD::VSELECT, VT, Expand);
482     setOperationAction(ISD::SELECT_CC, VT, Expand);
483     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
484     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
485     setOperationAction(ISD::SETCC, VT, Expand);
486     setOperationAction(ISD::FCANONICALIZE, VT, Expand);
487   }
488 
  // This causes an unrolled select operation to be used rather than expansion
  // with bit operations. This is in general better, but the alternative using
  // BFI instructions may be better if the select sources are SGPRs.
492   setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
493   AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
494 
495   setOperationAction(ISD::SELECT, MVT::v3f32, Promote);
496   AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);
497 
498   setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
499   AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
500 
501   setOperationAction(ISD::SELECT, MVT::v5f32, Promote);
502   AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);
503 
504   // There are no libcalls of any kind.
505   for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
506     setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
507 
508   setSchedulingPreference(Sched::RegPressure);
509   setJumpIsExpensive(true);
510 
511   // FIXME: This is only partially true. If we have to do vector compares, any
512   // SGPR pair can be a condition register. If we have a uniform condition, we
513   // are better off doing SALU operations, where there is only one SCC. For now,
514   // we don't have a way of knowing during instruction selection if a condition
515   // will be uniform and we always use vector compares. Assume we are using
516   // vector compares until that is fixed.
517   setHasMultipleConditionRegisters(true);
518 
519   setMinCmpXchgSizeInBits(32);
520   setSupportsUnalignedAtomics(false);
521 
522   PredictableSelectIsExpensive = false;
523 
524   // We want to find all load dependencies for long chains of stores to enable
525   // merging into very wide vectors. The problem is with vectors with > 4
526   // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
527   // vectors are a legal type, even though we have to split the loads
528   // usually. When we can more precisely specify load legality per address
529   // space, we should be able to make FindBetterChain/MergeConsecutiveStores
530   // smarter so that they can figure out what to do in 2 iterations without all
531   // N > 4 stores on the same chain.
532   GatherAllAliasesMaxDepth = 16;
533 
534   // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
535   // about these during lowering.
536   MaxStoresPerMemcpy  = 0xffffffff;
537   MaxStoresPerMemmove = 0xffffffff;
538   MaxStoresPerMemset  = 0xffffffff;
539 
540   // The expansion for 64-bit division is enormous.
541   if (AMDGPUBypassSlowDiv)
542     addBypassSlowDiv(64, 32);
543 
544   setTargetDAGCombine(ISD::BITCAST);
545   setTargetDAGCombine(ISD::SHL);
546   setTargetDAGCombine(ISD::SRA);
547   setTargetDAGCombine(ISD::SRL);
548   setTargetDAGCombine(ISD::TRUNCATE);
549   setTargetDAGCombine(ISD::MUL);
550   setTargetDAGCombine(ISD::MULHU);
551   setTargetDAGCombine(ISD::MULHS);
552   setTargetDAGCombine(ISD::SELECT);
553   setTargetDAGCombine(ISD::SELECT_CC);
554   setTargetDAGCombine(ISD::STORE);
555   setTargetDAGCombine(ISD::FADD);
556   setTargetDAGCombine(ISD::FSUB);
557   setTargetDAGCombine(ISD::FNEG);
558   setTargetDAGCombine(ISD::FABS);
559   setTargetDAGCombine(ISD::AssertZext);
560   setTargetDAGCombine(ISD::AssertSext);
561   setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
562 }
563 
564 //===----------------------------------------------------------------------===//
565 // Target Information
566 //===----------------------------------------------------------------------===//
567 
568 LLVM_READNONE
569 static bool fnegFoldsIntoOp(unsigned Opc) {
570   switch (Opc) {
571   case ISD::FADD:
572   case ISD::FSUB:
573   case ISD::FMUL:
574   case ISD::FMA:
575   case ISD::FMAD:
576   case ISD::FMINNUM:
577   case ISD::FMAXNUM:
578   case ISD::FMINNUM_IEEE:
579   case ISD::FMAXNUM_IEEE:
580   case ISD::FSIN:
581   case ISD::FTRUNC:
582   case ISD::FRINT:
583   case ISD::FNEARBYINT:
584   case ISD::FCANONICALIZE:
585   case AMDGPUISD::RCP:
586   case AMDGPUISD::RCP_LEGACY:
587   case AMDGPUISD::RCP_IFLAG:
588   case AMDGPUISD::SIN_HW:
589   case AMDGPUISD::FMUL_LEGACY:
590   case AMDGPUISD::FMIN_LEGACY:
591   case AMDGPUISD::FMAX_LEGACY:
592   case AMDGPUISD::FMED3:
593     return true;
594   default:
595     return false;
596   }
597 }
598 
/// \returns true if the operation will definitely need to use a 64-bit
/// encoding, and thus will use a VOP3 encoding regardless of the source
/// modifiers.
602 LLVM_READONLY
603 static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
604   return N->getNumOperands() > 2 || VT == MVT::f64;
605 }
606 
607 // Most FP instructions support source modifiers, but this could be refined
608 // slightly.
609 LLVM_READONLY
610 static bool hasSourceMods(const SDNode *N) {
611   if (isa<MemSDNode>(N))
612     return false;
613 
614   switch (N->getOpcode()) {
615   case ISD::CopyToReg:
616   case ISD::SELECT:
617   case ISD::FDIV:
618   case ISD::FREM:
619   case ISD::INLINEASM:
620   case ISD::INLINEASM_BR:
621   case AMDGPUISD::DIV_SCALE:
622   case ISD::INTRINSIC_W_CHAIN:
623 
624   // TODO: Should really be looking at the users of the bitcast. These are
625   // problematic because bitcasts are used to legalize all stores to integer
626   // types.
627   case ISD::BITCAST:
628     return false;
629   case ISD::INTRINSIC_WO_CHAIN: {
630     switch (cast<ConstantSDNode>(N->getOperand(0))->getZExtValue()) {
631     case Intrinsic::amdgcn_interp_p1:
632     case Intrinsic::amdgcn_interp_p2:
633     case Intrinsic::amdgcn_interp_mov:
634     case Intrinsic::amdgcn_interp_p1_f16:
635     case Intrinsic::amdgcn_interp_p2_f16:
636       return false;
637     default:
638       return true;
639     }
640   }
641   default:
642     return true;
643   }
644 }
645 
646 bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
647                                                  unsigned CostThreshold) {
  // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
  // it is truly free to use a source modifier in those cases. If there are
  // multiple users, and each use would require a VOP3 encoding, there will be
  // a code size increase. Try to avoid increasing code size unless we know it
  // will save on the instruction count.
653   unsigned NumMayIncreaseSize = 0;
654   MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
655 
656   // XXX - Should this limit number of uses to check?
657   for (const SDNode *U : N->uses()) {
658     if (!hasSourceMods(U))
659       return false;
660 
661     if (!opMustUseVOP3Encoding(U, VT)) {
662       if (++NumMayIncreaseSize > CostThreshold)
663         return false;
664     }
665   }
666 
667   return true;
668 }
669 
670 EVT AMDGPUTargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
671                                               ISD::NodeType ExtendKind) const {
672   assert(!VT.isVector() && "only scalar expected");
673 
674   // Round to the next multiple of 32-bits.
675   unsigned Size = VT.getSizeInBits();
676   if (Size <= 32)
677     return MVT::i32;
678   return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
679 }
680 
681 MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
682   return MVT::i32;
683 }
684 
685 bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
686   return true;
687 }
688 
689 // The backend supports 32 and 64 bit floating point immediates.
690 // FIXME: Why are we reporting vectors of FP immediates as legal?
691 bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
692                                         bool ForCodeSize) const {
693   EVT ScalarVT = VT.getScalarType();
694   return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
695          (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
696 }
697 
698 // We don't want to shrink f64 / f32 constants.
699 bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
700   EVT ScalarVT = VT.getScalarType();
701   return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
702 }
703 
704 bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
705                                                  ISD::LoadExtType ExtTy,
706                                                  EVT NewVT) const {
707   // TODO: This may be worth removing. Check regression tests for diffs.
708   if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT))
709     return false;
710 
711   unsigned NewSize = NewVT.getStoreSizeInBits();
712 
713   // If we are reducing to a 32-bit load or a smaller multi-dword load,
714   // this is always better.
715   if (NewSize >= 32)
716     return true;
717 
718   EVT OldVT = N->getValueType(0);
719   unsigned OldSize = OldVT.getStoreSizeInBits();
720 
721   MemSDNode *MN = cast<MemSDNode>(N);
722   unsigned AS = MN->getAddressSpace();
723   // Do not shrink an aligned scalar load to sub-dword.
724   // Scalar engine cannot do sub-dword loads.
725   if (OldSize >= 32 && NewSize < 32 && MN->getAlignment() >= 4 &&
726       (AS == AMDGPUAS::CONSTANT_ADDRESS ||
727        AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
728        (isa<LoadSDNode>(N) &&
729         AS == AMDGPUAS::GLOBAL_ADDRESS && MN->isInvariant())) &&
730       AMDGPUInstrInfo::isUniformMMO(MN->getMemOperand()))
731     return false;
732 
733   // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
734   // extloads, so doing one requires using a buffer_load. In cases where we
735   // still couldn't use a scalar load, using the wider load shouldn't really
736   // hurt anything.
737 
738   // If the old size already had to be an extload, there's no harm in continuing
739   // to reduce the width.
740   return (OldSize < 32);
741 }
742 
743 bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy,
744                                                    const SelectionDAG &DAG,
745                                                    const MachineMemOperand &MMO) const {
746 
747   assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
748 
749   if (LoadTy.getScalarType() == MVT::i32)
750     return false;
751 
752   unsigned LScalarSize = LoadTy.getScalarSizeInBits();
753   unsigned CastScalarSize = CastTy.getScalarSizeInBits();
754 
755   if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
756     return false;
757 
758   bool Fast = false;
759   return allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
760                                         CastTy, MMO, &Fast) &&
761          Fast;
762 }
763 
764 // SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
765 // profitable with the expansion for 64-bit since it's generally good to
766 // speculate things.
767 // FIXME: These should really have the size as a parameter.
768 bool AMDGPUTargetLowering::isCheapToSpeculateCttz() const {
769   return true;
770 }
771 
772 bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const {
773   return true;
774 }
775 
bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode *N) const {
  switch (N->getOpcode()) {
  default:
    return false;
  case ISD::EntryToken:
  case ISD::TokenFactor:
    return true;
  case ISD::INTRINSIC_WO_CHAIN: {
    unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
    switch (IntrID) {
    default:
      return false;
    case Intrinsic::amdgcn_readfirstlane:
    case Intrinsic::amdgcn_readlane:
      return true;
    }
  }
  case ISD::LOAD:
    return cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
           AMDGPUAS::CONSTANT_ADDRESS_32BIT;
  }
}
805 
806 TargetLowering::NegatibleCost
807 AMDGPUTargetLowering::getNegatibleCost(SDValue Op, SelectionDAG &DAG,
808                                        bool LegalOperations, bool ForCodeSize,
809                                        unsigned Depth) const {
810   switch (Op.getOpcode()) {
811   case ISD::FMA:
812   case ISD::FMAD: {
813     // Negating a fma is not free if it has users without source mods.
814     if (!allUsesHaveSourceMods(Op.getNode()))
815       return NegatibleCost::Expensive;
816     break;
817   }
818   default:
819     break;
820   }
821 
822   return TargetLowering::getNegatibleCost(Op, DAG, LegalOperations, ForCodeSize,
823                                           Depth);
824 }
825 
826 //===---------------------------------------------------------------------===//
827 // Target Properties
828 //===---------------------------------------------------------------------===//
829 
830 bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
831   assert(VT.isFloatingPoint());
832 
833   // Packed operations do not have a fabs modifier.
834   return VT == MVT::f32 || VT == MVT::f64 ||
835          (Subtarget->has16BitInsts() && VT == MVT::f16);
836 }
837 
838 bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
839   assert(VT.isFloatingPoint());
840   return VT == MVT::f32 || VT == MVT::f64 ||
841          (Subtarget->has16BitInsts() && VT == MVT::f16) ||
842          (Subtarget->hasVOP3PInsts() && VT == MVT::v2f16);
843 }
844 
bool AMDGPUTargetLowering::storeOfVectorConstantIsCheap(EVT MemVT,
                                                        unsigned NumElem,
                                                        unsigned AS) const {
848   return true;
849 }
850 
851 bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
852   // There are few operations which truly have vector input operands. Any vector
853   // operation is going to involve operations on each component, and a
854   // build_vector will be a copy per element, so it always makes sense to use a
855   // build_vector input in place of the extracted element to avoid a copy into a
856   // super register.
857   //
858   // We should probably only do this if all users are extracts only, but this
859   // should be the common case.
860   return true;
861 }
862 
863 bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
864   // Truncate is just accessing a subregister.
865 
866   unsigned SrcSize = Source.getSizeInBits();
867   unsigned DestSize = Dest.getSizeInBits();
868 
  return DestSize < SrcSize && DestSize % 32 == 0;
870 }
871 
872 bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
873   // Truncate is just accessing a subregister.
874 
875   unsigned SrcSize = Source->getScalarSizeInBits();
876   unsigned DestSize = Dest->getScalarSizeInBits();
877 
  if (DestSize == 16 && Subtarget->has16BitInsts())
879     return SrcSize >= 32;
880 
881   return DestSize < SrcSize && DestSize % 32 == 0;
882 }
883 
884 bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
885   unsigned SrcSize = Src->getScalarSizeInBits();
886   unsigned DestSize = Dest->getScalarSizeInBits();
887 
888   if (SrcSize == 16 && Subtarget->has16BitInsts())
889     return DestSize >= 32;
890 
891   return SrcSize == 32 && DestSize == 64;
892 }
893 
894 bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
  // Any register load of a 64-bit value really requires 2 32-bit moves. For all
  // practical purposes, the extra mov 0 to load a 64-bit value is free. As
  // used, this will enable reducing 64-bit operations to 32-bit, which is
  // always good.
899 
900   if (Src == MVT::i16)
    return Dest == MVT::i32 || Dest == MVT::i64;
902 
903   return Src == MVT::i32 && Dest == MVT::i64;
904 }
905 
906 bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
907   return isZExtFree(Val.getValueType(), VT2);
908 }
909 
910 bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
911   // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
912   // limited number of native 64-bit operations. Shrinking an operation to fit
913   // in a single 32-bit register should always be helpful. As currently used,
914   // this is much less general than the name suggests, and is only used in
915   // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
916   // not profitable, and may actually be harmful.
917   return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
918 }
919 
920 //===---------------------------------------------------------------------===//
921 // TargetLowering Callbacks
922 //===---------------------------------------------------------------------===//
923 
924 CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
925                                                   bool IsVarArg) {
926   switch (CC) {
927   case CallingConv::AMDGPU_VS:
928   case CallingConv::AMDGPU_GS:
929   case CallingConv::AMDGPU_PS:
930   case CallingConv::AMDGPU_CS:
931   case CallingConv::AMDGPU_HS:
932   case CallingConv::AMDGPU_ES:
933   case CallingConv::AMDGPU_LS:
934     return CC_AMDGPU;
935   case CallingConv::C:
936   case CallingConv::Fast:
937   case CallingConv::Cold:
938     return CC_AMDGPU_Func;
939   case CallingConv::AMDGPU_KERNEL:
940   case CallingConv::SPIR_KERNEL:
941   default:
942     report_fatal_error("Unsupported calling convention for call");
943   }
944 }
945 
946 CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
947                                                     bool IsVarArg) {
948   switch (CC) {
949   case CallingConv::AMDGPU_KERNEL:
950   case CallingConv::SPIR_KERNEL:
951     llvm_unreachable("kernels should not be handled here");
952   case CallingConv::AMDGPU_VS:
953   case CallingConv::AMDGPU_GS:
954   case CallingConv::AMDGPU_PS:
955   case CallingConv::AMDGPU_CS:
956   case CallingConv::AMDGPU_HS:
957   case CallingConv::AMDGPU_ES:
958   case CallingConv::AMDGPU_LS:
959     return RetCC_SI_Shader;
960   case CallingConv::C:
961   case CallingConv::Fast:
962   case CallingConv::Cold:
963     return RetCC_AMDGPU_Func;
964   default:
965     report_fatal_error("Unsupported calling convention.");
966   }
967 }
968 
/// The SelectionDAGBuilder will automatically promote function arguments
/// with illegal types.  However, this does not work for the AMDGPU targets
/// since the function arguments are stored in memory as these illegal types.
/// In order to handle this properly we need to get the original type sizes
/// from the LLVM IR Function and fixup the ISD::InputArg values before
/// passing them to AnalyzeFormalArguments().
975 
976 /// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
977 /// input values across multiple registers.  Each item in the Ins array
978 /// represents a single value that will be stored in registers.  Ins[x].VT is
979 /// the value type of the value that will be stored in the register, so
980 /// whatever SDNode we lower the argument to needs to be this type.
981 ///
982 /// In order to correctly lower the arguments we need to know the size of each
983 /// argument.  Since Ins[x].VT gives us the size of the register that will
984 /// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
/// for the original function argument so that we can deduce the correct memory
986 /// type to use for Ins[x].  In most cases the correct memory type will be
987 /// Ins[x].ArgVT.  However, this will not always be the case.  If, for example,
988 /// we have a kernel argument of type v8i8, this argument will be split into
989 /// 8 parts and each part will be represented by its own item in the Ins array.
990 /// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
991 /// the argument before it was split.  From this, we deduce that the memory type
992 /// for each individual part is i8.  We pass the memory type as LocVT to the
993 /// calling convention analysis function and the register type (Ins[x].VT) as
994 /// the ValVT.
995 void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
996   CCState &State,
997   const SmallVectorImpl<ISD::InputArg> &Ins) const {
998   const MachineFunction &MF = State.getMachineFunction();
999   const Function &Fn = MF.getFunction();
1000   LLVMContext &Ctx = Fn.getParent()->getContext();
1001   const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
1002   const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset(Fn);
1003   CallingConv::ID CC = Fn.getCallingConv();
1004 
1005   unsigned MaxAlign = 1;
1006   uint64_t ExplicitArgOffset = 0;
1007   const DataLayout &DL = Fn.getParent()->getDataLayout();
1008 
1009   unsigned InIndex = 0;
1010 
1011   for (const Argument &Arg : Fn.args()) {
1012     Type *BaseArgTy = Arg.getType();
1013     unsigned Align = DL.getABITypeAlignment(BaseArgTy);
1014     MaxAlign = std::max(Align, MaxAlign);
1015     unsigned AllocSize = DL.getTypeAllocSize(BaseArgTy);
1016 
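    // Compute this argument's in-memory offset: align the running offset to
    // the argument's ABI alignment, then add the target's explicit kernel
    // argument offset.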
1017     uint64_t ArgOffset = alignTo(ExplicitArgOffset, Align) + ExplicitOffset;
1018     ExplicitArgOffset = alignTo(ExplicitArgOffset, Align) + AllocSize;
1019 
1020     // We're basically throwing away everything passed into us and starting over
1021     // to get accurate in-memory offsets. The "PartOffset" is completely useless
1022     // to us as computed in Ins.
1023     //
1024     // We also need to figure out what type legalization is trying to do to get
1025     // the correct memory offsets.
1026 
1027     SmallVector<EVT, 16> ValueVTs;
1028     SmallVector<uint64_t, 16> Offsets;
1029     ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);
1030 
1031     for (unsigned Value = 0, NumValues = ValueVTs.size();
1032          Value != NumValues; ++Value) {
1033       uint64_t BasePartOffset = Offsets[Value];
1034 
1035       EVT ArgVT = ValueVTs[Value];
1036       EVT MemVT = ArgVT;
1037       MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
1038       unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);
1039 
1040       if (NumRegs == 1) {
1041         // This argument is not split, so the IR type is the memory type.
1042         if (ArgVT.isExtended()) {
1043           // We have an extended type, like i24, so we should just use the
1044           // register type.
1045           MemVT = RegisterVT;
1046         } else {
1047           MemVT = ArgVT;
1048         }
1049       } else if (ArgVT.isVector() && RegisterVT.isVector() &&
1050                  ArgVT.getScalarType() == RegisterVT.getScalarType()) {
1051         assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
1052         // We have a vector value which has been split into a vector with
1053         // the same scalar type, but fewer elements.  This should handle
1054         // all the floating-point vector types.
1055         MemVT = RegisterVT;
1056       } else if (ArgVT.isVector() &&
1057                  ArgVT.getVectorNumElements() == NumRegs) {
1058         // This arg has been split so that each element is stored in a separate
1059         // register.
1060         MemVT = ArgVT.getScalarType();
1061       } else if (ArgVT.isExtended()) {
1062         // We have an extended type, like i65.
1063         MemVT = RegisterVT;
1064       } else {
1065         unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
1066         assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
1067         if (RegisterVT.isInteger()) {
1068           MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
1069         } else if (RegisterVT.isVector()) {
1070           assert(!RegisterVT.getScalarType().isFloatingPoint());
1071           unsigned NumElements = RegisterVT.getVectorNumElements();
1072           assert(MemoryBits % NumElements == 0);
1073           // This vector type has been split into another vector type with
          // a different element size.
1075           EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
1076                                            MemoryBits / NumElements);
1077           MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
1078         } else {
1079           llvm_unreachable("cannot deduce memory type.");
1080         }
1081       }
1082 
1083       // Convert one element vectors to scalar.
1084       if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
1085         MemVT = MemVT.getScalarType();
1086 
1087       // Round up vec3/vec5 argument.
1088       if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
1089         assert(MemVT.getVectorNumElements() == 3 ||
1090                MemVT.getVectorNumElements() == 5);
1091         MemVT = MemVT.getPow2VectorType(State.getContext());
1092       } else if (!MemVT.isSimple() && !MemVT.isVector()) {
1093         MemVT = MemVT.getRoundIntegerType(State.getContext());
1094       }
1095 
1096       unsigned PartOffset = 0;
1097       for (unsigned i = 0; i != NumRegs; ++i) {
1098         State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
1099                                                BasePartOffset + PartOffset,
1100                                                MemVT.getSimpleVT(),
1101                                                CCValAssign::Full));
1102         PartOffset += MemVT.getStoreSize();
1103       }
1104     }
1105   }
1106 }
1107 
1108 SDValue AMDGPUTargetLowering::LowerReturn(
1109   SDValue Chain, CallingConv::ID CallConv,
1110   bool isVarArg,
1111   const SmallVectorImpl<ISD::OutputArg> &Outs,
1112   const SmallVectorImpl<SDValue> &OutVals,
1113   const SDLoc &DL, SelectionDAG &DAG) const {
1114   // FIXME: Fails for r600 tests
1115   //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
1116   // "wave terminate should not have return values");
1117   return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
1118 }
1119 
1120 //===---------------------------------------------------------------------===//
1121 // Target specific lowering
1122 //===---------------------------------------------------------------------===//
1123 
1124 /// Selects the correct CCAssignFn for a given CallingConvention value.
1125 CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
1126                                                     bool IsVarArg) {
1127   return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
1128 }
1129 
1130 CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
1131                                                       bool IsVarArg) {
1132   return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
1133 }
1134 
1135 SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain,
1136                                                   SelectionDAG &DAG,
1137                                                   MachineFrameInfo &MFI,
1138                                                   int ClobberedFI) const {
1139   SmallVector<SDValue, 8> ArgChains;
1140   int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
1141   int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
1142 
1143   // Include the original chain at the beginning of the list. When this is
1144   // used by target LowerCall hooks, this helps legalize find the
1145   // CALLSEQ_BEGIN node.
1146   ArgChains.push_back(Chain);
1147 
  // Add a chain value for each incoming argument load that may overlap the
  // clobbered outgoing argument slot.
1149   for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
1150                             UE = DAG.getEntryNode().getNode()->use_end();
1151        U != UE; ++U) {
1152     if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U)) {
1153       if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
1154         if (FI->getIndex() < 0) {
1155           int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
1156           int64_t InLastByte = InFirstByte;
1157           InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
1158 
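          // If the byte ranges of the incoming argument load and the
          // clobbered outgoing slot overlap, record the load's chain so the
          // store is ordered after it.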
1159           if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
1160               (FirstByte <= InFirstByte && InFirstByte <= LastByte))
1161             ArgChains.push_back(SDValue(L, 1));
1162         }
1163       }
1164     }
1165   }
1166 
1167   // Build a tokenfactor for all the chains.
1168   return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
1169 }
1170 
1171 SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
1172                                                  SmallVectorImpl<SDValue> &InVals,
1173                                                  StringRef Reason) const {
1174   SDValue Callee = CLI.Callee;
1175   SelectionDAG &DAG = CLI.DAG;
1176 
1177   const Function &Fn = DAG.getMachineFunction().getFunction();
1178 
1179   StringRef FuncName("<unknown>");
1180 
1181   if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
1182     FuncName = G->getSymbol();
1183   else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
1184     FuncName = G->getGlobal()->getName();
1185 
1186   DiagnosticInfoUnsupported NoCalls(
1187     Fn, Reason + FuncName, CLI.DL.getDebugLoc());
1188   DAG.getContext()->diagnose(NoCalls);
1189 
1190   if (!CLI.IsTailCall) {
1191     for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
1192       InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
1193   }
1194 
1195   return DAG.getEntryNode();
1196 }
1197 
1198 SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
1199                                         SmallVectorImpl<SDValue> &InVals) const {
1200   return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
1201 }
1202 
1203 SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
1204                                                       SelectionDAG &DAG) const {
1205   const Function &Fn = DAG.getMachineFunction().getFunction();
1206 
1207   DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
1208                                             SDLoc(Op).getDebugLoc());
1209   DAG.getContext()->diagnose(NoDynamicAlloca);
1210   auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
1211   return DAG.getMergeValues(Ops, SDLoc());
1212 }
1213 
1214 SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
1215                                              SelectionDAG &DAG) const {
1216   switch (Op.getOpcode()) {
1217   default:
1218     Op->print(errs(), &DAG);
    llvm_unreachable("Custom lowering code for this "
                     "instruction is not implemented yet!");
1221     break;
1222   case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
1223   case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
1224   case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
1225   case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
1226   case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
1227   case ISD::FREM: return LowerFREM(Op, DAG);
1228   case ISD::FCEIL: return LowerFCEIL(Op, DAG);
1229   case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
1230   case ISD::FRINT: return LowerFRINT(Op, DAG);
1231   case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
1232   case ISD::FROUND: return LowerFROUND(Op, DAG);
1233   case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
1234   case ISD::FLOG:
1235     return LowerFLOG(Op, DAG, 1.0F / numbers::log2ef);
1236   case ISD::FLOG10:
1237     return LowerFLOG(Op, DAG, numbers::ln2f / numbers::ln10f);
1238   case ISD::FEXP:
1239     return lowerFEXP(Op, DAG);
1240   case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
1241   case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
1242   case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
1243   case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
1244   case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
1245   case ISD::CTTZ:
1246   case ISD::CTTZ_ZERO_UNDEF:
1247   case ISD::CTLZ:
1248   case ISD::CTLZ_ZERO_UNDEF:
1249     return LowerCTLZ_CTTZ(Op, DAG);
1250   case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
1251   }
1252   return Op;
1253 }
1254 
1255 void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
1256                                               SmallVectorImpl<SDValue> &Results,
1257                                               SelectionDAG &DAG) const {
1258   switch (N->getOpcode()) {
1259   case ISD::SIGN_EXTEND_INREG:
1260     // Different parts of legalization seem to interpret which type of
1261     // sign_extend_inreg is the one to check for custom lowering. The extended
1262     // from type is what really matters, but some places check for custom
1263     // lowering of the result type. This results in trying to use
1264     // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
1265     // nothing here and let the illegal result integer be handled normally.
1266     return;
1267   default:
1268     return;
1269   }
1270 }
1271 
1272 bool AMDGPUTargetLowering::hasDefinedInitializer(const GlobalValue *GV) {
1273   const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
1274   if (!GVar || !GVar->hasInitializer())
1275     return false;
1276 
1277   return !isa<UndefValue>(GVar->getInitializer());
1278 }
1279 
1280 SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
1281                                                  SDValue Op,
1282                                                  SelectionDAG &DAG) const {
1283 
1284   const DataLayout &DL = DAG.getDataLayout();
1285   GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
1286   const GlobalValue *GV = G->getGlobal();
1287 
1288   if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1289       G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
1290     if (!MFI->isEntryFunction()) {
1291       SDLoc DL(Op);
1292       const Function &Fn = DAG.getMachineFunction().getFunction();
1293       DiagnosticInfoUnsupported BadLDSDecl(
1294         Fn, "local memory global used by non-kernel function",
1295         DL.getDebugLoc(), DS_Warning);
1296       DAG.getContext()->diagnose(BadLDSDecl);
1297 
1298       // We currently don't have a way to correctly allocate LDS objects that
1299       // aren't directly associated with a kernel. We do force inlining of
1300       // functions that use local objects. However, if these dead functions are
1301       // not eliminated, we don't want a compile time error. Just emit a warning
1302       // and a trap, since there should be no callable path here.
1303       SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
1304       SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
1305                                         Trap, DAG.getRoot());
1306       DAG.setRoot(OutputChain);
1307       return DAG.getUNDEF(Op.getValueType());
1308     }
1309 
1310     // XXX: What does the value of G->getOffset() mean?
1311   assert(G->getOffset() == 0 &&
1312          "Do not know what to do with a non-zero offset");
1313 
1314     // TODO: We could emit code to handle the initialization somewhere.
1315     if (!hasDefinedInitializer(GV)) {
1316       unsigned Offset = MFI->allocateLDSGlobal(DL, *GV);
1317       return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
1318     }
1319   }
1320 
1321   const Function &Fn = DAG.getMachineFunction().getFunction();
1322   DiagnosticInfoUnsupported BadInit(
1323       Fn, "unsupported initializer for address space", SDLoc(Op).getDebugLoc());
1324   DAG.getContext()->diagnose(BadInit);
1325   return SDValue();
1326 }
1327 
1328 SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
1329                                                   SelectionDAG &DAG) const {
1330   SmallVector<SDValue, 8> Args;
1331 
1332   EVT VT = Op.getValueType();
1333   if (VT == MVT::v4i16 || VT == MVT::v4f16) {
1334     SDLoc SL(Op);
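    // Concatenating two packed 16-bit vectors is just packing both 32-bit
    // halves into a v2i32 and bitcasting the result to the wide type.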
1335     SDValue Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(0));
1336     SDValue Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(1));
1337 
1338     SDValue BV = DAG.getBuildVector(MVT::v2i32, SL, { Lo, Hi });
1339     return DAG.getNode(ISD::BITCAST, SL, VT, BV);
1340   }
1341 
1342   for (const SDUse &U : Op->ops())
1343     DAG.ExtractVectorElements(U.get(), Args);
1344 
1345   return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
1346 }
1347 
1348 SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
1349                                                      SelectionDAG &DAG) const {
1350 
1351   SmallVector<SDValue, 8> Args;
1352   unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
1353   EVT VT = Op.getValueType();
1354   DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
1355                             VT.getVectorNumElements());
1356 
1357   return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
1358 }
1359 
1360 /// Generate Min/Max node
1361 SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
1362                                                    SDValue LHS, SDValue RHS,
1363                                                    SDValue True, SDValue False,
1364                                                    SDValue CC,
1365                                                    DAGCombinerInfo &DCI) const {
1366   if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
1367     return SDValue();
1368 
1369   SelectionDAG &DAG = DCI.DAG;
1370   ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1371   switch (CCOpcode) {
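  // Equality-style, always-true/false, and ordered/unordered-only predicates
  // do not correspond to a min or max, so fall through and bail out below.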
1372   case ISD::SETOEQ:
1373   case ISD::SETONE:
1374   case ISD::SETUNE:
1375   case ISD::SETNE:
1376   case ISD::SETUEQ:
1377   case ISD::SETEQ:
1378   case ISD::SETFALSE:
1379   case ISD::SETFALSE2:
1380   case ISD::SETTRUE:
1381   case ISD::SETTRUE2:
1382   case ISD::SETUO:
1383   case ISD::SETO:
1384     break;
1385   case ISD::SETULE:
1386   case ISD::SETULT: {
1387     if (LHS == True)
1388       return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1389     return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1390   }
1391   case ISD::SETOLE:
1392   case ISD::SETOLT:
1393   case ISD::SETLE:
1394   case ISD::SETLT: {
1395     // Ordered. Assume ordered for undefined.
1396 
1397     // Only do this after legalization to avoid interfering with other combines
1398     // which might occur.
1399     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1400         !DCI.isCalledByLegalizer())
1401       return SDValue();
1402 
1403     // We need to permute the operands to get the correct NaN behavior. The
1404     // hardware returns the second operand when the compare with a NaN fails,
1405     // so order the operands based on the compare type the hardware will use.
1406     if (LHS == True)
1407       return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1408     return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1409   }
1410   case ISD::SETUGE:
1411   case ISD::SETUGT: {
1412     if (LHS == True)
1413       return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1414     return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1415   }
1416   case ISD::SETGT:
1417   case ISD::SETGE:
1418   case ISD::SETOGE:
1419   case ISD::SETOGT: {
1420     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1421         !DCI.isCalledByLegalizer())
1422       return SDValue();
1423 
1424     if (LHS == True)
1425       return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1426     return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1427   }
1428   case ISD::SETCC_INVALID:
1429     llvm_unreachable("Invalid setcc condcode!");
1430   }
1431   return SDValue();
1432 }
1433 
1434 std::pair<SDValue, SDValue>
1435 AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
1436   SDLoc SL(Op);
1437 
1438   SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1439 
1440   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1441   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1442 
1443   SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1444   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1445 
1446   return std::make_pair(Lo, Hi);
1447 }
1448 
1449 SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
1450   SDLoc SL(Op);
1451 
1452   SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1453   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1454   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1455 }
1456 
1457 SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
1458   SDLoc SL(Op);
1459 
1460   SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1461   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1462   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1463 }
1464 
1465 // Split a vector type into two parts. The first part is a power of two vector.
1466 // The second part is whatever is left over, and is a scalar if it would
1467 // otherwise be a 1-vector.
1468 std::pair<EVT, EVT>
1469 AMDGPUTargetLowering::getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const {
1470   EVT LoVT, HiVT;
1471   EVT EltVT = VT.getVectorElementType();
1472   unsigned NumElts = VT.getVectorNumElements();
1473   unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
1474   LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
1475   HiVT = NumElts - LoNumElts == 1
1476              ? EltVT
1477              : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
1478   return std::make_pair(LoVT, HiVT);
1479 }
1480 
1481 // Split a vector value into two parts of types LoVT and HiVT. HiVT could be
1482 // scalar.
1483 std::pair<SDValue, SDValue>
1484 AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL,
1485                                   const EVT &LoVT, const EVT &HiVT,
1486                                   SelectionDAG &DAG) const {
1487   assert(LoVT.getVectorNumElements() +
1488                  (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
1489              N.getValueType().getVectorNumElements() &&
1490          "More vector elements requested than available!");
1491   SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
1492                            DAG.getVectorIdxConstant(0, DL));
1493   SDValue Hi = DAG.getNode(
1494       HiVT.isVector() ? ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT, DL,
1495       HiVT, N, DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), DL));
1496   return std::make_pair(Lo, Hi);
1497 }
1498 
1499 SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
1500                                               SelectionDAG &DAG) const {
1501   LoadSDNode *Load = cast<LoadSDNode>(Op);
1502   EVT VT = Op.getValueType();
1503   SDLoc SL(Op);
1504
1506   // If this is a 2 element vector, we really want to scalarize and not create
1507   // weird 1 element vectors.
1508   if (VT.getVectorNumElements() == 2) {
1509     SDValue Ops[2];
1510     std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
1511     return DAG.getMergeValues(Ops, SL);
1512   }
1513 
1514   SDValue BasePtr = Load->getBasePtr();
1515   EVT MemVT = Load->getMemoryVT();
1516 
1517   const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1518 
1519   EVT LoVT, HiVT;
1520   EVT LoMemVT, HiMemVT;
1521   SDValue Lo, Hi;
1522 
1523   std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1524   std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1525   std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);
1526 
1527   unsigned Size = LoMemVT.getStoreSize();
1528   unsigned BaseAlign = Load->getAlignment();
1529   unsigned HiAlign = MinAlign(BaseAlign, Size);
1530 
1531   SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
1532                                   Load->getChain(), BasePtr, SrcValue, LoMemVT,
1533                                   BaseAlign, Load->getMemOperand()->getFlags());
1534   SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, Size);
1535   SDValue HiLoad =
1536       DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
1537                      HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
1538                      HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
1539 
1540   SDValue Join;
1541   if (LoVT == HiVT) {
1542     // The vector element count was a power of two, so it was split evenly.
1543     Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
1544   } else {
1545     Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getUNDEF(VT), LoLoad,
1546                        DAG.getVectorIdxConstant(0, SL));
1547     Join = DAG.getNode(
1548         HiVT.isVector() ? ISD::INSERT_SUBVECTOR : ISD::INSERT_VECTOR_ELT, SL,
1549         VT, Join, HiLoad,
1550         DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), SL));
1551   }
1552 
1553   SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
1554                                      LoLoad.getValue(1), HiLoad.getValue(1))};
1555 
1556   return DAG.getMergeValues(Ops, SL);
1557 }
1558 
1559 // Widen a vector load from vec3 to vec4.
1560 SDValue AMDGPUTargetLowering::WidenVectorLoad(SDValue Op,
1561                                               SelectionDAG &DAG) const {
1562   LoadSDNode *Load = cast<LoadSDNode>(Op);
1563   EVT VT = Op.getValueType();
1564   assert(VT.getVectorNumElements() == 3);
1565   SDValue BasePtr = Load->getBasePtr();
1566   EVT MemVT = Load->getMemoryVT();
1567   SDLoc SL(Op);
1568   const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1569   unsigned BaseAlign = Load->getAlignment();
1570 
1571   EVT WideVT =
1572       EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
1573   EVT WideMemVT =
1574       EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(), 4);
1575   SDValue WideLoad = DAG.getExtLoad(
1576       Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
1577       WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
1578   return DAG.getMergeValues(
1579       {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
1580                    DAG.getVectorIdxConstant(0, SL)),
1581        WideLoad.getValue(1)},
1582       SL);
1583 }
1584 
1585 SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
1586                                                SelectionDAG &DAG) const {
1587   StoreSDNode *Store = cast<StoreSDNode>(Op);
1588   SDValue Val = Store->getValue();
1589   EVT VT = Val.getValueType();
1590 
1591   // If this is a 2 element vector, we really want to scalarize and not create
1592   // weird 1 element vectors.
1593   if (VT.getVectorNumElements() == 2)
1594     return scalarizeVectorStore(Store, DAG);
1595 
1596   EVT MemVT = Store->getMemoryVT();
1597   SDValue Chain = Store->getChain();
1598   SDValue BasePtr = Store->getBasePtr();
1599   SDLoc SL(Op);
1600 
1601   EVT LoVT, HiVT;
1602   EVT LoMemVT, HiMemVT;
1603   SDValue Lo, Hi;
1604 
1605   std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1606   std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1607   std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);
1608 
1609   SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
1610 
1611   const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1612   unsigned BaseAlign = Store->getAlignment();
1613   unsigned Size = LoMemVT.getStoreSize();
1614   unsigned HiAlign = MinAlign(BaseAlign, Size);
1615 
1616   SDValue LoStore =
1617       DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
1618                         Store->getMemOperand()->getFlags());
1619   SDValue HiStore =
1620       DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
1621                         HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
1622 
1623   return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
1624 }
1625 
1626 // This is a shortcut for integer division because we have fast i32<->f32
1627 // conversions and fast f32 reciprocal instructions. The 24-bit significand of
1628 // an f32 is enough to exactly represent a 24-bit signed integer.
1629 SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
1630                                             bool Sign) const {
1631   SDLoc DL(Op);
1632   EVT VT = Op.getValueType();
1633   SDValue LHS = Op.getOperand(0);
1634   SDValue RHS = Op.getOperand(1);
1635   MVT IntVT = MVT::i32;
1636   MVT FltVT = MVT::f32;
1637 
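  // Both operands must fit in 24 bits: a 32-bit value with at least 9 known
  // sign bits has at most 32 - 9 + 1 = 24 significant bits.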
1638   unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
1639   if (LHSSignBits < 9)
1640     return SDValue();
1641 
1642   unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
1643   if (RHSSignBits < 9)
1644     return SDValue();
1645 
1646   unsigned BitSize = VT.getSizeInBits();
1647   unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1648   unsigned DivBits = BitSize - SignBits;
1649   if (Sign)
1650     ++DivBits;
1651 
1652   ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
1653   ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
1654 
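  // jq is the +/-1 correction added when the truncated float quotient fell
  // one step short of the exact quotient; it is always +1 in the unsigned
  // case, and takes the sign of the quotient (ia ^ ib) in the signed case.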
1655   SDValue jq = DAG.getConstant(1, DL, IntVT);
1656 
1657   if (Sign) {
1658     // char|short jq = ia ^ ib;
1659     jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
1660 
1661     // jq = jq >> (bitsize - 2)
1662     jq = DAG.getNode(ISD::SRA, DL, VT, jq,
1663                      DAG.getConstant(BitSize - 2, DL, VT));
1664 
1665     // jq = jq | 0x1
1666     jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
1667   }
1668 
1669   // int ia = (int)LHS;
1670   SDValue ia = LHS;
1671 
1672   // int ib = (int)RHS;
1673   SDValue ib = RHS;
1674 
1675   // float fa = (float)ia;
1676   SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
1677 
1678   // float fb = (float)ib;
1679   SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
1680 
1681   SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
1682                            fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
1683 
1684   // fq = trunc(fq);
1685   fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
1686 
1687   // float fqneg = -fq;
1688   SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
1689 
1690   MachineFunction &MF = DAG.getMachineFunction();
1691   const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
1692 
1693   // float fr = mad(fqneg, fb, fa);
1694   unsigned OpCode = !MFI->getMode().allFP32Denormals() ?
1695                     (unsigned)ISD::FMAD :
1696                     (unsigned)AMDGPUISD::FMAD_FTZ;
1697 
1698   SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
1699 
1700   // int iq = (int)fq;
1701   SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
1702 
1703   // fr = fabs(fr);
1704   fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
1705 
1706   // fb = fabs(fb);
1707   fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
1708 
1709   EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
1710 
1711   // int cv = fr >= fb;
1712   SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
1713 
1714   // jq = (cv ? jq : 0);
1715   jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
1716 
1717   // dst = iq + jq;
1718   SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
1719 
1720   // Rem needs compensation; it's easier to recompute it.
1721   SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
1722   Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
1723 
1724   // Truncate to number of bits this divide really is.
1725   if (Sign) {
1726     SDValue InRegSize
1727       = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
1728     Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
1729     Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
1730   } else {
1731     SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
1732     Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
1733     Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
1734   }
1735 
1736   return DAG.getMergeValues({ Div, Rem }, DL);
1737 }
1738 
1739 void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
1740                                       SelectionDAG &DAG,
1741                                       SmallVectorImpl<SDValue> &Results) const {
1742   SDLoc DL(Op);
1743   EVT VT = Op.getValueType();
1744 
1745   assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
1746 
1747   EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
1748 
1749   SDValue One = DAG.getConstant(1, DL, HalfVT);
1750   SDValue Zero = DAG.getConstant(0, DL, HalfVT);
1751 
1752   // HiLo split
1753   SDValue LHS = Op.getOperand(0);
1754   SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
1755   SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, One);
1756 
1757   SDValue RHS = Op.getOperand(1);
1758   SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
1759   SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, One);
1760 
1761   if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
1762       DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
1763 
1764     SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
1765                               LHS_Lo, RHS_Lo);
1766 
1767     SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
1768     SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
1769 
1770     Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
1771     Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
1772     return;
1773   }
1774 
1775   if (isTypeLegal(MVT::i64)) {
1776     MachineFunction &MF = DAG.getMachineFunction();
1777     const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1778 
1779     // Compute denominator reciprocal.
1780     unsigned FMAD = !MFI->getMode().allFP32Denormals() ?
1781                     (unsigned)ISD::FMAD :
1782                     (unsigned)AMDGPUISD::FMAD_FTZ;
1783
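    // Build an initial 64-bit fixed-point estimate of 2^64 / RHS from an f32
    // reciprocal. The magic constants are f32 scale factors: 0x4f800000 is
    // 2.0^32, 0x5f7ffffc is just below 2.0^64, 0x2f800000 is 2.0^-32, and
    // 0xcf800000 is -2.0^32. The estimate is refined by the two mulhu/addcarry
    // rounds below before the quotient is formed as mulhu(LHS, estimate).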
1785     SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
1786     SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
1787     SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
1788       DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
1789       Cvt_Lo);
1790     SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
1791     SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
1792       DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
1793     SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
1794       DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
1795     SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
1796     SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
1797       DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
1798       Mul1);
1799     SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
1800     SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
1801     SDValue Rcp64 = DAG.getBitcast(VT,
1802                         DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
1803 
1804     SDValue Zero64 = DAG.getConstant(0, DL, VT);
1805     SDValue One64  = DAG.getConstant(1, DL, VT);
1806     SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
1807     SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
1808 
1809     SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
1810     SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
1811     SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
1812     SDValue Mulhi1_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1,
1813                                     Zero);
1814     SDValue Mulhi1_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1,
1815                                     One);
1816 
1817     SDValue Add1_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Lo,
1818                                   Mulhi1_Lo, Zero1);
1819     SDValue Add1_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Hi,
1820                                   Mulhi1_Hi, Add1_Lo.getValue(1));
1821     SDValue Add1_HiNc = DAG.getNode(ISD::ADD, DL, HalfVT, Rcp_Hi, Mulhi1_Hi);
1822     SDValue Add1 = DAG.getBitcast(VT,
1823                         DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
1824 
1825     SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
1826     SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
1827     SDValue Mulhi2_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2,
1828                                     Zero);
1829     SDValue Mulhi2_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2,
1830                                     One);
1831 
1832     SDValue Add2_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_Lo,
1833                                   Mulhi2_Lo, Zero1);
1834     SDValue Add2_HiC = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_HiNc,
1835                                    Mulhi2_Hi, Add1_Lo.getValue(1));
1836     SDValue Add2_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add2_HiC,
1837                                   Zero, Add2_Lo.getValue(1));
1838     SDValue Add2 = DAG.getBitcast(VT,
1839                         DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
1840     SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
1841 
1842     SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
1843 
1844     SDValue Mul3_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, Zero);
1845     SDValue Mul3_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, One);
1846     SDValue Sub1_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Lo,
1847                                   Mul3_Lo, Zero1);
1848     SDValue Sub1_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Hi,
1849                                   Mul3_Hi, Sub1_Lo.getValue(1));
1850     SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
1851     SDValue Sub1 = DAG.getBitcast(VT,
1852                         DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
1853 
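    // C3 is the 64-bit unsigned comparison Sub1 >= RHS built from the two
    // halves: compare the high halves, and fall back to the low-half compare
    // when the high halves are equal.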
1854     SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
1855     SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
1856                                  ISD::SETUGE);
1857     SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
1858                                  ISD::SETUGE);
1859     SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);
1860 
1861     // TODO: Here and below, portions of the code could be enclosed in an
1862     // if/endif. Currently the control flow is unconditional and we have 4
1863     // selects after the potential endif to substitute for PHIs.
1864 
1865     // if C3 != 0 ...
1866     SDValue Sub2_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Lo,
1867                                   RHS_Lo, Zero1);
1868     SDValue Sub2_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Mi,
1869                                   RHS_Hi, Sub1_Lo.getValue(1));
1870     SDValue Sub2_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
1871                                   Zero, Sub2_Lo.getValue(1));
1872     SDValue Sub2 = DAG.getBitcast(VT,
1873                         DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
1874 
1875     SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);
1876 
1877     SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
1878                                  ISD::SETUGE);
1879     SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
1880                                  ISD::SETUGE);
1881     SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);
1882 
1883     // if (C6 != 0)
1884     SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
1885 
1886     SDValue Sub3_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Lo,
1887                                   RHS_Lo, Zero1);
1888     SDValue Sub3_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
1889                                   RHS_Hi, Sub2_Lo.getValue(1));
1890     SDValue Sub3_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub3_Mi,
1891                                   Zero, Sub3_Lo.getValue(1));
1892     SDValue Sub3 = DAG.getBitcast(VT,
1893                         DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
1894 
1895     // endif C6
1896     // endif C3
1897 
1898     SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
1899     SDValue Div  = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);
1900 
1901     SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
1902     SDValue Rem  = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);
1903 
1904     Results.push_back(Div);
1905     Results.push_back(Rem);
1906 
1907     return;
1908   }
1909 
1910   // R600 expansion.
1911   // Get Speculative values
1912   SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
1913   SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
1914 
1915   SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
1916   SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
1917   REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
1918 
1919   SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
1920   SDValue DIV_Lo = Zero;
1921 
1922   const unsigned halfBitWidth = HalfVT.getSizeInBits();
1923 
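  // Restoring long division, one bit per iteration: shift the remainder left,
  // bring in the next bit of LHS_Lo, and whenever the remainder is at least
  // RHS, subtract it and set the corresponding quotient bit.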
1924   for (unsigned i = 0; i < halfBitWidth; ++i) {
1925     const unsigned bitPos = halfBitWidth - i - 1;
1926     SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
1927     // Get value of high bit
1928     SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
1929     HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
1930     HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
1931 
1932     // Shift
1933     REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
1934     // Add LHS high bit
1935     REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
1936 
1937     SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
1938     SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);
1939 
1940     DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
1941 
1942     // Update REM
1943     SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
1944     REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
1945   }
1946 
1947   SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
1948   DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
1949   Results.push_back(DIV);
1950   Results.push_back(REM);
1951 }
1952 
1953 SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
1954                                            SelectionDAG &DAG) const {
1955   SDLoc DL(Op);
1956   EVT VT = Op.getValueType();
1957 
1958   if (VT == MVT::i64) {
1959     SmallVector<SDValue, 2> Results;
1960     LowerUDIVREM64(Op, DAG, Results);
1961     return DAG.getMergeValues(Results, DL);
1962   }
1963 
1964   if (VT == MVT::i32) {
1965     if (SDValue Res = LowerDIVREM24(Op, DAG, false))
1966       return Res;
1967   }
1968 
1969   SDValue Num = Op.getOperand(0);
1970   SDValue Den = Op.getOperand(1);
1971 
1972   // RCP =  URECIP(Den) = 2^32 / Den + e
1973   // e is rounding error.
1974   SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den);
1975 
1976   // RCP_LO = mul(RCP, Den)
1977   SDValue RCP_LO = DAG.getNode(ISD::MUL, DL, VT, RCP, Den);
1978 
1979   // RCP_HI = mulhu(RCP, Den)
1980   SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den);
1981 
1982   // NEG_RCP_LO = -RCP_LO
1983   SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
1984                                                      RCP_LO);
1985 
1986   // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
1987   SDValue ABS_RCP_LO = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT),
1988                                            NEG_RCP_LO, RCP_LO,
1989                                            ISD::SETEQ);
1990   // Calculate the rounding error from the URECIP instruction
1991   // E = mulhu(ABS_RCP_LO, RCP)
1992   SDValue E = DAG.getNode(ISD::MULHU, DL, VT, ABS_RCP_LO, RCP);
1993 
1994   // RCP_A_E = RCP + E
1995   SDValue RCP_A_E = DAG.getNode(ISD::ADD, DL, VT, RCP, E);
1996 
1997   // RCP_S_E = RCP - E
1998   SDValue RCP_S_E = DAG.getNode(ISD::SUB, DL, VT, RCP, E);
1999 
2000   // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_S_E)
2001   SDValue Tmp0 = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT),
2002                                      RCP_A_E, RCP_S_E,
2003                                      ISD::SETEQ);
2004   // Quotient = mulhu(Tmp0, Num)
2005   SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num);
2006 
2007   // Num_S_Remainder = Quotient * Den
2008   SDValue Num_S_Remainder = DAG.getNode(ISD::MUL, DL, VT, Quotient, Den);
2009 
2010   // Remainder = Num - Num_S_Remainder
2011   SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder);
2012 
2013   // Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
2014   SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den,
2015                                                  DAG.getConstant(-1, DL, VT),
2016                                                  DAG.getConstant(0, DL, VT),
2017                                                  ISD::SETUGE);
2018   // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0)
2019   SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Num,
2020                                                   Num_S_Remainder,
2021                                                   DAG.getConstant(-1, DL, VT),
2022                                                   DAG.getConstant(0, DL, VT),
2023                                                   ISD::SETUGE);
2024   // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
2025   SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den,
2026                                                Remainder_GE_Zero);
2027 
2028   // Calculate Division result:
2029 
2030   // Quotient_A_One = Quotient + 1
2031   SDValue Quotient_A_One = DAG.getNode(ISD::ADD, DL, VT, Quotient,
2032                                        DAG.getConstant(1, DL, VT));
2033 
2034   // Quotient_S_One = Quotient - 1
2035   SDValue Quotient_S_One = DAG.getNode(ISD::SUB, DL, VT, Quotient,
2036                                        DAG.getConstant(1, DL, VT));
2037 
2038   // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
2039   SDValue Div = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT),
2040                                      Quotient, Quotient_A_One, ISD::SETEQ);
2041 
2042   // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
2043   Div = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT),
2044                             Quotient_S_One, Div, ISD::SETEQ);
2045 
2046   // Calculate Rem result:
2047 
2048   // Remainder_S_Den = Remainder - Den
2049   SDValue Remainder_S_Den = DAG.getNode(ISD::SUB, DL, VT, Remainder, Den);
2050 
2051   // Remainder_A_Den = Remainder + Den
2052   SDValue Remainder_A_Den = DAG.getNode(ISD::ADD, DL, VT, Remainder, Den);
2053 
2054   // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
2055   SDValue Rem = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT),
2056                                     Remainder, Remainder_S_Den, ISD::SETEQ);
2057 
2058   // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
2059   Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT),
2060                             Remainder_A_Den, Rem, ISD::SETEQ);
2061   SDValue Ops[2] = {
2062     Div,
2063     Rem
2064   };
2065   return DAG.getMergeValues(Ops, DL);
2066 }
2067 
2068 SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
2069                                            SelectionDAG &DAG) const {
2070   SDLoc DL(Op);
2071   EVT VT = Op.getValueType();
2072 
2073   SDValue LHS = Op.getOperand(0);
2074   SDValue RHS = Op.getOperand(1);
2075 
2076   SDValue Zero = DAG.getConstant(0, DL, VT);
2077   SDValue NegOne = DAG.getConstant(-1, DL, VT);
2078 
2079   if (VT == MVT::i32) {
2080     if (SDValue Res = LowerDIVREM24(Op, DAG, true))
2081       return Res;
2082   }
2083 
2084   if (VT == MVT::i64 &&
2085       DAG.ComputeNumSignBits(LHS) > 32 &&
2086       DAG.ComputeNumSignBits(RHS) > 32) {
2087     EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2088 
2089     // HiLo split
2090     SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
2091     SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
2092     SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2093                                  LHS_Lo, RHS_Lo);
2094     SDValue Res[2] = {
2095       DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
2096       DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
2097     };
2098     return DAG.getMergeValues(Res, DL);
2099   }
2100 
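  // Take absolute values by conditionally negating with (x + sign) ^ sign,
  // perform an unsigned divrem, then restore the signs: the quotient is
  // negated when the operand signs differ and the remainder takes the sign
  // of the LHS.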
2101   SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
2102   SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
2103   SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
2104   SDValue RSign = LHSign; // Remainder sign is the same as LHS
2105 
2106   LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
2107   RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
2108 
2109   LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
2110   RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
2111 
2112   SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
2113   SDValue Rem = Div.getValue(1);
2114 
2115   Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
2116   Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
2117 
2118   Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
2119   Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
2120 
2121   SDValue Res[2] = {
2122     Div,
2123     Rem
2124   };
2125   return DAG.getMergeValues(Res, DL);
2126 }
2127 
2128 // (frem x, y) -> (fsub x, (fmul (ftrunc (fdiv x, y)), y))
2129 SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
2130   SDLoc SL(Op);
2131   EVT VT = Op.getValueType();
2132   SDValue X = Op.getOperand(0);
2133   SDValue Y = Op.getOperand(1);
2134 
2135   // TODO: Should this propagate fast-math-flags?
2136 
2137   SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y);
2138   SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, VT, Div);
2139   SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Trunc, Y);
2140 
2141   return DAG.getNode(ISD::FSUB, SL, VT, X, Mul);
2142 }
2143 
2144 SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
2145   SDLoc SL(Op);
2146   SDValue Src = Op.getOperand(0);
2147 
2148   // result = trunc(src)
2149   // if (src > 0.0 && src != result)
2150   //   result += 1.0
2151 
2152   SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2153 
2154   const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2155   const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2156 
2157   EVT SetCCVT =
2158       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2159 
2160   SDValue Gt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
2161   SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2162   SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Gt0, NeTrunc);
2163 
2164   SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2165   // TODO: Should this propagate fast-math-flags?
2166   return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2167 }
2168 
2169 static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
2170                                   SelectionDAG &DAG) {
2171   const unsigned FractBits = 52;
2172   const unsigned ExpBits = 11;
2173 
2174   SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2175                                 Hi,
2176                                 DAG.getConstant(FractBits - 32, SL, MVT::i32),
2177                                 DAG.getConstant(ExpBits, SL, MVT::i32));
2178   SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2179                             DAG.getConstant(1023, SL, MVT::i32));
2180 
2181   return Exp;
2182 }
2183 
2184 SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
2185   SDLoc SL(Op);
2186   SDValue Src = Op.getOperand(0);
2187 
2188   assert(Op.getValueType() == MVT::f64);
2189 
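  // Truncate by clearing the fraction bits that lie below the binary point,
  // based on the unbiased exponent: an exponent below 0 yields a signed zero,
  // and an exponent above 51 means the value is already an integer.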
2190   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2191   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
2192 
2193   SDValue VecSrc = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
2194 
2195   // Extract the upper half, since this is where we will find the sign and
2196   // exponent.
2197   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecSrc, One);
2198 
2199   SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2200 
2201   const unsigned FractBits = 52;
2202 
2203   // Extract the sign bit.
2204   const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
2205   SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
2206 
2207   // Extend back to 64-bits.
2208   SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
2209   SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
2210 
2211   SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
2212   const SDValue FractMask
2213     = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
2214 
2215   SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
2216   SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
2217   SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
2218 
2219   EVT SetCCVT =
2220       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2221 
2222   const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2223 
2224   SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2225   SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2226 
2227   SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2228   SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2229 
2230   return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2231 }
2232 
2233 SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
2234   SDLoc SL(Op);
2235   SDValue Src = Op.getOperand(0);
2236 
2237   assert(Op.getValueType() == MVT::f64);
2238 
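  // Adding and then subtracting copysign(2^52, Src) forces rounding to an
  // integer in the default round-to-nearest-even mode. Values with magnitude
  // above 0x1.fffffffffffffp+51 are already integral and must be returned
  // unchanged, since the add/sub trick could perturb them.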
2239   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2240   SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2241   SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2242 
2243   // TODO: Should this propagate fast-math-flags?
2244 
2245   SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2246   SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
2247 
2248   SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2249 
2250   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2251   SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2252 
2253   EVT SetCCVT =
2254       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2255   SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
2256 
2257   return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2258 }
2259 
2260 SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const {
2261   // FNEARBYINT and FRINT are the same, except in their handling of FP
2262   // exceptions. Those aren't really meaningful for us, and OpenCL only has
2263   // rint, so just treat them as equivalent.
2264   return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0));
2265 }
2266 
2267 // XXX - May require not supporting f32 denormals?
2268 
2269 // Don't handle v2f16. The extra instructions to scalarize and repack around the
2270 // compare and vselect end up producing worse code than scalarizing the whole
2271 // operation.
2272 SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2273   SDLoc SL(Op);
2274   SDValue X = Op.getOperand(0);
2275   EVT VT = Op.getValueType();
2276 
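  // round(x) == trunc(x) + (|x - trunc(x)| >= 0.5 ? copysign(1.0, x) : 0.0)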
2277   SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
2278 
2279   // TODO: Should this propagate fast-math-flags?
2280 
2281   SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
2282 
2283   SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
2284 
2285   const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2286   const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2287   const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2288 
2289   SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, VT, One, X);
2290 
2291   EVT SetCCVT =
2292       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2293 
2294   SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2295 
2296   SDValue Sel = DAG.getNode(ISD::SELECT, SL, VT, Cmp, SignOne, Zero);
2297 
2298   return DAG.getNode(ISD::FADD, SL, VT, T, Sel);
2299 }
2300 
2301 SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
2302   SDLoc SL(Op);
2303   SDValue Src = Op.getOperand(0);
2304 
2305   // result = trunc(src);
2306   // if (src < 0.0 && src != result)
2307   //   result += -1.0.
2308 
2309   SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2310 
2311   const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2312   const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2313 
2314   EVT SetCCVT =
2315       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2316 
2317   SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2318   SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2319   SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2320 
2321   SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2322   // TODO: Should this propagate fast-math-flags?
2323   return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2324 }
2325 
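// log_b(x) = log2(x) * (1 / log2(b)). Callers pass 1 / log2(b) as
// Log2BaseInverted: ln(2) for FLOG and ln(2)/ln(10) for FLOG10.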
2326 SDValue AMDGPUTargetLowering::LowerFLOG(SDValue Op, SelectionDAG &DAG,
2327                                         double Log2BaseInverted) const {
2328   EVT VT = Op.getValueType();
2329 
2330   SDLoc SL(Op);
2331   SDValue Operand = Op.getOperand(0);
2332   SDValue Log2Operand = DAG.getNode(ISD::FLOG2, SL, VT, Operand);
2333   SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2334 
2335   return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand);
2336 }
2337 
2338 // exp2(M_LOG2E_F * f);
2339 SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
2340   EVT VT = Op.getValueType();
2341   SDLoc SL(Op);
2342   SDValue Src = Op.getOperand(0);
2343 
2344   const SDValue K = DAG.getConstantFP(numbers::log2e, SL, VT);
2345   SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Src, K, Op->getFlags());
2346   return DAG.getNode(ISD::FEXP2, SL, VT, Mul, Op->getFlags());
2347 }
2348 
2349 static bool isCtlzOpc(unsigned Opc) {
2350   return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
2351 }
2352 
2353 static bool isCttzOpc(unsigned Opc) {
2354   return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
2355 }
2356 
2357 SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
2358   SDLoc SL(Op);
2359   SDValue Src = Op.getOperand(0);
2360   bool ZeroUndef = Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF ||
2361                    Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF;
2362 
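  // Map to the hardware bit-scan instructions. A 32-bit zero-undef count maps
  // directly; a 64-bit count is built from the counts of the two 32-bit
  // halves, with the all-zero input handled explicitly when !ZeroUndef.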
2363   unsigned ISDOpc, NewOpc;
2364   if (isCtlzOpc(Op.getOpcode())) {
2365     ISDOpc = ISD::CTLZ_ZERO_UNDEF;
2366     NewOpc = AMDGPUISD::FFBH_U32;
2367   } else if (isCttzOpc(Op.getOpcode())) {
2368     ISDOpc = ISD::CTTZ_ZERO_UNDEF;
2369     NewOpc = AMDGPUISD::FFBL_B32;
2370   } else
2371     llvm_unreachable("Unexpected opcode!");
2372
2374   if (ZeroUndef && Src.getValueType() == MVT::i32)
2375     return DAG.getNode(NewOpc, SL, MVT::i32, Src);
2376 
2377   SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
2378 
2379   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2380   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
2381 
2382   SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
2383   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
2384 
2385   EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
2386                                    *DAG.getContext(), MVT::i32);
2387 
2388   SDValue HiOrLo = isCtlzOpc(Op.getOpcode()) ? Hi : Lo;
2389   SDValue Hi0orLo0 = DAG.getSetCC(SL, SetCCVT, HiOrLo, Zero, ISD::SETEQ);
2390 
2391   SDValue OprLo = DAG.getNode(ISDOpc, SL, MVT::i32, Lo);
2392   SDValue OprHi = DAG.getNode(ISDOpc, SL, MVT::i32, Hi);
2393 
2394   const SDValue Bits32 = DAG.getConstant(32, SL, MVT::i32);
2395   SDValue Add, NewOpr;
2396   if (isCtlzOpc(Op.getOpcode())) {
2397     Add = DAG.getNode(ISD::ADD, SL, MVT::i32, OprLo, Bits32);
2398     // ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x))
2399     NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0orLo0, Add, OprHi);
2400   } else {
2401     Add = DAG.getNode(ISD::ADD, SL, MVT::i32, OprHi, Bits32);
2402     // cttz(x) = lo_32(x) == 0 ? cttz(hi_32(x)) + 32 : cttz(lo_32(x))
2403     NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0orLo0, Add, OprLo);
2404   }
2405 
2406   if (!ZeroUndef) {
2407     // Test if the full 64-bit input is zero.
2408 
2409     // FIXME: DAG combines turn what should be an s_and_b64 into a v_or_b32,
2410     // which we probably don't want.
2411     SDValue LoOrHi = isCtlzOpc(Op.getOpcode()) ? Lo : Hi;
2412     SDValue Lo0OrHi0 = DAG.getSetCC(SL, SetCCVT, LoOrHi, Zero, ISD::SETEQ);
2413     SDValue SrcIsZero = DAG.getNode(ISD::AND, SL, SetCCVT, Lo0OrHi0, Hi0orLo0);
2414 
2415     // TODO: If i64 setcc is half rate, it can result in 1 fewer instruction
2416     // with the same cycles, otherwise it is slower.
2417     // SDValue SrcIsZero = DAG.getSetCC(SL, SetCCVT, Src,
2418     // DAG.getConstant(0, SL, MVT::i64), ISD::SETEQ);
2419 
2420     const SDValue Bits64 = DAG.getConstant(64, SL, MVT::i32);
2421 
2422     // The instruction returns -1 for 0 input, but the defined intrinsic
2423     // behavior is to return the number of bits.
2424     NewOpr = DAG.getNode(ISD::SELECT, SL, MVT::i32,
2425                          SrcIsZero, Bits64, NewOpr);
2426   }
2427 
2428   return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
2429 }
2430 
2431 SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
2432                                                bool Signed) const {
2433   // Unsigned
2434   // cul2f(ulong u)
2435   //{
2436   //  uint lz = clz(u);
2437   //  uint e = (u != 0) ? 127U + 63U - lz : 0;
2438   //  u = (u << lz) & 0x7fffffffffffffffUL;
2439   //  ulong t = u & 0xffffffffffUL;
2440   //  uint v = (e << 23) | (uint)(u >> 40);
2441   //  uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U);
2442   //  return as_float(v + r);
2443   //}
2444   // Signed
2445   // cl2f(long l)
2446   //{
2447   //  long s = l >> 63;
2448   //  float r = cul2f((l + s) ^ s);
2449   //  return s ? -r : r;
2450   //}
2451 
2452   SDLoc SL(Op);
2453   SDValue Src = Op.getOperand(0);
2454   SDValue L = Src;
2455 
2456   SDValue S;
2457   if (Signed) {
2458     const SDValue SignBit = DAG.getConstant(63, SL, MVT::i64);
2459     S = DAG.getNode(ISD::SRA, SL, MVT::i64, L, SignBit);
2460 
2461     SDValue LPlusS = DAG.getNode(ISD::ADD, SL, MVT::i64, L, S);
2462     L = DAG.getNode(ISD::XOR, SL, MVT::i64, LPlusS, S);
2463   }
2464 
2465   EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
2466                                    *DAG.getContext(), MVT::f32);
2467
2469   SDValue ZeroI32 = DAG.getConstant(0, SL, MVT::i32);
2470   SDValue ZeroI64 = DAG.getConstant(0, SL, MVT::i64);
2471   SDValue LZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i64, L);
2472   LZ = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LZ);
2473 
2474   SDValue K = DAG.getConstant(127U + 63U, SL, MVT::i32);
2475   SDValue E = DAG.getSelect(SL, MVT::i32,
2476     DAG.getSetCC(SL, SetCCVT, L, ZeroI64, ISD::SETNE),
2477     DAG.getNode(ISD::SUB, SL, MVT::i32, K, LZ),
2478     ZeroI32);
2479 
2480   SDValue U = DAG.getNode(ISD::AND, SL, MVT::i64,
2481     DAG.getNode(ISD::SHL, SL, MVT::i64, L, LZ),
2482     DAG.getConstant((-1ULL) >> 1, SL, MVT::i64));
2483 
2484   SDValue T = DAG.getNode(ISD::AND, SL, MVT::i64, U,
2485                           DAG.getConstant(0xffffffffffULL, SL, MVT::i64));
2486 
2487   SDValue UShl = DAG.getNode(ISD::SRL, SL, MVT::i64,
2488                              U, DAG.getConstant(40, SL, MVT::i64));
2489 
2490   SDValue V = DAG.getNode(ISD::OR, SL, MVT::i32,
2491     DAG.getNode(ISD::SHL, SL, MVT::i32, E, DAG.getConstant(23, SL, MVT::i32)),
2492     DAG.getNode(ISD::TRUNCATE, SL, MVT::i32,  UShl));
2493 
2494   SDValue C = DAG.getConstant(0x8000000000ULL, SL, MVT::i64);
2495   SDValue RCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETUGT);
2496   SDValue TCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETEQ);
2497 
2498   SDValue One = DAG.getConstant(1, SL, MVT::i32);
2499 
2500   SDValue VTrunc1 = DAG.getNode(ISD::AND, SL, MVT::i32, V, One);
2501 
2502   SDValue R = DAG.getSelect(SL, MVT::i32,
2503     RCmp,
2504     One,
2505     DAG.getSelect(SL, MVT::i32, TCmp, VTrunc1, ZeroI32));
2506   R = DAG.getNode(ISD::ADD, SL, MVT::i32, V, R);
2507   R = DAG.getNode(ISD::BITCAST, SL, MVT::f32, R);
2508 
2509   if (!Signed)
2510     return R;
2511 
2512   SDValue RNeg = DAG.getNode(ISD::FNEG, SL, MVT::f32, R);
2513   return DAG.getSelect(SL, MVT::f32, DAG.getSExtOrTrunc(S, SL, SetCCVT), RNeg, R);
2514 }
2515 
2516 SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
2517                                                bool Signed) const {
2518   SDLoc SL(Op);
2519   SDValue Src = Op.getOperand(0);
2520 
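  // Convert the two 32-bit halves separately and recombine as
  // (fp)Hi * 2^32 + (fp)Lo. Only the high half carries the sign, so the low
  // half is always converted as unsigned.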
2521   SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
2522 
2523   SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC,
2524                            DAG.getConstant(0, SL, MVT::i32));
2525   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC,
2526                            DAG.getConstant(1, SL, MVT::i32));
2527 
2528   SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
2529                               SL, MVT::f64, Hi);
2530 
2531   SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
2532 
2533   SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi,
2534                               DAG.getConstant(32, SL, MVT::i32));
2535   // TODO: Should this propagate fast-math-flags?
2536   return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
2537 }
2538 
2539 SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
2540                                                SelectionDAG &DAG) const {
2541   // TODO: Factor out code common with LowerSINT_TO_FP.
2542   EVT DestVT = Op.getValueType();
2543   SDValue Src = Op.getOperand(0);
2544   EVT SrcVT = Src.getValueType();
2545 
2546   if (SrcVT == MVT::i16) {
2547     if (DestVT == MVT::f16)
2548       return Op;
2549     SDLoc DL(Op);
2550 
2551     // Promote src to i32
2552     SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
2553     return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
2554   }
2555 
2556   assert(SrcVT == MVT::i64 && "operation should be legal");
2557 
2558   if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
2559     SDLoc DL(Op);
2560 
2561     SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
2562     SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
2563     SDValue FPRound =
2564         DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
2565 
2566     return FPRound;
2567   }
2568 
2569   if (DestVT == MVT::f32)
2570     return LowerINT_TO_FP32(Op, DAG, false);
2571 
2572   assert(DestVT == MVT::f64);
2573   return LowerINT_TO_FP64(Op, DAG, false);
2574 }
2575 
2576 SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
2577                                               SelectionDAG &DAG) const {
2578   EVT DestVT = Op.getValueType();
2579 
2580   SDValue Src = Op.getOperand(0);
2581   EVT SrcVT = Src.getValueType();
2582 
2583   if (SrcVT == MVT::i16) {
2584     if (DestVT == MVT::f16)
2585       return Op;
2586 
2587     SDLoc DL(Op);
2588     // Promote src to i32
2589     SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
2590     return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
2591   }
2592 
2593   assert(SrcVT == MVT::i64 && "operation should be legal");
2594 
2595   // TODO: Factor out code common with LowerUINT_TO_FP.
2596 
2597   if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
2598     SDLoc DL(Op);
2600 
2601     SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
2602     SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
2603     SDValue FPRound =
2604         DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
2605 
2606     return FPRound;
2607   }
2608 
2609   if (DestVT == MVT::f32)
2610     return LowerINT_TO_FP32(Op, DAG, true);
2611 
2612   assert(DestVT == MVT::f64);
2613   return LowerINT_TO_FP64(Op, DAG, true);
2614 }
2615 
2616 SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG,
2617                                                bool Signed) const {
2618   SDLoc SL(Op);
2619 
2620   SDValue Src = Op.getOperand(0);
2621 
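  // Split trunc(Src) into 32-bit halves: Hi = floor(trunc(Src) * 2^-32) and
  // Lo = trunc(Src) - Hi * 2^32, the latter computed with an FMA. K0 is the
  // bit pattern of 2^-32 and K1 of -2^32; only the high half is converted as
  // a signed integer.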
2622   SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2623 
2624   SDValue K0 = DAG.getConstantFP(BitsToDouble(UINT64_C(0x3df0000000000000)), SL,
2625                                  MVT::f64);
2626   SDValue K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(0xc1f0000000000000)), SL,
2627                                  MVT::f64);
2628   // TODO: Should this propagate fast-math-flags?
2629   SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, Trunc, K0);
2630 
2631   SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, MVT::f64, Mul);
2632
2634   SDValue Fma = DAG.getNode(ISD::FMA, SL, MVT::f64, FloorMul, K1, Trunc);
2635 
2636   SDValue Hi = DAG.getNode(Signed ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, SL,
2637                            MVT::i32, FloorMul);
2638   SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
2639 
2640   SDValue Result = DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi});
2641 
2642   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Result);
2643 }
2644 
SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op,
                                              SelectionDAG &DAG) const {
2646   SDLoc DL(Op);
2647   SDValue N0 = Op.getOperand(0);
2648 
  // Convert to the target node so its known bits can be computed.
2650   if (N0.getValueType() == MVT::f32)
2651     return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
2652 
2653   if (getTargetMachine().Options.UnsafeFPMath) {
2654     // There is a generic expand for FP_TO_FP16 with unsafe fast math.
2655     return SDValue();
2656   }
2657 
2658   assert(N0.getSimpleValueType() == MVT::f64);
2659 
2660   // f64 -> f16 conversion using round-to-nearest-even rounding mode.
2661   const unsigned ExpMask = 0x7ff;
2662   const unsigned ExpBiasf64 = 1023;
2663   const unsigned ExpBiasf16 = 15;
2664   SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
2665   SDValue One = DAG.getConstant(1, DL, MVT::i32);
2666   SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
2667   SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
2668                            DAG.getConstant(32, DL, MVT::i64));
2669   UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
2670   U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
2671   SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2672                           DAG.getConstant(20, DL, MVT::i64));
2673   E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
2674                   DAG.getConstant(ExpMask, DL, MVT::i32));
2675   // Subtract the fp64 exponent bias (1023) to get the real exponent and
2676   // add the f16 bias (15) to get the biased exponent for the f16 format.
2677   E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
2678                   DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
2679 
2680   SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2681                           DAG.getConstant(8, DL, MVT::i32));
2682   M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
2683                   DAG.getConstant(0xffe, DL, MVT::i32));
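  // M holds the 11 most significant f64 mantissa bits in bits [11:1]; bit 0 is
  // reserved for a sticky bit derived from the remaining low mantissa bits.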
2684 
2685   SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
2686                                   DAG.getConstant(0x1ff, DL, MVT::i32));
2687   MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
2688 
2689   SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
2690   M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
2691 
2692   // (M != 0 ? 0x0200 : 0) | 0x7c00;
2693   SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
2694       DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
2695                       Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
2696 
2697   // N = M | (E << 12);
2698   SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2699       DAG.getNode(ISD::SHL, DL, MVT::i32, E,
2700                   DAG.getConstant(12, DL, MVT::i32)));
2701 
2702   // B = clamp(1-E, 0, 13);
2703   SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
2704                                   One, E);
2705   SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
2706   B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
2707                   DAG.getConstant(13, DL, MVT::i32));
2708 
2709   SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2710                                    DAG.getConstant(0x1000, DL, MVT::i32));
2711 
2712   SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
2713   SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
2714   SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
2715   D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
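  // D is the subnormal significand: the significand (with the implicit bit
  // set) shifted right by B, with a sticky bit ORed in if any bits were
  // shifted out.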
2716 
2717   SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
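  // Round to nearest, ties to even: the two low bits of V are extra precision.
  // Round up when they are above one half (VLow3 == 3 or 7), or exactly one
  // half with an odd result bit (VLow3 == 6).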
2718   SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
2719                               DAG.getConstant(0x7, DL, MVT::i32));
2720   V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
2721                   DAG.getConstant(2, DL, MVT::i32));
2722   SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
2723                                One, Zero, ISD::SETEQ);
2724   SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
2725                                One, Zero, ISD::SETGT);
2726   V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
2727   V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
2728 
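  // Clamp overflowed exponents to the infinity encoding (0x7c00). E == 1039
  // corresponds to an all-ones f64 exponent (inf or NaN), in which case the
  // inf/NaN result I is used instead.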
2729   V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
2730                       DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
2731   V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
2732                       I, V, ISD::SETEQ);
2733 
2734   // Extract the sign bit.
2735   SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2736                             DAG.getConstant(16, DL, MVT::i32));
2737   Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
2738                      DAG.getConstant(0x8000, DL, MVT::i32));
2739 
2740   V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
2741   return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
2742 }
2743 
2744 SDValue AMDGPUTargetLowering::LowerFP_TO_SINT(SDValue Op,
2745                                               SelectionDAG &DAG) const {
2746   SDValue Src = Op.getOperand(0);
2747 
2748   // TODO: Factor out code common with LowerFP_TO_UINT.
2749 
2750   EVT SrcVT = Src.getValueType();
2751   if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) {
2752     SDLoc DL(Op);
2753 
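    // Every f16 value is exactly representable in f32, so widen the source and
    // let the f32 -> i64 path handle the conversion.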
2754     SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
2755     SDValue FpToInt32 =
2756         DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend);
2757 
2758     return FpToInt32;
2759   }
2760 
2761   if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
2762     return LowerFP64_TO_INT(Op, DAG, true);
2763 
2764   return SDValue();
2765 }
2766 
2767 SDValue AMDGPUTargetLowering::LowerFP_TO_UINT(SDValue Op,
2768                                               SelectionDAG &DAG) const {
2769   SDValue Src = Op.getOperand(0);
2770 
2771   // TODO: Factor out code common with LowerFP_TO_SINT.
2772 
2773   EVT SrcVT = Src.getValueType();
2774   if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) {
2775     SDLoc DL(Op);
2776 
2777     SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
2778     SDValue FpToInt32 =
2779         DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend);
2780 
2781     return FpToInt32;
2782   }
2783 
2784   if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
2785     return LowerFP64_TO_INT(Op, DAG, false);
2786 
2787   return SDValue();
2788 }
2789 
2790 SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
2791                                                      SelectionDAG &DAG) const {
2792   EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
2793   MVT VT = Op.getSimpleValueType();
2794   MVT ScalarVT = VT.getScalarType();
2795 
2796   assert(VT.isVector());
2797 
2798   SDValue Src = Op.getOperand(0);
2799   SDLoc DL(Op);
2800 
2801   // TODO: Don't scalarize on Evergreen?
2802   unsigned NElts = VT.getVectorNumElements();
2803   SmallVector<SDValue, 8> Args;
2804   DAG.ExtractVectorElements(Src, Args, 0, NElts);
2805 
2806   SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
2807   for (unsigned I = 0; I < NElts; ++I)
2808     Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
2809 
2810   return DAG.getBuildVector(VT, DL, Args);
2811 }
2812 
2813 //===----------------------------------------------------------------------===//
2814 // Custom DAG optimizations
2815 //===----------------------------------------------------------------------===//
2816 
2817 static bool isU24(SDValue Op, SelectionDAG &DAG) {
2818   return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
2819 }
2820 
2821 static bool isI24(SDValue Op, SelectionDAG &DAG) {
2822   EVT VT = Op.getValueType();
2823   return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
2824                                      // as unsigned 24-bit values.
2825     AMDGPUTargetLowering::numBitsSigned(Op, DAG) < 24;
2826 }
2827 
2828 static SDValue simplifyI24(SDNode *Node24,
2829                            TargetLowering::DAGCombinerInfo &DCI) {
2830   SelectionDAG &DAG = DCI.DAG;
2831   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
2832   bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
2833 
2834   SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
2835   SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
2836   unsigned NewOpcode = Node24->getOpcode();
2837   if (IsIntrin) {
2838     unsigned IID = cast<ConstantSDNode>(Node24->getOperand(0))->getZExtValue();
2839     NewOpcode = IID == Intrinsic::amdgcn_mul_i24 ?
2840       AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
2841   }
2842 
2843   APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
2844 
2845   // First try to simplify using SimplifyMultipleUseDemandedBits which allows
2846   // the operands to have other uses, but will only perform simplifications that
2847   // involve bypassing some nodes for this user.
2848   SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
2849   SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
2850   if (DemandedLHS || DemandedRHS)
2851     return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
2852                        DemandedLHS ? DemandedLHS : LHS,
2853                        DemandedRHS ? DemandedRHS : RHS);
2854 
2855   // Now try SimplifyDemandedBits which can simplify the nodes used by our
2856   // operands if this node is the only user.
2857   if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
2858     return SDValue(Node24, 0);
2859   if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
2860     return SDValue(Node24, 0);
2861 
2862   return SDValue();
2863 }
2864 
2865 template <typename IntTy>
2866 static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
2867                                uint32_t Width, const SDLoc &DL) {
2868   if (Width + Offset < 32) {
2869     uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
2870     IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
2871     return DAG.getConstant(Result, DL, MVT::i32);
2872   }
2873 
2874   return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
2875 }
2876 
2877 static bool hasVolatileUser(SDNode *Val) {
2878   for (SDNode *U : Val->uses()) {
2879     if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
2880       if (M->isVolatile())
2881         return true;
2882     }
2883   }
2884 
2885   return false;
2886 }
2887 
2888 bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
2889   // i32 vectors are the canonical memory type.
2890   if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
2891     return false;
2892 
2893   if (!VT.isByteSized())
2894     return false;
2895 
2896   unsigned Size = VT.getStoreSize();
2897 
2898   if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
2899     return false;
2900 
2901   if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
2902     return false;
2903 
2904   return true;
2905 }
2906 
// Replace a load of an illegal type with a load of a bitcast to a friendlier
// type.
2909 SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
2910                                                  DAGCombinerInfo &DCI) const {
2911   if (!DCI.isBeforeLegalize())
2912     return SDValue();
2913 
2914   LoadSDNode *LN = cast<LoadSDNode>(N);
2915   if (LN->isVolatile() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
2916     return SDValue();
2917 
2918   SDLoc SL(N);
2919   SelectionDAG &DAG = DCI.DAG;
2920   EVT VT = LN->getMemoryVT();
2921 
2922   unsigned Size = VT.getStoreSize();
2923   unsigned Align = LN->getAlignment();
2924   if (Align < Size && isTypeLegal(VT)) {
2925     bool IsFast;
2926     unsigned AS = LN->getAddressSpace();
2927 
2928     // Expand unaligned loads earlier than legalization. Due to visitation order
2929     // problems during legalization, the emitted instructions to pack and unpack
2930     // the bytes again are not eliminated in the case of an unaligned copy.
2931     if (!allowsMisalignedMemoryAccesses(
2932             VT, AS, Align, LN->getMemOperand()->getFlags(), &IsFast)) {
2933       SDValue Ops[2];
2934 
2935       if (VT.isVector())
2936         std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(LN, DAG);
2937       else
2938         std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
2939 
2940       return DAG.getMergeValues(Ops, SDLoc(N));
2941     }
2942 
2943     if (!IsFast)
2944       return SDValue();
2945   }
2946 
2947   if (!shouldCombineMemoryType(VT))
2948     return SDValue();
2949 
2950   EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
2951 
2952   SDValue NewLoad
2953     = DAG.getLoad(NewVT, SL, LN->getChain(),
2954                   LN->getBasePtr(), LN->getMemOperand());
2955 
2956   SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
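  // CombineTo replaces both the loaded value and the chain; returning the
  // original node tells the combiner the replacement already happened.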
2957   DCI.CombineTo(N, BC, NewLoad.getValue(1));
2958   return SDValue(N, 0);
2959 }
2960 
2961 // Replace store of an illegal type with a store of a bitcast to a friendlier
2962 // type.
2963 SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
2964                                                   DAGCombinerInfo &DCI) const {
2965   if (!DCI.isBeforeLegalize())
2966     return SDValue();
2967 
2968   StoreSDNode *SN = cast<StoreSDNode>(N);
2969   if (SN->isVolatile() || !ISD::isNormalStore(SN))
2970     return SDValue();
2971 
2972   EVT VT = SN->getMemoryVT();
2973   unsigned Size = VT.getStoreSize();
2974 
2975   SDLoc SL(N);
2976   SelectionDAG &DAG = DCI.DAG;
2977   unsigned Align = SN->getAlignment();
2978   if (Align < Size && isTypeLegal(VT)) {
2979     bool IsFast;
2980     unsigned AS = SN->getAddressSpace();
2981 
2982     // Expand unaligned stores earlier than legalization. Due to visitation
2983     // order problems during legalization, the emitted instructions to pack and
2984     // unpack the bytes again are not eliminated in the case of an unaligned
2985     // copy.
2986     if (!allowsMisalignedMemoryAccesses(
2987             VT, AS, Align, SN->getMemOperand()->getFlags(), &IsFast)) {
2988       if (VT.isVector())
2989         return scalarizeVectorStore(SN, DAG);
2990 
2991       return expandUnalignedStore(SN, DAG);
2992     }
2993 
2994     if (!IsFast)
2995       return SDValue();
2996   }
2997 
2998   if (!shouldCombineMemoryType(VT))
2999     return SDValue();
3000 
3001   EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3002   SDValue Val = SN->getValue();
3003 
3004   //DCI.AddToWorklist(Val.getNode());
3005 
3006   bool OtherUses = !Val.hasOneUse();
3007   SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
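  // If the stored value has other users, give them a bitcast back to the
  // original type; only this store switches to the new type.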
3008   if (OtherUses) {
3009     SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
3010     DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
3011   }
3012 
3013   return DAG.getStore(SN->getChain(), SL, CastVal,
3014                       SN->getBasePtr(), SN->getMemOperand());
3015 }
3016 
3017 // FIXME: This should go in generic DAG combiner with an isTruncateFree check,
3018 // but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
3019 // issues.
3020 SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
3021                                                         DAGCombinerInfo &DCI) const {
3022   SelectionDAG &DAG = DCI.DAG;
3023   SDValue N0 = N->getOperand(0);
3024 
3025   // (vt2 (assertzext (truncate vt0:x), vt1)) ->
3026   //     (vt2 (truncate (assertzext vt0:x, vt1)))
3027   if (N0.getOpcode() == ISD::TRUNCATE) {
3028     SDValue N1 = N->getOperand(1);
3029     EVT ExtVT = cast<VTSDNode>(N1)->getVT();
3030     SDLoc SL(N);
3031 
3032     SDValue Src = N0.getOperand(0);
3033     EVT SrcVT = Src.getValueType();
3034     if (SrcVT.bitsGE(ExtVT)) {
3035       SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
3036       return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
3037     }
3038   }
3039 
3040   return SDValue();
3041 }
3042 
3043 SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
3044   SDNode *N, DAGCombinerInfo &DCI) const {
3045   unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
3046   switch (IID) {
3047   case Intrinsic::amdgcn_mul_i24:
3048   case Intrinsic::amdgcn_mul_u24:
3049     return simplifyI24(N, DCI);
3050   case Intrinsic::amdgcn_fract:
3051   case Intrinsic::amdgcn_rsq:
3052   case Intrinsic::amdgcn_rcp_legacy:
3053   case Intrinsic::amdgcn_rsq_legacy:
3054   case Intrinsic::amdgcn_rsq_clamp:
3055   case Intrinsic::amdgcn_ldexp: {
3056     // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
3057     SDValue Src = N->getOperand(1);
3058     return Src.isUndef() ? Src : SDValue();
3059   }
3060   default:
3061     return SDValue();
3062   }
3063 }
3064 
/// Split the 64-bit value \p LHS into two 32-bit components, and apply the
/// binary operation \p Opc to each half with the corresponding constant
/// operand (\p ValLo for the low half, \p ValHi for the high half).
3067 SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
3068   DAGCombinerInfo &DCI, const SDLoc &SL,
3069   unsigned Opc, SDValue LHS,
3070   uint32_t ValLo, uint32_t ValHi) const {
3071   SelectionDAG &DAG = DCI.DAG;
3072   SDValue Lo, Hi;
3073   std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
3074 
3075   SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
3076   SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
3077 
3078   SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
3079   SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
3080 
3081   // Re-visit the ands. It's possible we eliminated one of them and it could
3082   // simplify the vector.
3083   DCI.AddToWorklist(Lo.getNode());
3084   DCI.AddToWorklist(Hi.getNode());
3085 
3086   SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
3087   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3088 }
3089 
3090 SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
3091                                                 DAGCombinerInfo &DCI) const {
3092   EVT VT = N->getValueType(0);
3093 
3094   ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3095   if (!RHS)
3096     return SDValue();
3097 
3098   SDValue LHS = N->getOperand(0);
3099   unsigned RHSVal = RHS->getZExtValue();
3100   if (!RHSVal)
3101     return LHS;
3102 
3103   SDLoc SL(N);
3104   SelectionDAG &DAG = DCI.DAG;
3105 
3106   switch (LHS->getOpcode()) {
3107   default:
3108     break;
3109   case ISD::ZERO_EXTEND:
3110   case ISD::SIGN_EXTEND:
3111   case ISD::ANY_EXTEND: {
3112     SDValue X = LHS->getOperand(0);
3113 
3114     if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
3115         isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
3116       // Prefer build_vector as the canonical form if packed types are legal.
      // (shl ([asz]ext i16:x), 16) -> build_vector 0, x
3118       SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
3119        { DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) });
3120       return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
3121     }
3122 
3123     // shl (ext x) => zext (shl x), if shift does not overflow int
3124     if (VT != MVT::i64)
3125       break;
3126     KnownBits Known = DAG.computeKnownBits(X);
3127     unsigned LZ = Known.countMinLeadingZeros();
3128     if (LZ < RHSVal)
3129       break;
3130     EVT XVT = X.getValueType();
3131     SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
3132     return DAG.getZExtOrTrunc(Shl, SL, VT);
3133   }
3134   }
3135 
3136   if (VT != MVT::i64)
3137     return SDValue();
3138 
  // i64 (shl x, C) -> (build_pair 0, (shl x, C - 32))
3140 
3141   // On some subtargets, 64-bit shift is a quarter rate instruction. In the
3142   // common case, splitting this into a move and a 32-bit shift is faster and
3143   // the same code size.
3144   if (RHSVal < 32)
3145     return SDValue();
3146 
3147   SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
3148 
3149   SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
3150   SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
3151 
3152   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
3153 
3154   SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
3155   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3156 }
3157 
3158 SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
3159                                                 DAGCombinerInfo &DCI) const {
3160   if (N->getValueType(0) != MVT::i64)
3161     return SDValue();
3162 
3163   const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3164   if (!RHS)
3165     return SDValue();
3166 
3167   SelectionDAG &DAG = DCI.DAG;
3168   SDLoc SL(N);
3169   unsigned RHSVal = RHS->getZExtValue();
3170 
3171   // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
3172   if (RHSVal == 32) {
3173     SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
3174     SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
3175                                    DAG.getConstant(31, SL, MVT::i32));
3176 
3177     SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
3178     return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
3179   }
3180 
3181   // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
3182   if (RHSVal == 63) {
3183     SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
3184     SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
3185                                    DAG.getConstant(31, SL, MVT::i32));
3186     SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
3187     return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
3188   }
3189 
3190   return SDValue();
3191 }
3192 
3193 SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
3194                                                 DAGCombinerInfo &DCI) const {
3195   auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3196   if (!RHS)
3197     return SDValue();
3198 
3199   EVT VT = N->getValueType(0);
3200   SDValue LHS = N->getOperand(0);
3201   unsigned ShiftAmt = RHS->getZExtValue();
3202   SelectionDAG &DAG = DCI.DAG;
3203   SDLoc SL(N);
3204 
  // fold (srl (and x, c1 << c2), c2) -> (and (srl x, c2), c1)
  // This improves the ability to match BFE patterns in isel.
3207   if (LHS.getOpcode() == ISD::AND) {
3208     if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
3209       if (Mask->getAPIntValue().isShiftedMask() &&
3210           Mask->getAPIntValue().countTrailingZeros() == ShiftAmt) {
3211         return DAG.getNode(
3212             ISD::AND, SL, VT,
3213             DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), N->getOperand(1)),
3214             DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1), N->getOperand(1)));
3215       }
3216     }
3217   }
3218 
3219   if (VT != MVT::i64)
3220     return SDValue();
3221 
3222   if (ShiftAmt < 32)
3223     return SDValue();
3224 
3225   // srl i64:x, C for C >= 32
3226   // =>
3227   //   build_pair (srl hi_32(x), C - 32), 0
3228   SDValue One = DAG.getConstant(1, SL, MVT::i32);
3229   SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
3230 
3231   SDValue VecOp = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, LHS);
3232   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecOp, One);
3233 
3234   SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
3235   SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
3236 
3237   SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
3238 
3239   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
3240 }
3241 
3242 SDValue AMDGPUTargetLowering::performTruncateCombine(
3243   SDNode *N, DAGCombinerInfo &DCI) const {
3244   SDLoc SL(N);
3245   SelectionDAG &DAG = DCI.DAG;
3246   EVT VT = N->getValueType(0);
3247   SDValue Src = N->getOperand(0);
3248 
3249   // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
3250   if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
3251     SDValue Vec = Src.getOperand(0);
3252     if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
3253       SDValue Elt0 = Vec.getOperand(0);
3254       EVT EltVT = Elt0.getValueType();
3255       if (VT.getSizeInBits() <= EltVT.getSizeInBits()) {
3256         if (EltVT.isFloatingPoint()) {
3257           Elt0 = DAG.getNode(ISD::BITCAST, SL,
3258                              EltVT.changeTypeToInteger(), Elt0);
3259         }
3260 
3261         return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
3262       }
3263     }
3264   }
3265 
3266   // Equivalent of above for accessing the high element of a vector as an
3267   // integer operation.
3268   // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
3269   if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
3270     if (auto K = isConstOrConstSplat(Src.getOperand(1))) {
3271       if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) {
3272         SDValue BV = stripBitcast(Src.getOperand(0));
3273         if (BV.getOpcode() == ISD::BUILD_VECTOR &&
3274             BV.getValueType().getVectorNumElements() == 2) {
3275           SDValue SrcElt = BV.getOperand(1);
3276           EVT SrcEltVT = SrcElt.getValueType();
3277           if (SrcEltVT.isFloatingPoint()) {
3278             SrcElt = DAG.getNode(ISD::BITCAST, SL,
3279                                  SrcEltVT.changeTypeToInteger(), SrcElt);
3280           }
3281 
3282           return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
3283         }
3284       }
3285     }
3286   }
3287 
3288   // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
3289   //
3290   // i16 (trunc (srl i64:x, K)), K <= 16 ->
3291   //     i16 (trunc (srl (i32 (trunc x), K)))
3292   if (VT.getScalarSizeInBits() < 32) {
3293     EVT SrcVT = Src.getValueType();
3294     if (SrcVT.getScalarSizeInBits() > 32 &&
3295         (Src.getOpcode() == ISD::SRL ||
3296          Src.getOpcode() == ISD::SRA ||
3297          Src.getOpcode() == ISD::SHL)) {
3298       SDValue Amt = Src.getOperand(1);
3299       KnownBits Known = DAG.computeKnownBits(Amt);
3300       unsigned Size = VT.getScalarSizeInBits();
3301       if ((Known.isConstant() && Known.getConstant().ule(Size)) ||
3302           (Known.getBitWidth() - Known.countMinLeadingZeros() <= Log2_32(Size))) {
3303         EVT MidVT = VT.isVector() ?
3304           EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3305                            VT.getVectorNumElements()) : MVT::i32;
3306 
3307         EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
3308         SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
3309                                     Src.getOperand(0));
3310         DCI.AddToWorklist(Trunc.getNode());
3311 
3312         if (Amt.getValueType() != NewShiftVT) {
3313           Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
3314           DCI.AddToWorklist(Amt.getNode());
3315         }
3316 
3317         SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
3318                                           Trunc, Amt);
3319         return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
3320       }
3321     }
3322   }
3323 
3324   return SDValue();
3325 }
3326 
3327 // We need to specifically handle i64 mul here to avoid unnecessary conversion
3328 // instructions. If we only match on the legalized i64 mul expansion,
3329 // SimplifyDemandedBits will be unable to remove them because there will be
3330 // multiple uses due to the separate mul + mulh[su].
3331 static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
3332                         SDValue N0, SDValue N1, unsigned Size, bool Signed) {
3333   if (Size <= 32) {
3334     unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
3335     return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
3336   }
3337 
3338   // Because we want to eliminate extension instructions before the
3339   // operation, we need to create a single user here (i.e. not the separate
3340   // mul_lo + mul_hi) so that SimplifyDemandedBits will deal with it.
3341 
3342   unsigned MulOpc = Signed ? AMDGPUISD::MUL_LOHI_I24 : AMDGPUISD::MUL_LOHI_U24;
3343 
3344   SDValue Mul = DAG.getNode(MulOpc, SL,
3345                             DAG.getVTList(MVT::i32, MVT::i32), N0, N1);
3346 
3347   return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64,
3348                      Mul.getValue(0), Mul.getValue(1));
3349 }
3350 
3351 SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
3352                                                 DAGCombinerInfo &DCI) const {
3353   EVT VT = N->getValueType(0);
3354 
3355   unsigned Size = VT.getSizeInBits();
3356   if (VT.isVector() || Size > 64)
3357     return SDValue();
3358 
3359   // There are i16 integer mul/mad.
3360   if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
3361     return SDValue();
3362 
3363   SelectionDAG &DAG = DCI.DAG;
3364   SDLoc DL(N);
3365 
3366   SDValue N0 = N->getOperand(0);
3367   SDValue N1 = N->getOperand(1);
3368 
3369   // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
3370   // in the source into any_extends if the result of the mul is truncated. Since
3371   // we can assume the high bits are whatever we want, use the underlying value
  // to prevent the unknown high bits from interfering.
3373   if (N0.getOpcode() == ISD::ANY_EXTEND)
3374     N0 = N0.getOperand(0);
3375 
3376   if (N1.getOpcode() == ISD::ANY_EXTEND)
3377     N1 = N1.getOperand(0);
3378 
3379   SDValue Mul;
3380 
3381   if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
3382     N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3383     N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3384     Mul = getMul24(DAG, DL, N0, N1, Size, false);
3385   } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
3386     N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3387     N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3388     Mul = getMul24(DAG, DL, N0, N1, Size, true);
3389   } else {
3390     return SDValue();
3391   }
3392 
3393   // We need to use sext even for MUL_U24, because MUL_U24 is used
3394   // for signed multiply of 8 and 16-bit types.
3395   return DAG.getSExtOrTrunc(Mul, DL, VT);
3396 }
3397 
3398 SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
3399                                                   DAGCombinerInfo &DCI) const {
3400   EVT VT = N->getValueType(0);
3401 
3402   if (!Subtarget->hasMulI24() || VT.isVector())
3403     return SDValue();
3404 
3405   SelectionDAG &DAG = DCI.DAG;
3406   SDLoc DL(N);
3407 
3408   SDValue N0 = N->getOperand(0);
3409   SDValue N1 = N->getOperand(1);
3410 
3411   if (!isI24(N0, DAG) || !isI24(N1, DAG))
3412     return SDValue();
3413 
3414   N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3415   N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3416 
3417   SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
3418   DCI.AddToWorklist(Mulhi.getNode());
3419   return DAG.getSExtOrTrunc(Mulhi, DL, VT);
3420 }
3421 
3422 SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
3423                                                   DAGCombinerInfo &DCI) const {
3424   EVT VT = N->getValueType(0);
3425 
3426   if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
3427     return SDValue();
3428 
3429   SelectionDAG &DAG = DCI.DAG;
3430   SDLoc DL(N);
3431 
3432   SDValue N0 = N->getOperand(0);
3433   SDValue N1 = N->getOperand(1);
3434 
3435   if (!isU24(N0, DAG) || !isU24(N1, DAG))
3436     return SDValue();
3437 
3438   N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3439   N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3440 
3441   SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
3442   DCI.AddToWorklist(Mulhi.getNode());
3443   return DAG.getZExtOrTrunc(Mulhi, DL, VT);
3444 }
3445 
3446 SDValue AMDGPUTargetLowering::performMulLoHi24Combine(
3447   SDNode *N, DAGCombinerInfo &DCI) const {
3448   SelectionDAG &DAG = DCI.DAG;
3449 
3450   // Simplify demanded bits before splitting into multiple users.
3451   if (SDValue V = simplifyI24(N, DCI))
3452     return V;
3453 
3454   SDValue N0 = N->getOperand(0);
3455   SDValue N1 = N->getOperand(1);
3456 
3457   bool Signed = (N->getOpcode() == AMDGPUISD::MUL_LOHI_I24);
3458 
3459   unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
3460   unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
3461 
3462   SDLoc SL(N);
3463 
3464   SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
3465   SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
3466   return DAG.getMergeValues({ MulLo, MulHi }, SL);
3467 }
3468 
3469 static bool isNegativeOne(SDValue Val) {
3470   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val))
3471     return C->isAllOnesValue();
3472   return false;
3473 }
3474 
3475 SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
3476                                           SDValue Op,
3477                                           const SDLoc &DL,
3478                                           unsigned Opc) const {
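  // FFBH/FFBL are 32-bit operations: a legal 16-bit input is widened to i32,
  // counted there, and the result truncated back to the original type.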
3479   EVT VT = Op.getValueType();
3480   EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
3481   if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
3482                               LegalVT != MVT::i16))
3483     return SDValue();
3484 
3485   if (VT != MVT::i32)
3486     Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
3487 
3488   SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
3489   if (VT != MVT::i32)
3490     FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
3491 
3492   return FFBX;
3493 }
3494 
3495 // The native instructions return -1 on 0 input. Optimize out a select that
3496 // produces -1 on 0.
3497 //
3498 // TODO: If zero is not undef, we could also do this if the output is compared
3499 // against the bitwidth.
3500 //
3501 // TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
3502 SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
3503                                                  SDValue LHS, SDValue RHS,
3504                                                  DAGCombinerInfo &DCI) const {
3505   ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
3506   if (!CmpRhs || !CmpRhs->isNullValue())
3507     return SDValue();
3508 
3509   SelectionDAG &DAG = DCI.DAG;
3510   ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
3511   SDValue CmpLHS = Cond.getOperand(0);
3512 
  // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
  // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
  if (CCOpcode == ISD::SETEQ &&
      (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
      RHS.getOperand(0) == CmpLHS &&
      isNegativeOne(LHS)) {
    unsigned Opc = isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32
                                              : AMDGPUISD::FFBH_U32;
    return getFFBX_U32(DAG, CmpLHS, SL, Opc);
  }

  // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
  // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
  if (CCOpcode == ISD::SETNE &&
      (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
      LHS.getOperand(0) == CmpLHS &&
      isNegativeOne(RHS)) {
    unsigned Opc = isCttzOpc(LHS.getOpcode()) ? AMDGPUISD::FFBL_B32
                                              : AMDGPUISD::FFBH_U32;
    return getFFBX_U32(DAG, CmpLHS, SL, Opc);
  }
3533 
3534   return SDValue();
3535 }
3536 
3537 static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
3538                                          unsigned Op,
3539                                          const SDLoc &SL,
3540                                          SDValue Cond,
3541                                          SDValue N1,
3542                                          SDValue N2) {
3543   SelectionDAG &DAG = DCI.DAG;
3544   EVT VT = N1.getValueType();
3545 
3546   SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
3547                                   N1.getOperand(0), N2.getOperand(0));
3548   DCI.AddToWorklist(NewSelect.getNode());
3549   return DAG.getNode(Op, SL, VT, NewSelect);
3550 }
3551 
3552 // Pull a free FP operation out of a select so it may fold into uses.
3553 //
3554 // select c, (fneg x), (fneg y) -> fneg (select c, x, y)
3555 // select c, (fneg x), k -> fneg (select c, x, (fneg k))
3556 //
3557 // select c, (fabs x), (fabs y) -> fabs (select c, x, y)
3558 // select c, (fabs x), +k -> fabs (select c, x, k)
3559 static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
3560                                     SDValue N) {
3561   SelectionDAG &DAG = DCI.DAG;
3562   SDValue Cond = N.getOperand(0);
3563   SDValue LHS = N.getOperand(1);
3564   SDValue RHS = N.getOperand(2);
3565 
3566   EVT VT = N.getValueType();
3567   if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
3568       (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
3569     return distributeOpThroughSelect(DCI, LHS.getOpcode(),
3570                                      SDLoc(N), Cond, LHS, RHS);
3571   }
3572 
3573   bool Inv = false;
3574   if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
3575     std::swap(LHS, RHS);
3576     Inv = true;
3577   }
3578 
3579   // TODO: Support vector constants.
3580   ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
3581   if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS) {
3582     SDLoc SL(N);
3583     // If one side is an fneg/fabs and the other is a constant, we can push the
3584     // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
3585     SDValue NewLHS = LHS.getOperand(0);
3586     SDValue NewRHS = RHS;
3587 
3588     // Careful: if the neg can be folded up, don't try to pull it back down.
3589     bool ShouldFoldNeg = true;
3590 
3591     if (NewLHS.hasOneUse()) {
3592       unsigned Opc = NewLHS.getOpcode();
3593       if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(Opc))
3594         ShouldFoldNeg = false;
3595       if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
3596         ShouldFoldNeg = false;
3597     }
3598 
3599     if (ShouldFoldNeg) {
3600       if (LHS.getOpcode() == ISD::FNEG)
3601         NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3602       else if (CRHS->isNegative())
3603         return SDValue();
3604 
3605       if (Inv)
3606         std::swap(NewLHS, NewRHS);
3607 
3608       SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
3609                                       Cond, NewLHS, NewRHS);
3610       DCI.AddToWorklist(NewSelect.getNode());
3611       return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
3612     }
3613   }
3614 
3615   return SDValue();
3616 }
3617 
3618 
3619 SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
3620                                                    DAGCombinerInfo &DCI) const {
3621   if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
3622     return Folded;
3623 
3624   SDValue Cond = N->getOperand(0);
3625   if (Cond.getOpcode() != ISD::SETCC)
3626     return SDValue();
3627 
3628   EVT VT = N->getValueType(0);
3629   SDValue LHS = Cond.getOperand(0);
3630   SDValue RHS = Cond.getOperand(1);
3631   SDValue CC = Cond.getOperand(2);
3632 
3633   SDValue True = N->getOperand(1);
3634   SDValue False = N->getOperand(2);
3635 
3636   if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
3637     SelectionDAG &DAG = DCI.DAG;
3638     if (DAG.isConstantValueOfAnyType(True) &&
3639         !DAG.isConstantValueOfAnyType(False)) {
3640       // Swap cmp + select pair to move constant to false input.
3641       // This will allow using VOPC cndmasks more often.
3642       // select (setcc x, y), k, x -> select (setccinv x, y), x, k
3643 
3644       SDLoc SL(N);
3645       ISD::CondCode NewCC =
3646           getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), LHS.getValueType());
3647 
3648       SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
3649       return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
3650     }
3651 
3652     if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
3653       SDValue MinMax
3654         = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
3655       // Revisit this node so we can catch min3/max3/med3 patterns.
3656       //DCI.AddToWorklist(MinMax.getNode());
3657       return MinMax;
3658     }
3659   }
3660 
3661   // There's no reason to not do this if the condition has other uses.
3662   return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
3663 }
3664 
3665 static bool isInv2Pi(const APFloat &APF) {
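  // Bit-exact encodings of 1/(2*pi) in half, single, and double precision.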
3666   static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
3667   static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
3668   static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
3669 
3670   return APF.bitwiseIsEqual(KF16) ||
3671          APF.bitwiseIsEqual(KF32) ||
3672          APF.bitwiseIsEqual(KF64);
3673 }
3674 
// Negating 0 or 1.0 / (2.0 * pi) produces a value with no inline immediate
// encoding, so there is an additional cost to negate these constants.
3677 bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const {
3678   if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N)) {
3679     if (C->isZero() && !C->isNegative())
3680       return true;
3681 
3682     if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
3683       return true;
3684   }
3685 
3686   return false;
3687 }
3688 
3689 static unsigned inverseMinMax(unsigned Opc) {
3690   switch (Opc) {
3691   case ISD::FMAXNUM:
3692     return ISD::FMINNUM;
3693   case ISD::FMINNUM:
3694     return ISD::FMAXNUM;
3695   case ISD::FMAXNUM_IEEE:
3696     return ISD::FMINNUM_IEEE;
3697   case ISD::FMINNUM_IEEE:
3698     return ISD::FMAXNUM_IEEE;
3699   case AMDGPUISD::FMAX_LEGACY:
3700     return AMDGPUISD::FMIN_LEGACY;
3701   case AMDGPUISD::FMIN_LEGACY:
    return AMDGPUISD::FMAX_LEGACY;
3703   default:
3704     llvm_unreachable("invalid min/max opcode");
3705   }
3706 }
3707 
3708 SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
3709                                                  DAGCombinerInfo &DCI) const {
3710   SelectionDAG &DAG = DCI.DAG;
3711   SDValue N0 = N->getOperand(0);
3712   EVT VT = N->getValueType(0);
3713 
3714   unsigned Opc = N0.getOpcode();
3715 
3716   // If the input has multiple uses and we can either fold the negate down, or
3717   // the other uses cannot, give up. This both prevents unprofitable
3718   // transformations and infinite loops: we won't repeatedly try to fold around
3719   // a negate that has no 'good' form.
3720   if (N0.hasOneUse()) {
3721     // This may be able to fold into the source, but at a code size cost. Don't
3722     // fold if the fold into the user is free.
3723     if (allUsesHaveSourceMods(N, 0))
3724       return SDValue();
3725   } else {
3726     if (fnegFoldsIntoOp(Opc) &&
3727         (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode())))
3728       return SDValue();
3729   }
3730 
3731   SDLoc SL(N);
3732   switch (Opc) {
3733   case ISD::FADD: {
3734     if (!mayIgnoreSignedZero(N0))
3735       return SDValue();
3736 
3737     // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
3738     SDValue LHS = N0.getOperand(0);
3739     SDValue RHS = N0.getOperand(1);
3740 
3741     if (LHS.getOpcode() != ISD::FNEG)
3742       LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
3743     else
3744       LHS = LHS.getOperand(0);
3745 
3746     if (RHS.getOpcode() != ISD::FNEG)
3747       RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3748     else
3749       RHS = RHS.getOperand(0);
3750 
3751     SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
3752     if (Res.getOpcode() != ISD::FADD)
3753       return SDValue(); // Op got folded away.
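    // If the original fadd had other users, recreate its value for them as the
    // negation of the new node.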
3754     if (!N0.hasOneUse())
3755       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3756     return Res;
3757   }
3758   case ISD::FMUL:
3759   case AMDGPUISD::FMUL_LEGACY: {
3760     // (fneg (fmul x, y)) -> (fmul x, (fneg y))
3761     // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
3762     SDValue LHS = N0.getOperand(0);
3763     SDValue RHS = N0.getOperand(1);
3764 
3765     if (LHS.getOpcode() == ISD::FNEG)
3766       LHS = LHS.getOperand(0);
3767     else if (RHS.getOpcode() == ISD::FNEG)
3768       RHS = RHS.getOperand(0);
3769     else
3770       RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3771 
3772     SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
3773     if (Res.getOpcode() != Opc)
3774       return SDValue(); // Op got folded away.
3775     if (!N0.hasOneUse())
3776       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3777     return Res;
3778   }
3779   case ISD::FMA:
3780   case ISD::FMAD: {
3781     if (!mayIgnoreSignedZero(N0))
3782       return SDValue();
3783 
3784     // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
3785     SDValue LHS = N0.getOperand(0);
3786     SDValue MHS = N0.getOperand(1);
3787     SDValue RHS = N0.getOperand(2);
3788 
3789     if (LHS.getOpcode() == ISD::FNEG)
3790       LHS = LHS.getOperand(0);
3791     else if (MHS.getOpcode() == ISD::FNEG)
3792       MHS = MHS.getOperand(0);
3793     else
3794       MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
3795 
3796     if (RHS.getOpcode() != ISD::FNEG)
3797       RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3798     else
3799       RHS = RHS.getOperand(0);
3800 
3801     SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
3802     if (Res.getOpcode() != Opc)
3803       return SDValue(); // Op got folded away.
3804     if (!N0.hasOneUse())
3805       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3806     return Res;
3807   }
3808   case ISD::FMAXNUM:
3809   case ISD::FMINNUM:
3810   case ISD::FMAXNUM_IEEE:
3811   case ISD::FMINNUM_IEEE:
3812   case AMDGPUISD::FMAX_LEGACY:
3813   case AMDGPUISD::FMIN_LEGACY: {
3814     // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
3815     // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
3816     // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
3817     // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
3818 
3819     SDValue LHS = N0.getOperand(0);
3820     SDValue RHS = N0.getOperand(1);
3821 
3822     // 0 doesn't have a negated inline immediate.
3823     // TODO: This constant check should be generalized to other operations.
3824     if (isConstantCostlierToNegate(RHS))
3825       return SDValue();
3826 
3827     SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
3828     SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3829     unsigned Opposite = inverseMinMax(Opc);
3830 
3831     SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
3832     if (Res.getOpcode() != Opposite)
3833       return SDValue(); // Op got folded away.
3834     if (!N0.hasOneUse())
3835       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3836     return Res;
3837   }
3838   case AMDGPUISD::FMED3: {
3839     SDValue Ops[3];
3840     for (unsigned I = 0; I < 3; ++I)
3841       Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags());
3842 
3843     SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags());
3844     if (Res.getOpcode() != AMDGPUISD::FMED3)
3845       return SDValue(); // Op got folded away.
3846     if (!N0.hasOneUse())
3847       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3848     return Res;
3849   }
3850   case ISD::FP_EXTEND:
3851   case ISD::FTRUNC:
3852   case ISD::FRINT:
3853   case ISD::FNEARBYINT: // XXX - Should fround be handled?
3854   case ISD::FSIN:
3855   case ISD::FCANONICALIZE:
3856   case AMDGPUISD::RCP:
3857   case AMDGPUISD::RCP_LEGACY:
3858   case AMDGPUISD::RCP_IFLAG:
3859   case AMDGPUISD::SIN_HW: {
3860     SDValue CvtSrc = N0.getOperand(0);
3861     if (CvtSrc.getOpcode() == ISD::FNEG) {
3862       // (fneg (fp_extend (fneg x))) -> (fp_extend x)
3863       // (fneg (rcp (fneg x))) -> (rcp x)
3864       return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
3865     }
3866 
3867     if (!N0.hasOneUse())
3868       return SDValue();
3869 
3870     // (fneg (fp_extend x)) -> (fp_extend (fneg x))
3871     // (fneg (rcp x)) -> (rcp (fneg x))
3872     SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
3873     return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
3874   }
3875   case ISD::FP_ROUND: {
3876     SDValue CvtSrc = N0.getOperand(0);
3877 
3878     if (CvtSrc.getOpcode() == ISD::FNEG) {
3879       // (fneg (fp_round (fneg x))) -> (fp_round x)
3880       return DAG.getNode(ISD::FP_ROUND, SL, VT,
3881                          CvtSrc.getOperand(0), N0.getOperand(1));
3882     }
3883 
3884     if (!N0.hasOneUse())
3885       return SDValue();
3886 
3887     // (fneg (fp_round x)) -> (fp_round (fneg x))
3888     SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
3889     return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
3890   }
3891   case ISD::FP16_TO_FP: {
3892     // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
3893     // f16, but legalization of f16 fneg ends up pulling it out of the source.
3894     // Put the fneg back as a legal source operation that can be matched later.
3895     SDLoc SL(N);
3896 
3897     SDValue Src = N0.getOperand(0);
3898     EVT SrcVT = Src.getValueType();
3899 
3900     // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
3901     SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
3902                                   DAG.getConstant(0x8000, SL, SrcVT));
3903     return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
3904   }
3905   default:
3906     return SDValue();
3907   }
3908 }
3909 
3910 SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
3911                                                  DAGCombinerInfo &DCI) const {
3912   SelectionDAG &DAG = DCI.DAG;
3913   SDValue N0 = N->getOperand(0);
3914 
3915   if (!N0.hasOneUse())
3916     return SDValue();
3917 
3918   switch (N0.getOpcode()) {
3919   case ISD::FP16_TO_FP: {
3920     assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
3921     SDLoc SL(N);
3922     SDValue Src = N0.getOperand(0);
3923     EVT SrcVT = Src.getValueType();
3924 
3925     // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
3926     SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
3927                                   DAG.getConstant(0x7fff, SL, SrcVT));
3928     return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
3929   }
3930   default:
3931     return SDValue();
3932   }
3933 }
3934 
3935 SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
3936                                                 DAGCombinerInfo &DCI) const {
3937   const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
3938   if (!CFP)
3939     return SDValue();
3940 
3941   // XXX - Should this flush denormals?
3942   const APFloat &Val = CFP->getValueAPF();
3943   APFloat One(Val.getSemantics(), "1.0");
3944   return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
3945 }
3946 
3947 SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
3948                                                 DAGCombinerInfo &DCI) const {
3949   SelectionDAG &DAG = DCI.DAG;
3950   SDLoc DL(N);
3951 
3952   switch(N->getOpcode()) {
3953   default:
3954     break;
3955   case ISD::BITCAST: {
3956     EVT DestVT = N->getValueType(0);
3957 
3958     // Push casts through vector builds. This helps avoid emitting a large
3959     // number of copies when materializing floating point vector constants.
3960     //
3961     // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
    //   vNt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
3963     if (DestVT.isVector()) {
3964       SDValue Src = N->getOperand(0);
3965       if (Src.getOpcode() == ISD::BUILD_VECTOR) {
3966         EVT SrcVT = Src.getValueType();
3967         unsigned NElts = DestVT.getVectorNumElements();
3968 
3969         if (SrcVT.getVectorNumElements() == NElts) {
3970           EVT DestEltVT = DestVT.getVectorElementType();
3971 
3972           SmallVector<SDValue, 8> CastedElts;
3973           SDLoc SL(N);
3974           for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
3975             SDValue Elt = Src.getOperand(I);
3976             CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
3977           }
3978 
3979           return DAG.getBuildVector(DestVT, SL, CastedElts);
3980         }
3981       }
3982     }
3983 
3984     if (DestVT.getSizeInBits() != 64 && !DestVT.isVector())
3985       break;
3986 
3987     // Fold bitcasts of constants.
3988     //
3989     // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
3990     // TODO: Generalize and move to DAGCombiner
3991     SDValue Src = N->getOperand(0);
3992     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
3993       if (Src.getValueType() == MVT::i64) {
3994         SDLoc SL(N);
3995         uint64_t CVal = C->getZExtValue();
3996         SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
3997                                  DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
3998                                  DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
3999         return DAG.getNode(ISD::BITCAST, SL, DestVT, BV);
4000       }
4001     }
4002 
4003     if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
4004       const APInt &Val = C->getValueAPF().bitcastToAPInt();
4005       SDLoc SL(N);
4006       uint64_t CVal = Val.getZExtValue();
4007       SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
4008                                 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
4009                                 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
4010 
4011       return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
4012     }
4013 
4014     break;
4015   }
4016   case ISD::SHL: {
4017     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
4018       break;
4019 
4020     return performShlCombine(N, DCI);
4021   }
4022   case ISD::SRL: {
4023     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
4024       break;
4025 
4026     return performSrlCombine(N, DCI);
4027   }
4028   case ISD::SRA: {
4029     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
4030       break;
4031 
4032     return performSraCombine(N, DCI);
4033   }
4034   case ISD::TRUNCATE:
4035     return performTruncateCombine(N, DCI);
4036   case ISD::MUL:
4037     return performMulCombine(N, DCI);
4038   case ISD::MULHS:
4039     return performMulhsCombine(N, DCI);
4040   case ISD::MULHU:
4041     return performMulhuCombine(N, DCI);
4042   case AMDGPUISD::MUL_I24:
4043   case AMDGPUISD::MUL_U24:
4044   case AMDGPUISD::MULHI_I24:
4045   case AMDGPUISD::MULHI_U24: {
4046     if (SDValue V = simplifyI24(N, DCI))
4047       return V;
4048     return SDValue();
4049   }
4050   case AMDGPUISD::MUL_LOHI_I24:
4051   case AMDGPUISD::MUL_LOHI_U24:
4052     return performMulLoHi24Combine(N, DCI);
4053   case ISD::SELECT:
4054     return performSelectCombine(N, DCI);
4055   case ISD::FNEG:
4056     return performFNegCombine(N, DCI);
4057   case ISD::FABS:
4058     return performFAbsCombine(N, DCI);
4059   case AMDGPUISD::BFE_I32:
4060   case AMDGPUISD::BFE_U32: {
4061     assert(!N->getValueType(0).isVector() &&
4062            "Vector handling of BFE not implemented");
4063     ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
4064     if (!Width)
4065       break;
4066 
4067     uint32_t WidthVal = Width->getZExtValue() & 0x1f;
4068     if (WidthVal == 0)
4069       return DAG.getConstant(0, DL, MVT::i32);
4070 
4071     ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
4072     if (!Offset)
4073       break;
4074 
4075     SDValue BitsFrom = N->getOperand(0);
4076     uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
4077 
4078     bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
4079 
4080     if (OffsetVal == 0) {
4081       // This is already sign / zero extended, so try to fold away extra BFEs.
4082       unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
4083 
4084       unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
4085       if (OpSignBits >= SignBits)
4086         return BitsFrom;
4087 
4088       EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
4089       if (Signed) {
4090         // This is a sign_extend_inreg. Replace it to take advantage of existing
4091         // DAG Combines. If not eliminated, we will match back to BFE during
4092         // selection.
4093 
4094         // TODO: The sext_inreg of extended types ends up split into multiple
4095         // operations, although we could handle them in a single BFE.
4096         return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
4097                            DAG.getValueType(SmallVT));
4098       }
4099 
4100       return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
4101     }
4102 
4103     if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
4104       if (Signed) {
4105         return constantFoldBFE<int32_t>(DAG,
4106                                         CVal->getSExtValue(),
4107                                         OffsetVal,
4108                                         WidthVal,
4109                                         DL);
4110       }
4111 
4112       return constantFoldBFE<uint32_t>(DAG,
4113                                        CVal->getZExtValue(),
4114                                        OffsetVal,
4115                                        WidthVal,
4116                                        DL);
4117     }
4118 
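         // If the extracted field reaches bit 31, the BFE reduces to a plain
         // shift right (arithmetic for the signed form). Keep the 16-bit
         // extract of the upper half when SDWA is available, since it can be
         // done as a sub-dword operation.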
4119     if ((OffsetVal + WidthVal) >= 32 &&
4120         !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
4121       SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
4122       return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
4123                          BitsFrom, ShiftVal);
4124     }
4125 
4126     if (BitsFrom.hasOneUse()) {
4127       APInt Demanded = APInt::getBitsSet(32,
4128                                          OffsetVal,
4129                                          OffsetVal + WidthVal);
4130 
4131       KnownBits Known;
4132       TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
4133                                             !DCI.isBeforeLegalizeOps());
4134       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
4135       if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
4136           TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
4137         DCI.CommitTargetLoweringOpt(TLO);
4138       }
4139     }
4140 
4141     break;
4142   }
4143   case ISD::LOAD:
4144     return performLoadCombine(N, DCI);
4145   case ISD::STORE:
4146     return performStoreCombine(N, DCI);
4147   case AMDGPUISD::RCP:
4148   case AMDGPUISD::RCP_IFLAG:
4149     return performRcpCombine(N, DCI);
4150   case ISD::AssertZext:
4151   case ISD::AssertSext:
4152     return performAssertSZExtCombine(N, DCI);
4153   case ISD::INTRINSIC_WO_CHAIN:
4154     return performIntrinsicWOChainCombine(N, DCI);
4155   }
4156   return SDValue();
4157 }
4158 
4159 //===----------------------------------------------------------------------===//
4160 // Helper functions
4161 //===----------------------------------------------------------------------===//
4162 
4163 SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
4164                                                    const TargetRegisterClass *RC,
4165                                                    Register Reg, EVT VT,
4166                                                    const SDLoc &SL,
4167                                                    bool RawReg) const {
4168   MachineFunction &MF = DAG.getMachineFunction();
4169   MachineRegisterInfo &MRI = MF.getRegInfo();
4170   Register VReg;
4171 
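       // The physical register is added as a live-in only once; reuse its
       // existing virtual register on subsequent calls.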
4172   if (!MRI.isLiveIn(Reg)) {
4173     VReg = MRI.createVirtualRegister(RC);
4174     MRI.addLiveIn(Reg, VReg);
4175   } else {
4176     VReg = MRI.getLiveInVirtReg(Reg);
4177   }
4178 
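       // With RawReg the caller gets the register node itself instead of a
       // value copied from it at the function entry.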
4179   if (RawReg)
4180     return DAG.getRegister(VReg, VT);
4181 
4182   return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
4183 }
4184 
4185 // This may be called multiple times, and nothing prevents creating multiple
4186 // objects at the same offset. See if we already defined this object.
4187 static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size,
4188                                        int64_t Offset) {
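       // Fixed stack objects have negative frame indices, so scanning
       // [ObjectIndexBegin, 0) covers all of them.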
4189   for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
4190     if (MFI.getObjectOffset(I) == Offset) {
4191       assert(MFI.getObjectSize(I) == Size);
4192       return I;
4193     }
4194   }
4195 
4196   return MFI.CreateFixedObject(Size, Offset, true);
4197 }
4198 
4199 SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
4200                                                   EVT VT,
4201                                                   const SDLoc &SL,
4202                                                   int64_t Offset) const {
4203   MachineFunction &MF = DAG.getMachineFunction();
4204   MachineFrameInfo &MFI = MF.getFrameInfo();
4205   int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset);
4206 
4207   auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
4208   SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
4209 
4210   return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, 4,
4211                      MachineMemOperand::MODereferenceable |
4212                      MachineMemOperand::MOInvariant);
4213 }
4214 
4215 SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
4216                                                    const SDLoc &SL,
4217                                                    SDValue Chain,
4218                                                    SDValue ArgVal,
4219                                                    int64_t Offset) const {
4220   MachineFunction &MF = DAG.getMachineFunction();
4221   MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
4222 
4223   SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
4224   SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, 4,
4225                                MachineMemOperand::MODereferenceable);
4226   return Store;
4227 }
4228 
4229 SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
4230                                              const TargetRegisterClass *RC,
4231                                              EVT VT, const SDLoc &SL,
4232                                              const ArgDescriptor &Arg) const {
4233   assert(Arg && "Attempting to load missing argument");
4234 
4235   SDValue V = Arg.isRegister() ?
4236     CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) :
4237     loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
4238 
4239   if (!Arg.isMasked())
4240     return V;
4241 
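       // A masked argument occupies a bit-field of the register: shift the
       // field down to bit 0, then mask off its width.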
4242   unsigned Mask = Arg.getMask();
4243   unsigned Shift = countTrailingZeros<unsigned>(Mask);
4244   V = DAG.getNode(ISD::SRL, SL, VT, V,
4245                   DAG.getShiftAmountConstant(Shift, VT, SL));
4246   return DAG.getNode(ISD::AND, SL, VT, V,
4247                      DAG.getConstant(Mask >> Shift, SL, VT));
4248 }
4249 
4250 uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
4251     const MachineFunction &MF, const ImplicitParameter Param) const {
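       // Implicit parameters are placed after the explicit kernel arguments,
       // with the explicit argument size rounded up to the implicit argument
       // alignment.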
4252   const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
4253   const AMDGPUSubtarget &ST =
4254       AMDGPUSubtarget::get(getTargetMachine(), MF.getFunction());
4255   unsigned ExplicitArgOffset = ST.getExplicitKernelArgOffset(MF.getFunction());
4256   const Align Alignment = ST.getAlignmentForImplicitArgPtr();
4257   uint64_t ArgOffset = alignTo(MFI->getExplicitKernArgSize(), Alignment) +
4258                        ExplicitArgOffset;
4259   switch (Param) {
4260   case GRID_DIM:
4261     return ArgOffset;
4262   case GRID_OFFSET:
4263     return ArgOffset + 4;
4264   }
4265   llvm_unreachable("unexpected implicit parameter type");
4266 }
4267 
4268 #define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
4269 
4270 const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
4271   switch ((AMDGPUISD::NodeType)Opcode) {
4272   case AMDGPUISD::FIRST_NUMBER: break;
4273   // AMDIL DAG nodes
4274   NODE_NAME_CASE(UMUL)
4275   NODE_NAME_CASE(BRANCH_COND)
4276 
4277   // AMDGPU DAG nodes
4278   NODE_NAME_CASE(IF)
4279   NODE_NAME_CASE(ELSE)
4280   NODE_NAME_CASE(LOOP)
4281   NODE_NAME_CASE(CALL)
4282   NODE_NAME_CASE(TC_RETURN)
4283   NODE_NAME_CASE(TRAP)
4284   NODE_NAME_CASE(RET_FLAG)
4285   NODE_NAME_CASE(RETURN_TO_EPILOG)
4286   NODE_NAME_CASE(ENDPGM)
4287   NODE_NAME_CASE(DWORDADDR)
4288   NODE_NAME_CASE(FRACT)
4289   NODE_NAME_CASE(SETCC)
4290   NODE_NAME_CASE(SETREG)
4291   NODE_NAME_CASE(DENORM_MODE)
4292   NODE_NAME_CASE(FMA_W_CHAIN)
4293   NODE_NAME_CASE(FMUL_W_CHAIN)
4294   NODE_NAME_CASE(CLAMP)
4295   NODE_NAME_CASE(COS_HW)
4296   NODE_NAME_CASE(SIN_HW)
4297   NODE_NAME_CASE(FMAX_LEGACY)
4298   NODE_NAME_CASE(FMIN_LEGACY)
4299   NODE_NAME_CASE(FMAX3)
4300   NODE_NAME_CASE(SMAX3)
4301   NODE_NAME_CASE(UMAX3)
4302   NODE_NAME_CASE(FMIN3)
4303   NODE_NAME_CASE(SMIN3)
4304   NODE_NAME_CASE(UMIN3)
4305   NODE_NAME_CASE(FMED3)
4306   NODE_NAME_CASE(SMED3)
4307   NODE_NAME_CASE(UMED3)
4308   NODE_NAME_CASE(FDOT2)
4309   NODE_NAME_CASE(URECIP)
4310   NODE_NAME_CASE(DIV_SCALE)
4311   NODE_NAME_CASE(DIV_FMAS)
4312   NODE_NAME_CASE(DIV_FIXUP)
4313   NODE_NAME_CASE(FMAD_FTZ)
4314   NODE_NAME_CASE(TRIG_PREOP)
4315   NODE_NAME_CASE(RCP)
4316   NODE_NAME_CASE(RSQ)
4317   NODE_NAME_CASE(RCP_LEGACY)
4318   NODE_NAME_CASE(RCP_IFLAG)
4319   NODE_NAME_CASE(FMUL_LEGACY)
4320   NODE_NAME_CASE(RSQ_CLAMP)
4321   NODE_NAME_CASE(LDEXP)
4322   NODE_NAME_CASE(FP_CLASS)
4323   NODE_NAME_CASE(DOT4)
4324   NODE_NAME_CASE(CARRY)
4325   NODE_NAME_CASE(BORROW)
4326   NODE_NAME_CASE(BFE_U32)
4327   NODE_NAME_CASE(BFE_I32)
4328   NODE_NAME_CASE(BFI)
4329   NODE_NAME_CASE(BFM)
4330   NODE_NAME_CASE(FFBH_U32)
4331   NODE_NAME_CASE(FFBH_I32)
4332   NODE_NAME_CASE(FFBL_B32)
4333   NODE_NAME_CASE(MUL_U24)
4334   NODE_NAME_CASE(MUL_I24)
4335   NODE_NAME_CASE(MULHI_U24)
4336   NODE_NAME_CASE(MULHI_I24)
4337   NODE_NAME_CASE(MUL_LOHI_U24)
4338   NODE_NAME_CASE(MUL_LOHI_I24)
4339   NODE_NAME_CASE(MAD_U24)
4340   NODE_NAME_CASE(MAD_I24)
4341   NODE_NAME_CASE(MAD_I64_I32)
4342   NODE_NAME_CASE(MAD_U64_U32)
4343   NODE_NAME_CASE(PERM)
4344   NODE_NAME_CASE(TEXTURE_FETCH)
4345   NODE_NAME_CASE(R600_EXPORT)
4346   NODE_NAME_CASE(CONST_ADDRESS)
4347   NODE_NAME_CASE(REGISTER_LOAD)
4348   NODE_NAME_CASE(REGISTER_STORE)
4349   NODE_NAME_CASE(SAMPLE)
4350   NODE_NAME_CASE(SAMPLEB)
4351   NODE_NAME_CASE(SAMPLED)
4352   NODE_NAME_CASE(SAMPLEL)
4353   NODE_NAME_CASE(CVT_F32_UBYTE0)
4354   NODE_NAME_CASE(CVT_F32_UBYTE1)
4355   NODE_NAME_CASE(CVT_F32_UBYTE2)
4356   NODE_NAME_CASE(CVT_F32_UBYTE3)
4357   NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
4358   NODE_NAME_CASE(CVT_PKNORM_I16_F32)
4359   NODE_NAME_CASE(CVT_PKNORM_U16_F32)
4360   NODE_NAME_CASE(CVT_PK_I16_I32)
4361   NODE_NAME_CASE(CVT_PK_U16_U32)
4362   NODE_NAME_CASE(FP_TO_FP16)
4363   NODE_NAME_CASE(FP16_ZEXT)
4364   NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
4365   NODE_NAME_CASE(CONST_DATA_PTR)
4366   NODE_NAME_CASE(PC_ADD_REL_OFFSET)
4367   NODE_NAME_CASE(LDS)
4368   NODE_NAME_CASE(DUMMY_CHAIN)
4369   case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
4370   NODE_NAME_CASE(LOAD_D16_HI)
4371   NODE_NAME_CASE(LOAD_D16_LO)
4372   NODE_NAME_CASE(LOAD_D16_HI_I8)
4373   NODE_NAME_CASE(LOAD_D16_HI_U8)
4374   NODE_NAME_CASE(LOAD_D16_LO_I8)
4375   NODE_NAME_CASE(LOAD_D16_LO_U8)
4376   NODE_NAME_CASE(STORE_MSKOR)
4377   NODE_NAME_CASE(LOAD_CONSTANT)
4378   NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
4379   NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16)
4380   NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
4381   NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
4382   NODE_NAME_CASE(DS_ORDERED_COUNT)
4383   NODE_NAME_CASE(ATOMIC_CMP_SWAP)
4384   NODE_NAME_CASE(ATOMIC_INC)
4385   NODE_NAME_CASE(ATOMIC_DEC)
4386   NODE_NAME_CASE(ATOMIC_LOAD_FMIN)
4387   NODE_NAME_CASE(ATOMIC_LOAD_FMAX)
4388   NODE_NAME_CASE(BUFFER_LOAD)
4389   NODE_NAME_CASE(BUFFER_LOAD_UBYTE)
4390   NODE_NAME_CASE(BUFFER_LOAD_USHORT)
4391   NODE_NAME_CASE(BUFFER_LOAD_BYTE)
4392   NODE_NAME_CASE(BUFFER_LOAD_SHORT)
4393   NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
4394   NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
4395   NODE_NAME_CASE(SBUFFER_LOAD)
4396   NODE_NAME_CASE(BUFFER_STORE)
4397   NODE_NAME_CASE(BUFFER_STORE_BYTE)
4398   NODE_NAME_CASE(BUFFER_STORE_SHORT)
4399   NODE_NAME_CASE(BUFFER_STORE_FORMAT)
4400   NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)
4401   NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)
4402   NODE_NAME_CASE(BUFFER_ATOMIC_ADD)
4403   NODE_NAME_CASE(BUFFER_ATOMIC_SUB)
4404   NODE_NAME_CASE(BUFFER_ATOMIC_SMIN)
4405   NODE_NAME_CASE(BUFFER_ATOMIC_UMIN)
4406   NODE_NAME_CASE(BUFFER_ATOMIC_SMAX)
4407   NODE_NAME_CASE(BUFFER_ATOMIC_UMAX)
4408   NODE_NAME_CASE(BUFFER_ATOMIC_AND)
4409   NODE_NAME_CASE(BUFFER_ATOMIC_OR)
4410   NODE_NAME_CASE(BUFFER_ATOMIC_XOR)
4411   NODE_NAME_CASE(BUFFER_ATOMIC_INC)
4412   NODE_NAME_CASE(BUFFER_ATOMIC_DEC)
4413   NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
4414   NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
4415   NODE_NAME_CASE(BUFFER_ATOMIC_PK_FADD)
4416   NODE_NAME_CASE(ATOMIC_PK_FADD)
4417 
4418   case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
4419   }
4420   return nullptr;
4421 }
4422 
4423 SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
4424                                               SelectionDAG &DAG, int Enabled,
4425                                               int &RefinementSteps,
4426                                               bool &UseOneConstNR,
4427                                               bool Reciprocal) const {
4428   EVT VT = Operand.getValueType();
4429 
4430   if (VT == MVT::f32) {
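         // The f32 rsq instruction is used directly, with no Newton-Raphson
         // refinement steps.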
4431     RefinementSteps = 0;
4432     return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
4433   }
4434 
4435   // TODO: There is also an f64 rsq instruction, but the documentation is less
4436   // clear on its precision.
4437 
4438   return SDValue();
4439 }
4440 
4441 SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
4442                                                SelectionDAG &DAG, int Enabled,
4443                                                int &RefinementSteps) const {
4444   EVT VT = Operand.getValueType();
4445 
4446   if (VT == MVT::f32) {
4447     // Reciprocal, < 1 ulp error.
4448     //
4449     // This reciprocal approximation converges to < 0.5 ulp error with one
4450     // Newton-Raphson iteration performed with two fused multiply-adds (FMAs).
4451 
4452     RefinementSteps = 0;
4453     return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
4454   }
4455 
4456   // TODO: There is also an f64 rcp instruction, but the documentation is less
4457   // clear on its precision.
4458 
4459   return SDValue();
4460 }
4461 
4462 void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
4463     const SDValue Op, KnownBits &Known,
4464     const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
4465 
4466   Known.resetAll(); // Don't know anything.
4467 
4468   unsigned Opc = Op.getOpcode();
4469 
4470   switch (Opc) {
4471   default:
4472     break;
4473   case AMDGPUISD::CARRY:
4474   case AMDGPUISD::BORROW: {
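         // CARRY / BORROW produce either 0 or 1, so all bits above bit 0 are
         // known zero.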
4475     Known.Zero = APInt::getHighBitsSet(32, 31);
4476     break;
4477   }
4478 
4479   case AMDGPUISD::BFE_I32:
4480   case AMDGPUISD::BFE_U32: {
4481     ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4482     if (!CWidth)
4483       return;
4484 
4485     uint32_t Width = CWidth->getZExtValue() & 0x1f;
4486 
4487     if (Opc == AMDGPUISD::BFE_U32)
4488       Known.Zero = APInt::getHighBitsSet(32, 32 - Width);
4489 
4490     break;
4491   }
4492   case AMDGPUISD::FP_TO_FP16:
4493   case AMDGPUISD::FP16_ZEXT: {
4494     unsigned BitWidth = Known.getBitWidth();
4495 
4496     // High bits are zero.
4497     Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
4498     break;
4499   }
4500   case AMDGPUISD::MUL_U24:
4501   case AMDGPUISD::MUL_I24: {
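         // Low bits: the product has at least as many trailing zeros as the
         // two operands combined. High bits: the operands are implicitly
         // truncated to 24 bits, and an m-bit by n-bit product needs at most
         // m + n bits, so any bits above that are known (zero for a
         // non-negative product, one for a negative one).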
4502     KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
4503     KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
4504     unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
4505                       RHSKnown.countMinTrailingZeros();
4506     Known.Zero.setLowBits(std::min(TrailZ, 32u));
4507     // Skip the extra checks if all bits are already known to be zero.
4508     if (TrailZ >= 32)
4509       break;
4510 
4511     // Truncate to 24 bits.
4512     LHSKnown = LHSKnown.trunc(24);
4513     RHSKnown = RHSKnown.trunc(24);
4514 
4515     if (Opc == AMDGPUISD::MUL_I24) {
4516       unsigned LHSValBits = 24 - LHSKnown.countMinSignBits();
4517       unsigned RHSValBits = 24 - RHSKnown.countMinSignBits();
4518       unsigned MaxValBits = std::min(LHSValBits + RHSValBits, 32u);
4519       if (MaxValBits >= 32)
4520         break;
4521       bool LHSNegative = LHSKnown.isNegative();
4522       bool LHSNonNegative = LHSKnown.isNonNegative();
4523       bool LHSPositive = LHSKnown.isStrictlyPositive();
4524       bool RHSNegative = RHSKnown.isNegative();
4525       bool RHSNonNegative = RHSKnown.isNonNegative();
4526       bool RHSPositive = RHSKnown.isStrictlyPositive();
4527 
4528       if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative))
4529         Known.Zero.setHighBits(32 - MaxValBits);
4530       else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative))
4531         Known.One.setHighBits(32 - MaxValBits);
4532     } else {
4533       unsigned LHSValBits = 24 - LHSKnown.countMinLeadingZeros();
4534       unsigned RHSValBits = 24 - RHSKnown.countMinLeadingZeros();
4535       unsigned MaxValBits = std::min(LHSValBits + RHSValBits, 32u);
4536       if (MaxValBits >= 32)
4537         break;
4538       Known.Zero.setHighBits(32 - MaxValBits);
4539     }
4540     break;
4541   }
4542   case AMDGPUISD::PERM: {
4543     ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4544     if (!CMask)
4545       return;
4546 
4547     KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
4548     KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
4549     unsigned Sel = CMask->getZExtValue();
4550 
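         // Each byte of the selector picks one byte of the result: values 0-3
         // take a byte from the second operand, 4-6 from the first, 0x0c
         // produces 0x00, and values above 0x0c are treated here as producing
         // 0xff. Anything else leaves the byte unknown.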
4551     for (unsigned I = 0; I < 32; I += 8) {
4552       unsigned SelBits = Sel & 0xff;
4553       if (SelBits < 4) {
4554         SelBits *= 8;
4555         Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
4556         Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
4557       } else if (SelBits < 7) {
4558         SelBits = (SelBits & 3) * 8;
4559         Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
4560         Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
4561       } else if (SelBits == 0x0c) {
4562         Known.Zero |= 0xFFull << I;
4563       } else if (SelBits > 0x0c) {
4564         Known.One |= 0xFFull << I;
4565       }
4566       Sel >>= 8;
4567     }
4568     break;
4569   }
4570   case AMDGPUISD::BUFFER_LOAD_UBYTE:  {
4571     Known.Zero.setHighBits(24);
4572     break;
4573   }
4574   case AMDGPUISD::BUFFER_LOAD_USHORT: {
4575     Known.Zero.setHighBits(16);
4576     break;
4577   }
4578   case AMDGPUISD::LDS: {
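         // An LDS address fits in 16 bits, so the high bits are known zero;
         // the global's alignment also clears the corresponding low bits.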
4579     auto GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());
4580     unsigned Align = GA->getGlobal()->getAlignment();
4581 
4582     Known.Zero.setHighBits(16);
4583     if (Align)
4584       Known.Zero.setLowBits(Log2_32(Align));
4585     break;
4586   }
4587   case ISD::INTRINSIC_WO_CHAIN: {
4588     unsigned IID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
4589     switch (IID) {
4590     case Intrinsic::amdgcn_mbcnt_lo:
4591     case Intrinsic::amdgcn_mbcnt_hi: {
4592       const GCNSubtarget &ST =
4593           DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
4594       // These return at most the wavefront size - 1.
4595       unsigned Size = Op.getValueType().getSizeInBits();
4596       Known.Zero.setHighBits(Size - ST.getWavefrontSizeLog2());
4597       break;
4598     }
4599     default:
4600       break;
4601     }
4602   }
4603   }
4604 }
4605 
4606 unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
4607     SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
4608     unsigned Depth) const {
4609   switch (Op.getOpcode()) {
4610   case AMDGPUISD::BFE_I32: {
4611     ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4612     if (!Width)
4613       return 1;
4614 
4615     unsigned SignBits = 32 - Width->getZExtValue() + 1;
4616     if (!isNullConstant(Op.getOperand(1)))
4617       return SignBits;
4618 
4619     // TODO: Could probably figure something out with non-0 offsets.
4620     unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
4621     return std::max(SignBits, Op0SignBits);
4622   }
4623 
4624   case AMDGPUISD::BFE_U32: {
4625     ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4626     return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
4627   }
4628 
4629   case AMDGPUISD::CARRY:
4630   case AMDGPUISD::BORROW:
4631     return 31;
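       // Sign-extending loads of N bits have 32 - N + 1 known sign bits;
       // zero-extending loads have 32 - N, since the known-zero high bits all
       // count as sign bits.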
4632   case AMDGPUISD::BUFFER_LOAD_BYTE:
4633     return 25;
4634   case AMDGPUISD::BUFFER_LOAD_SHORT:
4635     return 17;
4636   case AMDGPUISD::BUFFER_LOAD_UBYTE:
4637     return 24;
4638   case AMDGPUISD::BUFFER_LOAD_USHORT:
4639     return 16;
4640   case AMDGPUISD::FP_TO_FP16:
4641   case AMDGPUISD::FP16_ZEXT:
4642     return 16;
4643   default:
4644     return 1;
4645   }
4646 }
4647 
4648 unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr(
4649   GISelKnownBits &Analysis, Register R,
4650   const APInt &DemandedElts, const MachineRegisterInfo &MRI,
4651   unsigned Depth) const {
4652   const MachineInstr *MI = MRI.getVRegDef(R);
4653   if (!MI)
4654     return 1;
4655 
4656   // TODO: Check range metadata on MMO.
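       // As in ComputeNumSignBitsForTargetNode: a sign-extending N-bit load
       // has 32 - N + 1 sign bits and a zero-extending one has 32 - N.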
4657   switch (MI->getOpcode()) {
4658   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
4659     return 25;
4660   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
4661     return 17;
4662   case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
4663     return 24;
4664   case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
4665     return 16;
4666   default:
4667     return 1;
4668   }
4669 }
4670 
4671 bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
4672                                                         const SelectionDAG &DAG,
4673                                                         bool SNaN,
4674                                                         unsigned Depth) const {
4675   unsigned Opcode = Op.getOpcode();
4676   switch (Opcode) {
4677   case AMDGPUISD::FMIN_LEGACY:
4678   case AMDGPUISD::FMAX_LEGACY: {
4679     if (SNaN)
4680       return true;
4681 
4682     // TODO: It may be enough to check that one of the operands is never NaN,
4683     // but which one?
4684     return false;
4685   }
4686   case AMDGPUISD::FMUL_LEGACY:
4687   case AMDGPUISD::CVT_PKRTZ_F16_F32: {
4688     if (SNaN)
4689       return true;
4690     return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
4691            DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
4692   }
4693   case AMDGPUISD::FMED3:
4694   case AMDGPUISD::FMIN3:
4695   case AMDGPUISD::FMAX3:
4696   case AMDGPUISD::FMAD_FTZ: {
4697     if (SNaN)
4698       return true;
4699     return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
4700            DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
4701            DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
4702   }
4703   case AMDGPUISD::CVT_F32_UBYTE0:
4704   case AMDGPUISD::CVT_F32_UBYTE1:
4705   case AMDGPUISD::CVT_F32_UBYTE2:
4706   case AMDGPUISD::CVT_F32_UBYTE3:
4707     return true;
4708 
4709   case AMDGPUISD::RCP:
4710   case AMDGPUISD::RSQ:
4711   case AMDGPUISD::RCP_LEGACY:
4712   case AMDGPUISD::RSQ_CLAMP: {
4713     if (SNaN)
4714       return true;
4715 
4716     // TODO: Need an is-known-positive check.
4717     return false;
4718   }
4719   case AMDGPUISD::LDEXP:
4720   case AMDGPUISD::FRACT: {
4721     if (SNaN)
4722       return true;
4723     return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
4724   }
4725   case AMDGPUISD::DIV_SCALE:
4726   case AMDGPUISD::DIV_FMAS:
4727   case AMDGPUISD::DIV_FIXUP:
4728   case AMDGPUISD::TRIG_PREOP:
4729     // TODO: Refine on operands.
4730     return SNaN;
4731   case AMDGPUISD::SIN_HW:
4732   case AMDGPUISD::COS_HW: {
4733     // TODO: Need a check for infinity.
4734     return SNaN;
4735   }
4736   case ISD::INTRINSIC_WO_CHAIN: {
4737     unsigned IntrinsicID
4738       = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
4739     // TODO: Handle more intrinsics
4740     switch (IntrinsicID) {
4741     case Intrinsic::amdgcn_cubeid:
4742       return true;
4743 
4744     case Intrinsic::amdgcn_frexp_mant: {
4745       if (SNaN)
4746         return true;
4747       return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
4748     }
4749     case Intrinsic::amdgcn_cvt_pkrtz: {
4750       if (SNaN)
4751         return true;
4752       return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
4753              DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
4754     }
4755     case Intrinsic::amdgcn_rcp:
4756     case Intrinsic::amdgcn_rsq:
4757     case Intrinsic::amdgcn_rcp_legacy:
4758     case Intrinsic::amdgcn_rsq_legacy:
4759     case Intrinsic::amdgcn_rsq_clamp: {
4760       if (SNaN)
4761         return true;
4762 
4763       // TODO: Need an is-known-positive check.
4764       return false;
4765     }
4766     case Intrinsic::amdgcn_fdot2:
4767       // TODO: Refine on operands.
4768       return SNaN;
4769     default:
4770       return false;
4771     }
4772   }
4773   default:
4774     return false;
4775   }
4776 }
4777 
4778 TargetLowering::AtomicExpansionKind
4779 AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
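       // Returning CmpXChg asks the AtomicExpand pass to lower these
       // operations with a compare-and-swap loop; everything else is left to
       // the normal lowering.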
4780   switch (RMW->getOperation()) {
4781   case AtomicRMWInst::Nand:
4782   case AtomicRMWInst::FAdd:
4783   case AtomicRMWInst::FSub:
4784     return AtomicExpansionKind::CmpXChg;
4785   default:
4786     return AtomicExpansionKind::None;
4787   }
4788 }
4789