1 //===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This is the parent TargetLowering class for hardware code gen
11 /// targets.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "AMDGPUISelLowering.h"
16 #include "AMDGPU.h"
17 #include "AMDGPUInstrInfo.h"
18 #include "AMDGPUMachineFunction.h"
19 #include "GCNSubtarget.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "llvm/CodeGen/Analysis.h"
22 #include "llvm/CodeGen/MachineFrameInfo.h"
23 #include "llvm/IR/DiagnosticInfo.h"
24 #include "llvm/IR/IntrinsicsAMDGPU.h"
25 #include "llvm/Support/CommandLine.h"
26 #include "llvm/Support/KnownBits.h"
27 #include "llvm/Target/TargetMachine.h"
28 
29 using namespace llvm;
30 
31 #include "AMDGPUGenCallingConv.inc"
32 
33 static cl::opt<bool> AMDGPUBypassSlowDiv(
34   "amdgpu-bypass-slow-div",
35   cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
36   cl::init(true));
37 
38 // Find a larger type to do a load / store of a vector with.
39 EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
40   unsigned StoreSize = VT.getStoreSizeInBits();
41   if (StoreSize <= 32)
42     return EVT::getIntegerVT(Ctx, StoreSize);
43 
44   assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
45   return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
46 }
47 
48 unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
49   return DAG.computeKnownBits(Op).countMaxActiveBits();
50 }
51 
52 unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
53   // In order for this to be a signed 24-bit value, bit 23, must
54   // be a sign bit.
55   return DAG.ComputeMaxSignificantBits(Op);
56 }
57 
58 AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
59                                            const AMDGPUSubtarget &STI)
60     : TargetLowering(TM), Subtarget(&STI) {
61   // Lower floating point store/load to integer store/load to reduce the number
62   // of patterns in tablegen.
63   setOperationAction(ISD::LOAD, MVT::f32, Promote);
64   AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
65 
66   setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
67   AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
68 
69   setOperationAction(ISD::LOAD, MVT::v3f32, Promote);
70   AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);
71 
72   setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
73   AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
74 
75   setOperationAction(ISD::LOAD, MVT::v5f32, Promote);
76   AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);
77 
78   setOperationAction(ISD::LOAD, MVT::v6f32, Promote);
79   AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32);
80 
81   setOperationAction(ISD::LOAD, MVT::v7f32, Promote);
82   AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32);
83 
84   setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
85   AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
86 
87   setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
88   AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
89 
90   setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
91   AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);
92 
93   setOperationAction(ISD::LOAD, MVT::i64, Promote);
94   AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
95 
96   setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
97   AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
98 
99   setOperationAction(ISD::LOAD, MVT::f64, Promote);
100   AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
101 
102   setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
103   AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
104 
105   setOperationAction(ISD::LOAD, MVT::v3i64, Promote);
106   AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32);
107 
108   setOperationAction(ISD::LOAD, MVT::v4i64, Promote);
109   AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);
110 
111   setOperationAction(ISD::LOAD, MVT::v3f64, Promote);
112   AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32);
113 
114   setOperationAction(ISD::LOAD, MVT::v4f64, Promote);
115   AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);
116 
117   setOperationAction(ISD::LOAD, MVT::v8i64, Promote);
118   AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32);
119 
120   setOperationAction(ISD::LOAD, MVT::v8f64, Promote);
121   AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32);
122 
123   setOperationAction(ISD::LOAD, MVT::v16i64, Promote);
124   AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32);
125 
126   setOperationAction(ISD::LOAD, MVT::v16f64, Promote);
127   AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32);
128 
129   // There are no 64-bit extloads. These should be done as a 32-bit extload and
130   // an extension to 64-bit.
131   for (MVT VT : MVT::integer_valuetypes()) {
132     setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand);
133     setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand);
134     setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand);
135   }
136 
137   for (MVT VT : MVT::integer_valuetypes()) {
138     if (VT == MVT::i64)
139       continue;
140 
141     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
142     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal);
143     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal);
144     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand);
145 
146     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
147     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal);
148     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal);
149     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand);
150 
151     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
152     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal);
153     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal);
154     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand);
155   }
156 
157   for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) {
158     setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand);
159     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand);
160     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand);
161     setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Expand);
162     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Expand);
163     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i8, Expand);
164     setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand);
165     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand);
166     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand);
167     setLoadExtAction(ISD::EXTLOAD, VT, MVT::v3i16, Expand);
168     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v3i16, Expand);
169     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v3i16, Expand);
170     setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand);
171     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand);
172     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand);
173   }
174 
175   setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
176   setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
177   setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
178   setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
179   setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
180   setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
181   setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
182 
183   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
184   setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
185   setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand);
186   setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
187   setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
188   setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);
189 
190   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
191   setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
192   setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
193   setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
194   setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
195   setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
196 
197   setOperationAction(ISD::STORE, MVT::f32, Promote);
198   AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
199 
200   setOperationAction(ISD::STORE, MVT::v2f32, Promote);
201   AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
202 
203   setOperationAction(ISD::STORE, MVT::v3f32, Promote);
204   AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);
205 
206   setOperationAction(ISD::STORE, MVT::v4f32, Promote);
207   AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
208 
209   setOperationAction(ISD::STORE, MVT::v5f32, Promote);
210   AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);
211 
212   setOperationAction(ISD::STORE, MVT::v6f32, Promote);
213   AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32);
214 
215   setOperationAction(ISD::STORE, MVT::v7f32, Promote);
216   AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32);
217 
218   setOperationAction(ISD::STORE, MVT::v8f32, Promote);
219   AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
220 
221   setOperationAction(ISD::STORE, MVT::v16f32, Promote);
222   AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
223 
224   setOperationAction(ISD::STORE, MVT::v32f32, Promote);
225   AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);
226 
227   setOperationAction(ISD::STORE, MVT::i64, Promote);
228   AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
229 
230   setOperationAction(ISD::STORE, MVT::v2i64, Promote);
231   AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
232 
233   setOperationAction(ISD::STORE, MVT::f64, Promote);
234   AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
235 
236   setOperationAction(ISD::STORE, MVT::v2f64, Promote);
237   AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
238 
239   setOperationAction(ISD::STORE, MVT::v3i64, Promote);
240   AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32);
241 
242   setOperationAction(ISD::STORE, MVT::v3f64, Promote);
243   AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32);
244 
245   setOperationAction(ISD::STORE, MVT::v4i64, Promote);
246   AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);
247 
248   setOperationAction(ISD::STORE, MVT::v4f64, Promote);
249   AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32);
250 
251   setOperationAction(ISD::STORE, MVT::v8i64, Promote);
252   AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32);
253 
254   setOperationAction(ISD::STORE, MVT::v8f64, Promote);
255   AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32);
256 
257   setOperationAction(ISD::STORE, MVT::v16i64, Promote);
258   AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32);
259 
260   setOperationAction(ISD::STORE, MVT::v16f64, Promote);
261   AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32);
262 
263   setTruncStoreAction(MVT::i64, MVT::i1, Expand);
264   setTruncStoreAction(MVT::i64, MVT::i8, Expand);
265   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
266   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
267 
268   setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
269   setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
270   setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
271   setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
272 
273   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
274   setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
275   setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
276   setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
277   setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
278   setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
279   setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);
280 
281   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
282   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
283 
284   setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
285   setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
286 
287   setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
288   setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
289   setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
290   setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);
291 
292   setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
293   setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
294   setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
295   setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
296 
297   setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
298   setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
299 
300   setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
301   setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
302   setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
303   setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
304   setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
305   setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
306   setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);
307 
308   setOperationAction(ISD::Constant, MVT::i32, Legal);
309   setOperationAction(ISD::Constant, MVT::i64, Legal);
310   setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
311   setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
312 
313   setOperationAction(ISD::BR_JT, MVT::Other, Expand);
314   setOperationAction(ISD::BRIND, MVT::Other, Expand);
315 
316   // This is totally unsupported, just custom lower to produce an error.
317   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
318 
319   // Library functions.  These default to Expand, but we have instructions
320   // for them.
321   setOperationAction(ISD::FCEIL,  MVT::f32, Legal);
322   setOperationAction(ISD::FEXP2,  MVT::f32, Legal);
323   setOperationAction(ISD::FPOW,   MVT::f32, Legal);
324   setOperationAction(ISD::FLOG2,  MVT::f32, Legal);
325   setOperationAction(ISD::FABS,   MVT::f32, Legal);
326   setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
327   setOperationAction(ISD::FRINT,  MVT::f32, Legal);
328   setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
329   setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
330   setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
331 
332   setOperationAction(ISD::FROUND, MVT::f32, Custom);
333   setOperationAction(ISD::FROUND, MVT::f64, Custom);
334 
335   setOperationAction(ISD::FLOG, MVT::f32, Custom);
336   setOperationAction(ISD::FLOG10, MVT::f32, Custom);
337   setOperationAction(ISD::FEXP, MVT::f32, Custom);
338 
339 
340   setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
341   setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);
342 
343   setOperationAction(ISD::FREM, MVT::f16, Custom);
344   setOperationAction(ISD::FREM, MVT::f32, Custom);
345   setOperationAction(ISD::FREM, MVT::f64, Custom);
346 
347   // Expand to fneg + fadd.
348   setOperationAction(ISD::FSUB, MVT::f64, Expand);
349 
350   setOperationAction(ISD::CONCAT_VECTORS, MVT::v3i32, Custom);
351   setOperationAction(ISD::CONCAT_VECTORS, MVT::v3f32, Custom);
352   setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
353   setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom);
354   setOperationAction(ISD::CONCAT_VECTORS, MVT::v5i32, Custom);
355   setOperationAction(ISD::CONCAT_VECTORS, MVT::v5f32, Custom);
356   setOperationAction(ISD::CONCAT_VECTORS, MVT::v6i32, Custom);
357   setOperationAction(ISD::CONCAT_VECTORS, MVT::v6f32, Custom);
358   setOperationAction(ISD::CONCAT_VECTORS, MVT::v7i32, Custom);
359   setOperationAction(ISD::CONCAT_VECTORS, MVT::v7f32, Custom);
360   setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
361   setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
362   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f16, Custom);
363   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i16, Custom);
364   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f16, Custom);
365   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i16, Custom);
366   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom);
367   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);
368   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3f32, Custom);
369   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3i32, Custom);
370   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom);
371   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom);
372   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5f32, Custom);
373   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v5i32, Custom);
374   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v6f32, Custom);
375   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v6i32, Custom);
376   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v7f32, Custom);
377   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v7i32, Custom);
378   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
379   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);
380   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f32, Custom);
381   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16i32, Custom);
382   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32f32, Custom);
383   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32i32, Custom);
384   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f64, Custom);
385   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i64, Custom);
386   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3f64, Custom);
387   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v3i64, Custom);
388   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f64, Custom);
389   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i64, Custom);
390   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f64, Custom);
391   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i64, Custom);
392   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f64, Custom);
393   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16i64, Custom);
394 
395   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
396   setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom);
397   setOperationAction(ISD::FP_TO_FP16, MVT::f32, Custom);
398 
399   const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
400   for (MVT VT : ScalarIntVTs) {
401     // These should use [SU]DIVREM, so set them to expand
402     setOperationAction(ISD::SDIV, VT, Expand);
403     setOperationAction(ISD::UDIV, VT, Expand);
404     setOperationAction(ISD::SREM, VT, Expand);
405     setOperationAction(ISD::UREM, VT, Expand);
406 
407     // GPU does not have divrem function for signed or unsigned.
408     setOperationAction(ISD::SDIVREM, VT, Custom);
409     setOperationAction(ISD::UDIVREM, VT, Custom);
410 
411     // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
412     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
413     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
414 
415     setOperationAction(ISD::BSWAP, VT, Expand);
416     setOperationAction(ISD::CTTZ, VT, Expand);
417     setOperationAction(ISD::CTLZ, VT, Expand);
418 
419     // AMDGPU uses ADDC/SUBC/ADDE/SUBE
420     setOperationAction(ISD::ADDC, VT, Legal);
421     setOperationAction(ISD::SUBC, VT, Legal);
422     setOperationAction(ISD::ADDE, VT, Legal);
423     setOperationAction(ISD::SUBE, VT, Legal);
424   }
425 
426   // The hardware supports 32-bit FSHR, but not FSHL.
427   setOperationAction(ISD::FSHR, MVT::i32, Legal);
428 
429   // The hardware supports 32-bit ROTR, but not ROTL.
430   setOperationAction(ISD::ROTL, MVT::i32, Expand);
431   setOperationAction(ISD::ROTL, MVT::i64, Expand);
432   setOperationAction(ISD::ROTR, MVT::i64, Expand);
433 
434   setOperationAction(ISD::MULHU, MVT::i16, Expand);
435   setOperationAction(ISD::MULHS, MVT::i16, Expand);
436 
437   setOperationAction(ISD::MUL, MVT::i64, Expand);
438   setOperationAction(ISD::MULHU, MVT::i64, Expand);
439   setOperationAction(ISD::MULHS, MVT::i64, Expand);
440   setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
441   setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
442   setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
443   setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
444   setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
445 
446   setOperationAction(ISD::SMIN, MVT::i32, Legal);
447   setOperationAction(ISD::UMIN, MVT::i32, Legal);
448   setOperationAction(ISD::SMAX, MVT::i32, Legal);
449   setOperationAction(ISD::UMAX, MVT::i32, Legal);
450 
451   setOperationAction(ISD::CTTZ, MVT::i64, Custom);
452   setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Custom);
453   setOperationAction(ISD::CTLZ, MVT::i64, Custom);
454   setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
455 
456   static const MVT::SimpleValueType VectorIntTypes[] = {
457       MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32};
458 
459   for (MVT VT : VectorIntTypes) {
460     // Expand the following operations for the current type by default.
461     setOperationAction(ISD::ADD,  VT, Expand);
462     setOperationAction(ISD::AND,  VT, Expand);
463     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
464     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
465     setOperationAction(ISD::MUL,  VT, Expand);
466     setOperationAction(ISD::MULHU, VT, Expand);
467     setOperationAction(ISD::MULHS, VT, Expand);
468     setOperationAction(ISD::OR,   VT, Expand);
469     setOperationAction(ISD::SHL,  VT, Expand);
470     setOperationAction(ISD::SRA,  VT, Expand);
471     setOperationAction(ISD::SRL,  VT, Expand);
472     setOperationAction(ISD::ROTL, VT, Expand);
473     setOperationAction(ISD::ROTR, VT, Expand);
474     setOperationAction(ISD::SUB,  VT, Expand);
475     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
476     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
477     setOperationAction(ISD::SDIV, VT, Expand);
478     setOperationAction(ISD::UDIV, VT, Expand);
479     setOperationAction(ISD::SREM, VT, Expand);
480     setOperationAction(ISD::UREM, VT, Expand);
481     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
482     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
483     setOperationAction(ISD::SDIVREM, VT, Expand);
484     setOperationAction(ISD::UDIVREM, VT, Expand);
485     setOperationAction(ISD::SELECT, VT, Expand);
486     setOperationAction(ISD::VSELECT, VT, Expand);
487     setOperationAction(ISD::SELECT_CC, VT, Expand);
488     setOperationAction(ISD::XOR,  VT, Expand);
489     setOperationAction(ISD::BSWAP, VT, Expand);
490     setOperationAction(ISD::CTPOP, VT, Expand);
491     setOperationAction(ISD::CTTZ, VT, Expand);
492     setOperationAction(ISD::CTLZ, VT, Expand);
493     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
494     setOperationAction(ISD::SETCC, VT, Expand);
495   }
496 
497   static const MVT::SimpleValueType FloatVectorTypes[] = {
498       MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32};
499 
500   for (MVT VT : FloatVectorTypes) {
501     setOperationAction(ISD::FABS, VT, Expand);
502     setOperationAction(ISD::FMINNUM, VT, Expand);
503     setOperationAction(ISD::FMAXNUM, VT, Expand);
504     setOperationAction(ISD::FADD, VT, Expand);
505     setOperationAction(ISD::FCEIL, VT, Expand);
506     setOperationAction(ISD::FCOS, VT, Expand);
507     setOperationAction(ISD::FDIV, VT, Expand);
508     setOperationAction(ISD::FEXP2, VT, Expand);
509     setOperationAction(ISD::FEXP, VT, Expand);
510     setOperationAction(ISD::FLOG2, VT, Expand);
511     setOperationAction(ISD::FREM, VT, Expand);
512     setOperationAction(ISD::FLOG, VT, Expand);
513     setOperationAction(ISD::FLOG10, VT, Expand);
514     setOperationAction(ISD::FPOW, VT, Expand);
515     setOperationAction(ISD::FFLOOR, VT, Expand);
516     setOperationAction(ISD::FTRUNC, VT, Expand);
517     setOperationAction(ISD::FMUL, VT, Expand);
518     setOperationAction(ISD::FMA, VT, Expand);
519     setOperationAction(ISD::FRINT, VT, Expand);
520     setOperationAction(ISD::FNEARBYINT, VT, Expand);
521     setOperationAction(ISD::FSQRT, VT, Expand);
522     setOperationAction(ISD::FSIN, VT, Expand);
523     setOperationAction(ISD::FSUB, VT, Expand);
524     setOperationAction(ISD::FNEG, VT, Expand);
525     setOperationAction(ISD::VSELECT, VT, Expand);
526     setOperationAction(ISD::SELECT_CC, VT, Expand);
527     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
528     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
529     setOperationAction(ISD::SETCC, VT, Expand);
530     setOperationAction(ISD::FCANONICALIZE, VT, Expand);
531   }
532 
533   // This causes using an unrolled select operation rather than expansion with
534   // bit operations. This is in general better, but the alternative using BFI
535   // instructions may be better if the select sources are SGPRs.
536   setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
537   AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
538 
539   setOperationAction(ISD::SELECT, MVT::v3f32, Promote);
540   AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);
541 
542   setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
543   AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
544 
545   setOperationAction(ISD::SELECT, MVT::v5f32, Promote);
546   AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);
547 
548   setOperationAction(ISD::SELECT, MVT::v6f32, Promote);
549   AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32);
550 
551   setOperationAction(ISD::SELECT, MVT::v7f32, Promote);
552   AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);
553 
554   // There are no libcalls of any kind.
555   for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
556     setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
557 
558   setSchedulingPreference(Sched::RegPressure);
559   setJumpIsExpensive(true);
560 
561   // FIXME: This is only partially true. If we have to do vector compares, any
562   // SGPR pair can be a condition register. If we have a uniform condition, we
563   // are better off doing SALU operations, where there is only one SCC. For now,
564   // we don't have a way of knowing during instruction selection if a condition
565   // will be uniform and we always use vector compares. Assume we are using
566   // vector compares until that is fixed.
567   setHasMultipleConditionRegisters(true);
568 
569   setMinCmpXchgSizeInBits(32);
570   setSupportsUnalignedAtomics(false);
571 
572   PredictableSelectIsExpensive = false;
573 
574   // We want to find all load dependencies for long chains of stores to enable
575   // merging into very wide vectors. The problem is with vectors with > 4
576   // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
577   // vectors are a legal type, even though we have to split the loads
578   // usually. When we can more precisely specify load legality per address
579   // space, we should be able to make FindBetterChain/MergeConsecutiveStores
580   // smarter so that they can figure out what to do in 2 iterations without all
581   // N > 4 stores on the same chain.
582   GatherAllAliasesMaxDepth = 16;
583 
584   // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
585   // about these during lowering.
586   MaxStoresPerMemcpy  = 0xffffffff;
587   MaxStoresPerMemmove = 0xffffffff;
588   MaxStoresPerMemset  = 0xffffffff;
589 
590   // The expansion for 64-bit division is enormous.
591   if (AMDGPUBypassSlowDiv)
592     addBypassSlowDiv(64, 32);
593 
594   setTargetDAGCombine(ISD::BITCAST);
595   setTargetDAGCombine(ISD::SHL);
596   setTargetDAGCombine(ISD::SRA);
597   setTargetDAGCombine(ISD::SRL);
598   setTargetDAGCombine(ISD::TRUNCATE);
599   setTargetDAGCombine(ISD::MUL);
600   setTargetDAGCombine(ISD::SMUL_LOHI);
601   setTargetDAGCombine(ISD::UMUL_LOHI);
602   setTargetDAGCombine(ISD::MULHU);
603   setTargetDAGCombine(ISD::MULHS);
604   setTargetDAGCombine(ISD::SELECT);
605   setTargetDAGCombine(ISD::SELECT_CC);
606   setTargetDAGCombine(ISD::STORE);
607   setTargetDAGCombine(ISD::FADD);
608   setTargetDAGCombine(ISD::FSUB);
609   setTargetDAGCombine(ISD::FNEG);
610   setTargetDAGCombine(ISD::FABS);
611   setTargetDAGCombine(ISD::AssertZext);
612   setTargetDAGCombine(ISD::AssertSext);
613   setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
614 }
615 
616 bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {
617   if (getTargetMachine().Options.NoSignedZerosFPMath)
618     return true;
619 
620   const auto Flags = Op.getNode()->getFlags();
621   if (Flags.hasNoSignedZeros())
622     return true;
623 
624   return false;
625 }
626 
627 //===----------------------------------------------------------------------===//
628 // Target Information
629 //===----------------------------------------------------------------------===//
630 
631 LLVM_READNONE
632 static bool fnegFoldsIntoOp(unsigned Opc) {
633   switch (Opc) {
634   case ISD::FADD:
635   case ISD::FSUB:
636   case ISD::FMUL:
637   case ISD::FMA:
638   case ISD::FMAD:
639   case ISD::FMINNUM:
640   case ISD::FMAXNUM:
641   case ISD::FMINNUM_IEEE:
642   case ISD::FMAXNUM_IEEE:
643   case ISD::FSIN:
644   case ISD::FTRUNC:
645   case ISD::FRINT:
646   case ISD::FNEARBYINT:
647   case ISD::FCANONICALIZE:
648   case AMDGPUISD::RCP:
649   case AMDGPUISD::RCP_LEGACY:
650   case AMDGPUISD::RCP_IFLAG:
651   case AMDGPUISD::SIN_HW:
652   case AMDGPUISD::FMUL_LEGACY:
653   case AMDGPUISD::FMIN_LEGACY:
654   case AMDGPUISD::FMAX_LEGACY:
655   case AMDGPUISD::FMED3:
656     // TODO: handle llvm.amdgcn.fma.legacy
657     return true;
658   default:
659     return false;
660   }
661 }
662 
663 /// \p returns true if the operation will definitely need to use a 64-bit
664 /// encoding, and thus will use a VOP3 encoding regardless of the source
665 /// modifiers.
666 LLVM_READONLY
667 static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
668   return N->getNumOperands() > 2 || VT == MVT::f64;
669 }
670 
671 // Most FP instructions support source modifiers, but this could be refined
672 // slightly.
673 LLVM_READONLY
674 static bool hasSourceMods(const SDNode *N) {
675   if (isa<MemSDNode>(N))
676     return false;
677 
678   switch (N->getOpcode()) {
679   case ISD::CopyToReg:
680   case ISD::SELECT:
681   case ISD::FDIV:
682   case ISD::FREM:
683   case ISD::INLINEASM:
684   case ISD::INLINEASM_BR:
685   case AMDGPUISD::DIV_SCALE:
686   case ISD::INTRINSIC_W_CHAIN:
687 
688   // TODO: Should really be looking at the users of the bitcast. These are
689   // problematic because bitcasts are used to legalize all stores to integer
690   // types.
691   case ISD::BITCAST:
692     return false;
693   case ISD::INTRINSIC_WO_CHAIN: {
694     switch (cast<ConstantSDNode>(N->getOperand(0))->getZExtValue()) {
695     case Intrinsic::amdgcn_interp_p1:
696     case Intrinsic::amdgcn_interp_p2:
697     case Intrinsic::amdgcn_interp_mov:
698     case Intrinsic::amdgcn_interp_p1_f16:
699     case Intrinsic::amdgcn_interp_p2_f16:
700       return false;
701     default:
702       return true;
703     }
704   }
705   default:
706     return true;
707   }
708 }
709 
710 bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
711                                                  unsigned CostThreshold) {
712   // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
713   // it is truly free to use a source modifier in all cases. If there are
714   // multiple users but for each one will necessitate using VOP3, there will be
715   // a code size increase. Try to avoid increasing code size unless we know it
716   // will save on the instruction count.
717   unsigned NumMayIncreaseSize = 0;
718   MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
719 
720   // XXX - Should this limit number of uses to check?
721   for (const SDNode *U : N->uses()) {
722     if (!hasSourceMods(U))
723       return false;
724 
725     if (!opMustUseVOP3Encoding(U, VT)) {
726       if (++NumMayIncreaseSize > CostThreshold)
727         return false;
728     }
729   }
730 
731   return true;
732 }
733 
734 EVT AMDGPUTargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
735                                               ISD::NodeType ExtendKind) const {
736   assert(!VT.isVector() && "only scalar expected");
737 
738   // Round to the next multiple of 32-bits.
739   unsigned Size = VT.getSizeInBits();
740   if (Size <= 32)
741     return MVT::i32;
742   return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
743 }
744 
745 MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
746   return MVT::i32;
747 }
748 
749 bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
750   return true;
751 }
752 
753 // The backend supports 32 and 64 bit floating point immediates.
754 // FIXME: Why are we reporting vectors of FP immediates as legal?
755 bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
756                                         bool ForCodeSize) const {
757   EVT ScalarVT = VT.getScalarType();
758   return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
759          (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
760 }
761 
762 // We don't want to shrink f64 / f32 constants.
763 bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
764   EVT ScalarVT = VT.getScalarType();
765   return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
766 }
767 
768 bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
769                                                  ISD::LoadExtType ExtTy,
770                                                  EVT NewVT) const {
771   // TODO: This may be worth removing. Check regression tests for diffs.
772   if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT))
773     return false;
774 
775   unsigned NewSize = NewVT.getStoreSizeInBits();
776 
777   // If we are reducing to a 32-bit load or a smaller multi-dword load,
778   // this is always better.
779   if (NewSize >= 32)
780     return true;
781 
782   EVT OldVT = N->getValueType(0);
783   unsigned OldSize = OldVT.getStoreSizeInBits();
784 
785   MemSDNode *MN = cast<MemSDNode>(N);
786   unsigned AS = MN->getAddressSpace();
787   // Do not shrink an aligned scalar load to sub-dword.
788   // Scalar engine cannot do sub-dword loads.
789   if (OldSize >= 32 && NewSize < 32 && MN->getAlignment() >= 4 &&
790       (AS == AMDGPUAS::CONSTANT_ADDRESS ||
791        AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
792        (isa<LoadSDNode>(N) &&
793         AS == AMDGPUAS::GLOBAL_ADDRESS && MN->isInvariant())) &&
794       AMDGPUInstrInfo::isUniformMMO(MN->getMemOperand()))
795     return false;
796 
797   // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
798   // extloads, so doing one requires using a buffer_load. In cases where we
799   // still couldn't use a scalar load, using the wider load shouldn't really
800   // hurt anything.
801 
802   // If the old size already had to be an extload, there's no harm in continuing
803   // to reduce the width.
804   return (OldSize < 32);
805 }
806 
807 bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy,
808                                                    const SelectionDAG &DAG,
809                                                    const MachineMemOperand &MMO) const {
810 
811   assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
812 
813   if (LoadTy.getScalarType() == MVT::i32)
814     return false;
815 
816   unsigned LScalarSize = LoadTy.getScalarSizeInBits();
817   unsigned CastScalarSize = CastTy.getScalarSizeInBits();
818 
819   if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
820     return false;
821 
822   bool Fast = false;
823   return allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
824                                         CastTy, MMO, &Fast) &&
825          Fast;
826 }
827 
828 // SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
829 // profitable with the expansion for 64-bit since it's generally good to
830 // speculate things.
831 // FIXME: These should really have the size as a parameter.
832 bool AMDGPUTargetLowering::isCheapToSpeculateCttz() const {
833   return true;
834 }
835 
836 bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const {
837   return true;
838 }
839 
840 bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode *N) const {
841   switch (N->getOpcode()) {
842   case ISD::EntryToken:
843   case ISD::TokenFactor:
844     return true;
845   case ISD::INTRINSIC_WO_CHAIN: {
846     unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
847     switch (IntrID) {
848     case Intrinsic::amdgcn_readfirstlane:
849     case Intrinsic::amdgcn_readlane:
850       return true;
851     }
852     return false;
853   }
854   case ISD::LOAD:
855     if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
856         AMDGPUAS::CONSTANT_ADDRESS_32BIT)
857       return true;
858     return false;
859   }
860   return false;
861 }
862 
863 SDValue AMDGPUTargetLowering::getNegatedExpression(
864     SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
865     NegatibleCost &Cost, unsigned Depth) const {
866 
867   switch (Op.getOpcode()) {
868   case ISD::FMA:
869   case ISD::FMAD: {
870     // Negating a fma is not free if it has users without source mods.
871     if (!allUsesHaveSourceMods(Op.getNode()))
872       return SDValue();
873     break;
874   }
875   default:
876     break;
877   }
878 
879   return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
880                                               ForCodeSize, Cost, Depth);
881 }
882 
883 //===---------------------------------------------------------------------===//
884 // Target Properties
885 //===---------------------------------------------------------------------===//
886 
887 bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
888   assert(VT.isFloatingPoint());
889 
890   // Packed operations do not have a fabs modifier.
891   return VT == MVT::f32 || VT == MVT::f64 ||
892          (Subtarget->has16BitInsts() && VT == MVT::f16);
893 }
894 
895 bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
896   assert(VT.isFloatingPoint());
897   // Report this based on the end legalized type.
898   VT = VT.getScalarType();
899   return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16;
900 }
901 
902 bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT,
903                                                          unsigned NumElem,
904                                                          unsigned AS) const {
905   return true;
906 }
907 
908 bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
909   // There are few operations which truly have vector input operands. Any vector
910   // operation is going to involve operations on each component, and a
911   // build_vector will be a copy per element, so it always makes sense to use a
912   // build_vector input in place of the extracted element to avoid a copy into a
913   // super register.
914   //
915   // We should probably only do this if all users are extracts only, but this
916   // should be the common case.
917   return true;
918 }
919 
920 bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
921   // Truncate is just accessing a subregister.
922 
923   unsigned SrcSize = Source.getSizeInBits();
924   unsigned DestSize = Dest.getSizeInBits();
925 
926   return DestSize < SrcSize && DestSize % 32 == 0 ;
927 }
928 
929 bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
930   // Truncate is just accessing a subregister.
931 
932   unsigned SrcSize = Source->getScalarSizeInBits();
933   unsigned DestSize = Dest->getScalarSizeInBits();
934 
935   if (DestSize== 16 && Subtarget->has16BitInsts())
936     return SrcSize >= 32;
937 
938   return DestSize < SrcSize && DestSize % 32 == 0;
939 }
940 
941 bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
942   unsigned SrcSize = Src->getScalarSizeInBits();
943   unsigned DestSize = Dest->getScalarSizeInBits();
944 
945   if (SrcSize == 16 && Subtarget->has16BitInsts())
946     return DestSize >= 32;
947 
948   return SrcSize == 32 && DestSize == 64;
949 }
950 
951 bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
952   // Any register load of a 64-bit value really requires 2 32-bit moves. For all
953   // practical purposes, the extra mov 0 to load a 64-bit is free.  As used,
954   // this will enable reducing 64-bit operations the 32-bit, which is always
955   // good.
956 
957   if (Src == MVT::i16)
958     return Dest == MVT::i32 ||Dest == MVT::i64 ;
959 
960   return Src == MVT::i32 && Dest == MVT::i64;
961 }
962 
963 bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
964   return isZExtFree(Val.getValueType(), VT2);
965 }
966 
967 bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
968   // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
969   // limited number of native 64-bit operations. Shrinking an operation to fit
970   // in a single 32-bit register should always be helpful. As currently used,
971   // this is much less general than the name suggests, and is only used in
972   // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
973   // not profitable, and may actually be harmful.
974   return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
975 }
976 
977 //===---------------------------------------------------------------------===//
978 // TargetLowering Callbacks
979 //===---------------------------------------------------------------------===//
980 
981 CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
982                                                   bool IsVarArg) {
983   switch (CC) {
984   case CallingConv::AMDGPU_VS:
985   case CallingConv::AMDGPU_GS:
986   case CallingConv::AMDGPU_PS:
987   case CallingConv::AMDGPU_CS:
988   case CallingConv::AMDGPU_HS:
989   case CallingConv::AMDGPU_ES:
990   case CallingConv::AMDGPU_LS:
991     return CC_AMDGPU;
992   case CallingConv::C:
993   case CallingConv::Fast:
994   case CallingConv::Cold:
995     return CC_AMDGPU_Func;
996   case CallingConv::AMDGPU_Gfx:
997     return CC_SI_Gfx;
998   case CallingConv::AMDGPU_KERNEL:
999   case CallingConv::SPIR_KERNEL:
1000   default:
1001     report_fatal_error("Unsupported calling convention for call");
1002   }
1003 }
1004 
1005 CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
1006                                                     bool IsVarArg) {
1007   switch (CC) {
1008   case CallingConv::AMDGPU_KERNEL:
1009   case CallingConv::SPIR_KERNEL:
1010     llvm_unreachable("kernels should not be handled here");
1011   case CallingConv::AMDGPU_VS:
1012   case CallingConv::AMDGPU_GS:
1013   case CallingConv::AMDGPU_PS:
1014   case CallingConv::AMDGPU_CS:
1015   case CallingConv::AMDGPU_HS:
1016   case CallingConv::AMDGPU_ES:
1017   case CallingConv::AMDGPU_LS:
1018     return RetCC_SI_Shader;
1019   case CallingConv::AMDGPU_Gfx:
1020     return RetCC_SI_Gfx;
1021   case CallingConv::C:
1022   case CallingConv::Fast:
1023   case CallingConv::Cold:
1024     return RetCC_AMDGPU_Func;
1025   default:
1026     report_fatal_error("Unsupported calling convention.");
1027   }
1028 }
1029 
1030 /// The SelectionDAGBuilder will automatically promote function arguments
1031 /// with illegal types.  However, this does not work for the AMDGPU targets
1032 /// since the function arguments are stored in memory as these illegal types.
1033 /// In order to handle this properly we need to get the original types sizes
1034 /// from the LLVM IR Function and fixup the ISD:InputArg values before
1035 /// passing them to AnalyzeFormalArguments()
1036 
1037 /// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
1038 /// input values across multiple registers.  Each item in the Ins array
1039 /// represents a single value that will be stored in registers.  Ins[x].VT is
1040 /// the value type of the value that will be stored in the register, so
1041 /// whatever SDNode we lower the argument to needs to be this type.
1042 ///
1043 /// In order to correctly lower the arguments we need to know the size of each
1044 /// argument.  Since Ins[x].VT gives us the size of the register that will
1045 /// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
1046 /// for the original function argument so that we can deduce the correct memory
1047 /// type to use for Ins[x].  In most cases the correct memory type will be
1048 /// Ins[x].ArgVT.  However, this will not always be the case.  If, for example,
1049 /// we have a kernel argument of type v8i8, this argument will be split into
1050 /// 8 parts and each part will be represented by its own item in the Ins array.
1051 /// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
1052 /// the argument before it was split.  From this, we deduce that the memory type
1053 /// for each individual part is i8.  We pass the memory type as LocVT to the
1054 /// calling convention analysis function and the register type (Ins[x].VT) as
1055 /// the ValVT.
1056 void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
1057   CCState &State,
1058   const SmallVectorImpl<ISD::InputArg> &Ins) const {
1059   const MachineFunction &MF = State.getMachineFunction();
1060   const Function &Fn = MF.getFunction();
1061   LLVMContext &Ctx = Fn.getParent()->getContext();
1062   const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
1063   const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset(Fn);
1064   CallingConv::ID CC = Fn.getCallingConv();
1065 
1066   Align MaxAlign = Align(1);
1067   uint64_t ExplicitArgOffset = 0;
1068   const DataLayout &DL = Fn.getParent()->getDataLayout();
1069 
1070   unsigned InIndex = 0;
1071 
1072   for (const Argument &Arg : Fn.args()) {
1073     const bool IsByRef = Arg.hasByRefAttr();
1074     Type *BaseArgTy = Arg.getType();
1075     Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
1076     MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
1077     if (!Alignment)
1078       Alignment = DL.getABITypeAlign(MemArgTy);
1079     MaxAlign = max(Alignment, MaxAlign);
1080     uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);
1081 
1082     uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
1083     ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;
1084 
1085     // We're basically throwing away everything passed into us and starting over
1086     // to get accurate in-memory offsets. The "PartOffset" is completely useless
1087     // to us as computed in Ins.
1088     //
1089     // We also need to figure out what type legalization is trying to do to get
1090     // the correct memory offsets.
1091 
1092     SmallVector<EVT, 16> ValueVTs;
1093     SmallVector<uint64_t, 16> Offsets;
1094     ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);
1095 
1096     for (unsigned Value = 0, NumValues = ValueVTs.size();
1097          Value != NumValues; ++Value) {
1098       uint64_t BasePartOffset = Offsets[Value];
1099 
1100       EVT ArgVT = ValueVTs[Value];
1101       EVT MemVT = ArgVT;
1102       MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
1103       unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);
1104 
1105       if (NumRegs == 1) {
1106         // This argument is not split, so the IR type is the memory type.
1107         if (ArgVT.isExtended()) {
1108           // We have an extended type, like i24, so we should just use the
1109           // register type.
1110           MemVT = RegisterVT;
1111         } else {
1112           MemVT = ArgVT;
1113         }
1114       } else if (ArgVT.isVector() && RegisterVT.isVector() &&
1115                  ArgVT.getScalarType() == RegisterVT.getScalarType()) {
1116         assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
1117         // We have a vector value which has been split into a vector with
1118         // the same scalar type, but fewer elements.  This should handle
1119         // all the floating-point vector types.
1120         MemVT = RegisterVT;
1121       } else if (ArgVT.isVector() &&
1122                  ArgVT.getVectorNumElements() == NumRegs) {
1123         // This arg has been split so that each element is stored in a separate
1124         // register.
1125         MemVT = ArgVT.getScalarType();
1126       } else if (ArgVT.isExtended()) {
1127         // We have an extended type, like i65.
1128         MemVT = RegisterVT;
1129       } else {
1130         unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
1131         assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
1132         if (RegisterVT.isInteger()) {
1133           MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
1134         } else if (RegisterVT.isVector()) {
1135           assert(!RegisterVT.getScalarType().isFloatingPoint());
1136           unsigned NumElements = RegisterVT.getVectorNumElements();
1137           assert(MemoryBits % NumElements == 0);
1138           // This vector type has been split into another vector type with
1139           // a different elements size.
1140           EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
1141                                            MemoryBits / NumElements);
1142           MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
1143         } else {
1144           llvm_unreachable("cannot deduce memory type.");
1145         }
1146       }
1147 
1148       // Convert one element vectors to scalar.
1149       if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
1150         MemVT = MemVT.getScalarType();
1151 
1152       // Round up vec3/vec5 argument.
1153       if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
1154         assert(MemVT.getVectorNumElements() == 3 ||
1155                MemVT.getVectorNumElements() == 5);
1156         MemVT = MemVT.getPow2VectorType(State.getContext());
1157       } else if (!MemVT.isSimple() && !MemVT.isVector()) {
1158         MemVT = MemVT.getRoundIntegerType(State.getContext());
1159       }
1160 
1161       unsigned PartOffset = 0;
1162       for (unsigned i = 0; i != NumRegs; ++i) {
1163         State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
1164                                                BasePartOffset + PartOffset,
1165                                                MemVT.getSimpleVT(),
1166                                                CCValAssign::Full));
1167         PartOffset += MemVT.getStoreSize();
1168       }
1169     }
1170   }
1171 }
1172 
1173 SDValue AMDGPUTargetLowering::LowerReturn(
1174   SDValue Chain, CallingConv::ID CallConv,
1175   bool isVarArg,
1176   const SmallVectorImpl<ISD::OutputArg> &Outs,
1177   const SmallVectorImpl<SDValue> &OutVals,
1178   const SDLoc &DL, SelectionDAG &DAG) const {
1179   // FIXME: Fails for r600 tests
1180   //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
1181   // "wave terminate should not have return values");
1182   return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
1183 }
1184 
1185 //===---------------------------------------------------------------------===//
1186 // Target specific lowering
1187 //===---------------------------------------------------------------------===//
1188 
1189 /// Selects the correct CCAssignFn for a given CallingConvention value.
1190 CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
1191                                                     bool IsVarArg) {
1192   return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
1193 }
1194 
1195 CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
1196                                                       bool IsVarArg) {
1197   return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
1198 }
1199 
1200 SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain,
1201                                                   SelectionDAG &DAG,
1202                                                   MachineFrameInfo &MFI,
1203                                                   int ClobberedFI) const {
1204   SmallVector<SDValue, 8> ArgChains;
1205   int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
1206   int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
1207 
1208   // Include the original chain at the beginning of the list. When this is
1209   // used by target LowerCall hooks, this helps legalize find the
1210   // CALLSEQ_BEGIN node.
1211   ArgChains.push_back(Chain);
1212 
1213   // Add a chain value for each stack argument corresponding
1214   for (SDNode *U : DAG.getEntryNode().getNode()->uses()) {
1215     if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) {
1216       if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
1217         if (FI->getIndex() < 0) {
1218           int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
1219           int64_t InLastByte = InFirstByte;
1220           InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
1221 
1222           if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
1223               (FirstByte <= InFirstByte && InFirstByte <= LastByte))
1224             ArgChains.push_back(SDValue(L, 1));
1225         }
1226       }
1227     }
1228   }
1229 
1230   // Build a tokenfactor for all the chains.
1231   return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
1232 }
1233 
1234 SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
1235                                                  SmallVectorImpl<SDValue> &InVals,
1236                                                  StringRef Reason) const {
1237   SDValue Callee = CLI.Callee;
1238   SelectionDAG &DAG = CLI.DAG;
1239 
1240   const Function &Fn = DAG.getMachineFunction().getFunction();
1241 
1242   StringRef FuncName("<unknown>");
1243 
1244   if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
1245     FuncName = G->getSymbol();
1246   else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
1247     FuncName = G->getGlobal()->getName();
1248 
1249   DiagnosticInfoUnsupported NoCalls(
1250     Fn, Reason + FuncName, CLI.DL.getDebugLoc());
1251   DAG.getContext()->diagnose(NoCalls);
1252 
1253   if (!CLI.IsTailCall) {
1254     for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
1255       InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
1256   }
1257 
1258   return DAG.getEntryNode();
1259 }
1260 
1261 SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
1262                                         SmallVectorImpl<SDValue> &InVals) const {
1263   return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
1264 }
1265 
1266 SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
1267                                                       SelectionDAG &DAG) const {
1268   const Function &Fn = DAG.getMachineFunction().getFunction();
1269 
1270   DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
1271                                             SDLoc(Op).getDebugLoc());
1272   DAG.getContext()->diagnose(NoDynamicAlloca);
1273   auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
1274   return DAG.getMergeValues(Ops, SDLoc());
1275 }
1276 
1277 SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
1278                                              SelectionDAG &DAG) const {
1279   switch (Op.getOpcode()) {
1280   default:
1281     Op->print(errs(), &DAG);
1282     llvm_unreachable("Custom lowering code for this "
1283                      "instruction is not implemented yet!");
1284     break;
1285   case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
1286   case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
1287   case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
1288   case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
1289   case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
1290   case ISD::FREM: return LowerFREM(Op, DAG);
1291   case ISD::FCEIL: return LowerFCEIL(Op, DAG);
1292   case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
1293   case ISD::FRINT: return LowerFRINT(Op, DAG);
1294   case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
1295   case ISD::FROUND: return LowerFROUND(Op, DAG);
1296   case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
1297   case ISD::FLOG:
1298     return LowerFLOG(Op, DAG, numbers::ln2f);
1299   case ISD::FLOG10:
1300     return LowerFLOG(Op, DAG, numbers::ln2f / numbers::ln10f);
1301   case ISD::FEXP:
1302     return lowerFEXP(Op, DAG);
1303   case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
1304   case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
1305   case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
1306   case ISD::FP_TO_SINT:
1307   case ISD::FP_TO_UINT:
1308     return LowerFP_TO_INT(Op, DAG);
1309   case ISD::CTTZ:
1310   case ISD::CTTZ_ZERO_UNDEF:
1311   case ISD::CTLZ:
1312   case ISD::CTLZ_ZERO_UNDEF:
1313     return LowerCTLZ_CTTZ(Op, DAG);
1314   case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
1315   }
1316   return Op;
1317 }
1318 
1319 void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
1320                                               SmallVectorImpl<SDValue> &Results,
1321                                               SelectionDAG &DAG) const {
1322   switch (N->getOpcode()) {
1323   case ISD::SIGN_EXTEND_INREG:
1324     // Different parts of legalization seem to interpret which type of
1325     // sign_extend_inreg is the one to check for custom lowering. The extended
1326     // from type is what really matters, but some places check for custom
1327     // lowering of the result type. This results in trying to use
1328     // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
1329     // nothing here and let the illegal result integer be handled normally.
1330     return;
1331   default:
1332     return;
1333   }
1334 }
1335 
1336 SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
1337                                                  SDValue Op,
1338                                                  SelectionDAG &DAG) const {
1339 
1340   const DataLayout &DL = DAG.getDataLayout();
1341   GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
1342   const GlobalValue *GV = G->getGlobal();
1343 
1344   if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1345       G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
1346     if (!MFI->isModuleEntryFunction() &&
1347         !GV->getName().equals("llvm.amdgcn.module.lds")) {
1348       SDLoc DL(Op);
1349       const Function &Fn = DAG.getMachineFunction().getFunction();
1350       DiagnosticInfoUnsupported BadLDSDecl(
1351         Fn, "local memory global used by non-kernel function",
1352         DL.getDebugLoc(), DS_Warning);
1353       DAG.getContext()->diagnose(BadLDSDecl);
1354 
1355       // We currently don't have a way to correctly allocate LDS objects that
1356       // aren't directly associated with a kernel. We do force inlining of
1357       // functions that use local objects. However, if these dead functions are
1358       // not eliminated, we don't want a compile time error. Just emit a warning
1359       // and a trap, since there should be no callable path here.
1360       SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
1361       SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
1362                                         Trap, DAG.getRoot());
1363       DAG.setRoot(OutputChain);
1364       return DAG.getUNDEF(Op.getValueType());
1365     }
1366 
1367     // XXX: What does the value of G->getOffset() mean?
1368     assert(G->getOffset() == 0 &&
1369          "Do not know what to do with an non-zero offset");
1370 
1371     // TODO: We could emit code to handle the initialization somewhere.
1372     // We ignore the initializer for now and legalize it to allow selection.
1373     // The initializer will anyway get errored out during assembly emission.
1374     unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
1375     return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
1376   }
1377   return SDValue();
1378 }
1379 
1380 SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
1381                                                   SelectionDAG &DAG) const {
1382   SmallVector<SDValue, 8> Args;
1383 
1384   EVT VT = Op.getValueType();
1385   if (VT == MVT::v4i16 || VT == MVT::v4f16) {
1386     SDLoc SL(Op);
1387     SDValue Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(0));
1388     SDValue Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(1));
1389 
1390     SDValue BV = DAG.getBuildVector(MVT::v2i32, SL, { Lo, Hi });
1391     return DAG.getNode(ISD::BITCAST, SL, VT, BV);
1392   }
1393 
1394   for (const SDUse &U : Op->ops())
1395     DAG.ExtractVectorElements(U.get(), Args);
1396 
1397   return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
1398 }
1399 
1400 SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
1401                                                      SelectionDAG &DAG) const {
1402 
1403   SmallVector<SDValue, 8> Args;
1404   unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
1405   EVT VT = Op.getValueType();
1406   EVT SrcVT = Op.getOperand(0).getValueType();
1407 
1408   // For these types, we have some TableGen patterns except if the index is 1
1409   if (((SrcVT == MVT::v4f16 && VT == MVT::v2f16) ||
1410        (SrcVT == MVT::v4i16 && VT == MVT::v2i16)) &&
1411       Start != 1)
1412     return Op;
1413 
1414   if (((SrcVT == MVT::v8f16 && VT == MVT::v4f16) ||
1415        (SrcVT == MVT::v8i16 && VT == MVT::v4i16)) &&
1416       (Start == 0 || Start == 4))
1417     return Op;
1418 
1419   DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
1420                             VT.getVectorNumElements());
1421 
1422   return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
1423 }
1424 
1425 /// Generate Min/Max node
1426 SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
1427                                                    SDValue LHS, SDValue RHS,
1428                                                    SDValue True, SDValue False,
1429                                                    SDValue CC,
1430                                                    DAGCombinerInfo &DCI) const {
1431   if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
1432     return SDValue();
1433 
1434   SelectionDAG &DAG = DCI.DAG;
1435   ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1436   switch (CCOpcode) {
1437   case ISD::SETOEQ:
1438   case ISD::SETONE:
1439   case ISD::SETUNE:
1440   case ISD::SETNE:
1441   case ISD::SETUEQ:
1442   case ISD::SETEQ:
1443   case ISD::SETFALSE:
1444   case ISD::SETFALSE2:
1445   case ISD::SETTRUE:
1446   case ISD::SETTRUE2:
1447   case ISD::SETUO:
1448   case ISD::SETO:
1449     break;
1450   case ISD::SETULE:
1451   case ISD::SETULT: {
1452     if (LHS == True)
1453       return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1454     return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1455   }
1456   case ISD::SETOLE:
1457   case ISD::SETOLT:
1458   case ISD::SETLE:
1459   case ISD::SETLT: {
1460     // Ordered. Assume ordered for undefined.
1461 
1462     // Only do this after legalization to avoid interfering with other combines
1463     // which might occur.
1464     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1465         !DCI.isCalledByLegalizer())
1466       return SDValue();
1467 
1468     // We need to permute the operands to get the correct NaN behavior. The
1469     // selected operand is the second one based on the failing compare with NaN,
1470     // so permute it based on the compare type the hardware uses.
1471     if (LHS == True)
1472       return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1473     return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1474   }
1475   case ISD::SETUGE:
1476   case ISD::SETUGT: {
1477     if (LHS == True)
1478       return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1479     return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1480   }
1481   case ISD::SETGT:
1482   case ISD::SETGE:
1483   case ISD::SETOGE:
1484   case ISD::SETOGT: {
1485     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1486         !DCI.isCalledByLegalizer())
1487       return SDValue();
1488 
1489     if (LHS == True)
1490       return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1491     return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1492   }
1493   case ISD::SETCC_INVALID:
1494     llvm_unreachable("Invalid setcc condcode!");
1495   }
1496   return SDValue();
1497 }
1498 
1499 std::pair<SDValue, SDValue>
1500 AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
1501   SDLoc SL(Op);
1502 
1503   SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1504 
1505   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1506   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1507 
1508   SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1509   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1510 
1511   return std::make_pair(Lo, Hi);
1512 }
1513 
1514 SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
1515   SDLoc SL(Op);
1516 
1517   SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1518   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1519   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1520 }
1521 
1522 SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
1523   SDLoc SL(Op);
1524 
1525   SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1526   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1527   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1528 }
1529 
1530 // Split a vector type into two parts. The first part is a power of two vector.
1531 // The second part is whatever is left over, and is a scalar if it would
1532 // otherwise be a 1-vector.
1533 std::pair<EVT, EVT>
1534 AMDGPUTargetLowering::getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const {
1535   EVT LoVT, HiVT;
1536   EVT EltVT = VT.getVectorElementType();
1537   unsigned NumElts = VT.getVectorNumElements();
1538   unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
1539   LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
1540   HiVT = NumElts - LoNumElts == 1
1541              ? EltVT
1542              : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
1543   return std::make_pair(LoVT, HiVT);
1544 }
1545 
1546 // Split a vector value into two parts of types LoVT and HiVT. HiVT could be
1547 // scalar.
1548 std::pair<SDValue, SDValue>
1549 AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL,
1550                                   const EVT &LoVT, const EVT &HiVT,
1551                                   SelectionDAG &DAG) const {
1552   assert(LoVT.getVectorNumElements() +
1553                  (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
1554              N.getValueType().getVectorNumElements() &&
1555          "More vector elements requested than available!");
1556   SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
1557                            DAG.getVectorIdxConstant(0, DL));
1558   SDValue Hi = DAG.getNode(
1559       HiVT.isVector() ? ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT, DL,
1560       HiVT, N, DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), DL));
1561   return std::make_pair(Lo, Hi);
1562 }
1563 
1564 SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
1565                                               SelectionDAG &DAG) const {
1566   LoadSDNode *Load = cast<LoadSDNode>(Op);
1567   EVT VT = Op.getValueType();
1568   SDLoc SL(Op);
1569 
1570 
1571   // If this is a 2 element vector, we really want to scalarize and not create
1572   // weird 1 element vectors.
1573   if (VT.getVectorNumElements() == 2) {
1574     SDValue Ops[2];
1575     std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
1576     return DAG.getMergeValues(Ops, SL);
1577   }
1578 
1579   SDValue BasePtr = Load->getBasePtr();
1580   EVT MemVT = Load->getMemoryVT();
1581 
1582   const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1583 
1584   EVT LoVT, HiVT;
1585   EVT LoMemVT, HiMemVT;
1586   SDValue Lo, Hi;
1587 
1588   std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1589   std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1590   std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);
1591 
1592   unsigned Size = LoMemVT.getStoreSize();
1593   unsigned BaseAlign = Load->getAlignment();
1594   unsigned HiAlign = MinAlign(BaseAlign, Size);
1595 
1596   SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
1597                                   Load->getChain(), BasePtr, SrcValue, LoMemVT,
1598                                   BaseAlign, Load->getMemOperand()->getFlags());
1599   SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::Fixed(Size));
1600   SDValue HiLoad =
1601       DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
1602                      HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
1603                      HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
1604 
1605   SDValue Join;
1606   if (LoVT == HiVT) {
1607     // This is the case that the vector is power of two so was evenly split.
1608     Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
1609   } else {
1610     Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getUNDEF(VT), LoLoad,
1611                        DAG.getVectorIdxConstant(0, SL));
1612     Join = DAG.getNode(
1613         HiVT.isVector() ? ISD::INSERT_SUBVECTOR : ISD::INSERT_VECTOR_ELT, SL,
1614         VT, Join, HiLoad,
1615         DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), SL));
1616   }
1617 
1618   SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
1619                                      LoLoad.getValue(1), HiLoad.getValue(1))};
1620 
1621   return DAG.getMergeValues(Ops, SL);
1622 }
1623 
1624 SDValue AMDGPUTargetLowering::WidenOrSplitVectorLoad(SDValue Op,
1625                                                      SelectionDAG &DAG) const {
1626   LoadSDNode *Load = cast<LoadSDNode>(Op);
1627   EVT VT = Op.getValueType();
1628   SDValue BasePtr = Load->getBasePtr();
1629   EVT MemVT = Load->getMemoryVT();
1630   SDLoc SL(Op);
1631   const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1632   unsigned BaseAlign = Load->getAlignment();
1633   unsigned NumElements = MemVT.getVectorNumElements();
1634 
1635   // Widen from vec3 to vec4 when the load is at least 8-byte aligned
1636   // or 16-byte fully dereferenceable. Otherwise, split the vector load.
1637   if (NumElements != 3 ||
1638       (BaseAlign < 8 &&
1639        !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout())))
1640     return SplitVectorLoad(Op, DAG);
1641 
1642   assert(NumElements == 3);
1643 
1644   EVT WideVT =
1645       EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
1646   EVT WideMemVT =
1647       EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(), 4);
1648   SDValue WideLoad = DAG.getExtLoad(
1649       Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
1650       WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
1651   return DAG.getMergeValues(
1652       {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
1653                    DAG.getVectorIdxConstant(0, SL)),
1654        WideLoad.getValue(1)},
1655       SL);
1656 }
1657 
1658 SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
1659                                                SelectionDAG &DAG) const {
1660   StoreSDNode *Store = cast<StoreSDNode>(Op);
1661   SDValue Val = Store->getValue();
1662   EVT VT = Val.getValueType();
1663 
1664   // If this is a 2 element vector, we really want to scalarize and not create
1665   // weird 1 element vectors.
1666   if (VT.getVectorNumElements() == 2)
1667     return scalarizeVectorStore(Store, DAG);
1668 
1669   EVT MemVT = Store->getMemoryVT();
1670   SDValue Chain = Store->getChain();
1671   SDValue BasePtr = Store->getBasePtr();
1672   SDLoc SL(Op);
1673 
1674   EVT LoVT, HiVT;
1675   EVT LoMemVT, HiMemVT;
1676   SDValue Lo, Hi;
1677 
1678   std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1679   std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1680   std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);
1681 
1682   SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
1683 
1684   const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1685   unsigned BaseAlign = Store->getAlignment();
1686   unsigned Size = LoMemVT.getStoreSize();
1687   unsigned HiAlign = MinAlign(BaseAlign, Size);
1688 
1689   SDValue LoStore =
1690       DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
1691                         Store->getMemOperand()->getFlags());
1692   SDValue HiStore =
1693       DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
1694                         HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
1695 
1696   return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
1697 }
1698 
1699 // This is a shortcut for integer division because we have fast i32<->f32
1700 // conversions, and fast f32 reciprocal instructions. The fractional part of a
1701 // float is enough to accurately represent up to a 24-bit signed integer.
1702 SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
1703                                             bool Sign) const {
1704   SDLoc DL(Op);
1705   EVT VT = Op.getValueType();
1706   SDValue LHS = Op.getOperand(0);
1707   SDValue RHS = Op.getOperand(1);
1708   MVT IntVT = MVT::i32;
1709   MVT FltVT = MVT::f32;
1710 
1711   unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
1712   if (LHSSignBits < 9)
1713     return SDValue();
1714 
1715   unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
1716   if (RHSSignBits < 9)
1717     return SDValue();
1718 
1719   unsigned BitSize = VT.getSizeInBits();
1720   unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1721   unsigned DivBits = BitSize - SignBits;
1722   if (Sign)
1723     ++DivBits;
1724 
1725   ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
1726   ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
1727 
1728   SDValue jq = DAG.getConstant(1, DL, IntVT);
1729 
1730   if (Sign) {
1731     // char|short jq = ia ^ ib;
1732     jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
1733 
1734     // jq = jq >> (bitsize - 2)
1735     jq = DAG.getNode(ISD::SRA, DL, VT, jq,
1736                      DAG.getConstant(BitSize - 2, DL, VT));
1737 
1738     // jq = jq | 0x1
1739     jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
1740   }
1741 
1742   // int ia = (int)LHS;
1743   SDValue ia = LHS;
1744 
1745   // int ib, (int)RHS;
1746   SDValue ib = RHS;
1747 
1748   // float fa = (float)ia;
1749   SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
1750 
1751   // float fb = (float)ib;
1752   SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
1753 
1754   SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
1755                            fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
1756 
1757   // fq = trunc(fq);
1758   fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
1759 
1760   // float fqneg = -fq;
1761   SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
1762 
1763   MachineFunction &MF = DAG.getMachineFunction();
1764   const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
1765 
1766   // float fr = mad(fqneg, fb, fa);
1767   unsigned OpCode = !Subtarget->hasMadMacF32Insts() ?
1768                     (unsigned)ISD::FMA :
1769                     !MFI->getMode().allFP32Denormals() ?
1770                     (unsigned)ISD::FMAD :
1771                     (unsigned)AMDGPUISD::FMAD_FTZ;
1772   SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
1773 
1774   // int iq = (int)fq;
1775   SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
1776 
1777   // fr = fabs(fr);
1778   fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
1779 
1780   // fb = fabs(fb);
1781   fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
1782 
1783   EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
1784 
1785   // int cv = fr >= fb;
1786   SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
1787 
1788   // jq = (cv ? jq : 0);
1789   jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
1790 
1791   // dst = iq + jq;
1792   SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
1793 
1794   // Rem needs compensation, it's easier to recompute it
1795   SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
1796   Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
1797 
1798   // Truncate to number of bits this divide really is.
1799   if (Sign) {
1800     SDValue InRegSize
1801       = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
1802     Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
1803     Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
1804   } else {
1805     SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
1806     Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
1807     Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
1808   }
1809 
1810   return DAG.getMergeValues({ Div, Rem }, DL);
1811 }
1812 
1813 void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
1814                                       SelectionDAG &DAG,
1815                                       SmallVectorImpl<SDValue> &Results) const {
1816   SDLoc DL(Op);
1817   EVT VT = Op.getValueType();
1818 
1819   assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
1820 
1821   EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
1822 
1823   SDValue One = DAG.getConstant(1, DL, HalfVT);
1824   SDValue Zero = DAG.getConstant(0, DL, HalfVT);
1825 
1826   //HiLo split
1827   SDValue LHS = Op.getOperand(0);
1828   SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
1829   SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, One);
1830 
1831   SDValue RHS = Op.getOperand(1);
1832   SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
1833   SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, One);
1834 
1835   if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
1836       DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
1837 
1838     SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
1839                               LHS_Lo, RHS_Lo);
1840 
1841     SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
1842     SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
1843 
1844     Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
1845     Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
1846     return;
1847   }
1848 
1849   if (isTypeLegal(MVT::i64)) {
1850     // The algorithm here is based on ideas from "Software Integer Division",
1851     // Tom Rodeheffer, August 2008.
1852 
1853     MachineFunction &MF = DAG.getMachineFunction();
1854     const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1855 
1856     // Compute denominator reciprocal.
1857     unsigned FMAD = !Subtarget->hasMadMacF32Insts() ?
1858                     (unsigned)ISD::FMA :
1859                     !MFI->getMode().allFP32Denormals() ?
1860                     (unsigned)ISD::FMAD :
1861                     (unsigned)AMDGPUISD::FMAD_FTZ;
1862 
1863     SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
1864     SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
1865     SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
1866       DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
1867       Cvt_Lo);
1868     SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
1869     SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
1870       DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
1871     SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
1872       DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
1873     SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
1874     SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
1875       DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
1876       Mul1);
1877     SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
1878     SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
1879     SDValue Rcp64 = DAG.getBitcast(VT,
1880                         DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
1881 
1882     SDValue Zero64 = DAG.getConstant(0, DL, VT);
1883     SDValue One64  = DAG.getConstant(1, DL, VT);
1884     SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
1885     SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
1886 
1887     // First round of UNR (Unsigned integer Newton-Raphson).
1888     SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
1889     SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
1890     SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
1891     SDValue Mulhi1_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1,
1892                                     Zero);
1893     SDValue Mulhi1_Hi =
1894         DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1, One);
1895     SDValue Add1_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Lo,
1896                                   Mulhi1_Lo, Zero1);
1897     SDValue Add1_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Hi,
1898                                   Mulhi1_Hi, Add1_Lo.getValue(1));
1899     SDValue Add1 = DAG.getBitcast(VT,
1900                         DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
1901 
1902     // Second round of UNR.
1903     SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
1904     SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
1905     SDValue Mulhi2_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2,
1906                                     Zero);
1907     SDValue Mulhi2_Hi =
1908         DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2, One);
1909     SDValue Add2_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_Lo,
1910                                   Mulhi2_Lo, Zero1);
1911     SDValue Add2_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_Hi,
1912                                   Mulhi2_Hi, Add2_Lo.getValue(1));
1913     SDValue Add2 = DAG.getBitcast(VT,
1914                         DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
1915 
1916     SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
1917 
1918     SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
1919 
1920     SDValue Mul3_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, Zero);
1921     SDValue Mul3_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, One);
1922     SDValue Sub1_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Lo,
1923                                   Mul3_Lo, Zero1);
1924     SDValue Sub1_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Hi,
1925                                   Mul3_Hi, Sub1_Lo.getValue(1));
1926     SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
1927     SDValue Sub1 = DAG.getBitcast(VT,
1928                         DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
1929 
1930     SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
1931     SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
1932                                  ISD::SETUGE);
1933     SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
1934                                  ISD::SETUGE);
1935     SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);
1936 
1937     // TODO: Here and below portions of the code can be enclosed into if/endif.
1938     // Currently control flow is unconditional and we have 4 selects after
1939     // potential endif to substitute PHIs.
1940 
1941     // if C3 != 0 ...
1942     SDValue Sub2_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Lo,
1943                                   RHS_Lo, Zero1);
1944     SDValue Sub2_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Mi,
1945                                   RHS_Hi, Sub1_Lo.getValue(1));
1946     SDValue Sub2_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
1947                                   Zero, Sub2_Lo.getValue(1));
1948     SDValue Sub2 = DAG.getBitcast(VT,
1949                         DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
1950 
1951     SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);
1952 
1953     SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
1954                                  ISD::SETUGE);
1955     SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
1956                                  ISD::SETUGE);
1957     SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);
1958 
1959     // if (C6 != 0)
1960     SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
1961 
1962     SDValue Sub3_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Lo,
1963                                   RHS_Lo, Zero1);
1964     SDValue Sub3_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
1965                                   RHS_Hi, Sub2_Lo.getValue(1));
1966     SDValue Sub3_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub3_Mi,
1967                                   Zero, Sub3_Lo.getValue(1));
1968     SDValue Sub3 = DAG.getBitcast(VT,
1969                         DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
1970 
1971     // endif C6
1972     // endif C3
1973 
1974     SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
1975     SDValue Div  = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);
1976 
1977     SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
1978     SDValue Rem  = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);
1979 
1980     Results.push_back(Div);
1981     Results.push_back(Rem);
1982 
1983     return;
1984   }
1985 
1986   // r600 expandion.
1987   // Get Speculative values
1988   SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
1989   SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
1990 
1991   SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
1992   SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
1993   REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
1994 
1995   SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
1996   SDValue DIV_Lo = Zero;
1997 
1998   const unsigned halfBitWidth = HalfVT.getSizeInBits();
1999 
2000   for (unsigned i = 0; i < halfBitWidth; ++i) {
2001     const unsigned bitPos = halfBitWidth - i - 1;
2002     SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
2003     // Get value of high bit
2004     SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
2005     HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
2006     HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
2007 
2008     // Shift
2009     REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
2010     // Add LHS high bit
2011     REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
2012 
2013     SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
2014     SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);
2015 
2016     DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
2017 
2018     // Update REM
2019     SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
2020     REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
2021   }
2022 
2023   SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
2024   DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
2025   Results.push_back(DIV);
2026   Results.push_back(REM);
2027 }
2028 
2029 SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
2030                                            SelectionDAG &DAG) const {
2031   SDLoc DL(Op);
2032   EVT VT = Op.getValueType();
2033 
2034   if (VT == MVT::i64) {
2035     SmallVector<SDValue, 2> Results;
2036     LowerUDIVREM64(Op, DAG, Results);
2037     return DAG.getMergeValues(Results, DL);
2038   }
2039 
2040   if (VT == MVT::i32) {
2041     if (SDValue Res = LowerDIVREM24(Op, DAG, false))
2042       return Res;
2043   }
2044 
2045   SDValue X = Op.getOperand(0);
2046   SDValue Y = Op.getOperand(1);
2047 
2048   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2049   // algorithm used here.
2050 
2051   // Initial estimate of inv(y).
2052   SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y);
2053 
2054   // One round of UNR.
2055   SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y);
2056   SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z);
2057   Z = DAG.getNode(ISD::ADD, DL, VT, Z,
2058                   DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ));
2059 
2060   // Quotient/remainder estimate.
2061   SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z);
2062   SDValue R =
2063       DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y));
2064 
2065   // First quotient/remainder refinement.
2066   EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2067   SDValue One = DAG.getConstant(1, DL, VT);
2068   SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2069   Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2070                   DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2071   R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2072                   DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2073 
2074   // Second quotient/remainder refinement.
2075   Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2076   Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2077                   DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2078   R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2079                   DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2080 
2081   return DAG.getMergeValues({Q, R}, DL);
2082 }
2083 
2084 SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
2085                                            SelectionDAG &DAG) const {
2086   SDLoc DL(Op);
2087   EVT VT = Op.getValueType();
2088 
2089   SDValue LHS = Op.getOperand(0);
2090   SDValue RHS = Op.getOperand(1);
2091 
2092   SDValue Zero = DAG.getConstant(0, DL, VT);
2093   SDValue NegOne = DAG.getConstant(-1, DL, VT);
2094 
2095   if (VT == MVT::i32) {
2096     if (SDValue Res = LowerDIVREM24(Op, DAG, true))
2097       return Res;
2098   }
2099 
2100   if (VT == MVT::i64 &&
2101       DAG.ComputeNumSignBits(LHS) > 32 &&
2102       DAG.ComputeNumSignBits(RHS) > 32) {
2103     EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2104 
2105     //HiLo split
2106     SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
2107     SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
2108     SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2109                                  LHS_Lo, RHS_Lo);
2110     SDValue Res[2] = {
2111       DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
2112       DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
2113     };
2114     return DAG.getMergeValues(Res, DL);
2115   }
2116 
2117   SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
2118   SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
2119   SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
2120   SDValue RSign = LHSign; // Remainder sign is the same as LHS
2121 
2122   LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
2123   RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
2124 
2125   LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
2126   RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
2127 
2128   SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
2129   SDValue Rem = Div.getValue(1);
2130 
2131   Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
2132   Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
2133 
2134   Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
2135   Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
2136 
2137   SDValue Res[2] = {
2138     Div,
2139     Rem
2140   };
2141   return DAG.getMergeValues(Res, DL);
2142 }
2143 
2144 // (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x)
2145 SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
2146   SDLoc SL(Op);
2147   EVT VT = Op.getValueType();
2148   auto Flags = Op->getFlags();
2149   SDValue X = Op.getOperand(0);
2150   SDValue Y = Op.getOperand(1);
2151 
2152   SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y, Flags);
2153   SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, VT, Div, Flags);
2154   SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Trunc, Flags);
2155   // TODO: For f32 use FMAD instead if !hasFastFMA32?
2156   return DAG.getNode(ISD::FMA, SL, VT, Neg, Y, X, Flags);
2157 }
2158 
2159 SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
2160   SDLoc SL(Op);
2161   SDValue Src = Op.getOperand(0);
2162 
2163   // result = trunc(src)
2164   // if (src > 0.0 && src != result)
2165   //   result += 1.0
2166 
2167   SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2168 
2169   const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2170   const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2171 
2172   EVT SetCCVT =
2173       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2174 
2175   SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
2176   SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2177   SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2178 
2179   SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2180   // TODO: Should this propagate fast-math-flags?
2181   return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2182 }
2183 
2184 static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
2185                                   SelectionDAG &DAG) {
2186   const unsigned FractBits = 52;
2187   const unsigned ExpBits = 11;
2188 
2189   SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2190                                 Hi,
2191                                 DAG.getConstant(FractBits - 32, SL, MVT::i32),
2192                                 DAG.getConstant(ExpBits, SL, MVT::i32));
2193   SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2194                             DAG.getConstant(1023, SL, MVT::i32));
2195 
2196   return Exp;
2197 }
2198 
2199 SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
2200   SDLoc SL(Op);
2201   SDValue Src = Op.getOperand(0);
2202 
2203   assert(Op.getValueType() == MVT::f64);
2204 
2205   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2206 
2207   // Extract the upper half, since this is where we will find the sign and
2208   // exponent.
2209   SDValue Hi = getHiHalf64(Src, DAG);
2210 
2211   SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2212 
2213   const unsigned FractBits = 52;
2214 
2215   // Extract the sign bit.
2216   const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
2217   SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
2218 
2219   // Extend back to 64-bits.
2220   SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
2221   SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
2222 
2223   SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
2224   const SDValue FractMask
2225     = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
2226 
2227   SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
2228   SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
2229   SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
2230 
2231   EVT SetCCVT =
2232       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2233 
2234   const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2235 
2236   SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2237   SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2238 
2239   SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2240   SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2241 
2242   return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2243 }
2244 
2245 SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
2246   SDLoc SL(Op);
2247   SDValue Src = Op.getOperand(0);
2248 
2249   assert(Op.getValueType() == MVT::f64);
2250 
2251   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2252   SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2253   SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2254 
2255   // TODO: Should this propagate fast-math-flags?
2256 
2257   SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2258   SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
2259 
2260   SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2261 
2262   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2263   SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2264 
2265   EVT SetCCVT =
2266       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2267   SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
2268 
2269   return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2270 }
2271 
2272 SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const {
2273   // FNEARBYINT and FRINT are the same, except in their handling of FP
2274   // exceptions. Those aren't really meaningful for us, and OpenCL only has
2275   // rint, so just treat them as equivalent.
2276   return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0));
2277 }
2278 
2279 // XXX - May require not supporting f32 denormals?
2280 
2281 // Don't handle v2f16. The extra instructions to scalarize and repack around the
2282 // compare and vselect end up producing worse code than scalarizing the whole
2283 // operation.
2284 SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2285   SDLoc SL(Op);
2286   SDValue X = Op.getOperand(0);
2287   EVT VT = Op.getValueType();
2288 
2289   SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
2290 
2291   // TODO: Should this propagate fast-math-flags?
2292 
2293   SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
2294 
2295   SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
2296 
2297   const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2298   const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2299   const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2300 
2301   SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, VT, One, X);
2302 
2303   EVT SetCCVT =
2304       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2305 
2306   SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2307 
2308   SDValue Sel = DAG.getNode(ISD::SELECT, SL, VT, Cmp, SignOne, Zero);
2309 
2310   return DAG.getNode(ISD::FADD, SL, VT, T, Sel);
2311 }
2312 
2313 SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
2314   SDLoc SL(Op);
2315   SDValue Src = Op.getOperand(0);
2316 
2317   // result = trunc(src);
2318   // if (src < 0.0 && src != result)
2319   //   result += -1.0.
2320 
2321   SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2322 
2323   const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2324   const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2325 
2326   EVT SetCCVT =
2327       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2328 
2329   SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2330   SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2331   SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2332 
2333   SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2334   // TODO: Should this propagate fast-math-flags?
2335   return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2336 }
2337 
2338 SDValue AMDGPUTargetLowering::LowerFLOG(SDValue Op, SelectionDAG &DAG,
2339                                         double Log2BaseInverted) const {
2340   EVT VT = Op.getValueType();
2341 
2342   SDLoc SL(Op);
2343   SDValue Operand = Op.getOperand(0);
2344   SDValue Log2Operand = DAG.getNode(ISD::FLOG2, SL, VT, Operand);
2345   SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2346 
2347   return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand);
2348 }
2349 
2350 // exp2(M_LOG2E_F * f);
2351 SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
2352   EVT VT = Op.getValueType();
2353   SDLoc SL(Op);
2354   SDValue Src = Op.getOperand(0);
2355 
2356   const SDValue K = DAG.getConstantFP(numbers::log2e, SL, VT);
2357   SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Src, K, Op->getFlags());
2358   return DAG.getNode(ISD::FEXP2, SL, VT, Mul, Op->getFlags());
2359 }
2360 
2361 static bool isCtlzOpc(unsigned Opc) {
2362   return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
2363 }
2364 
2365 static bool isCttzOpc(unsigned Opc) {
2366   return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
2367 }
2368 
2369 SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
2370   SDLoc SL(Op);
2371   SDValue Src = Op.getOperand(0);
2372 
2373   assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
2374   bool Ctlz = isCtlzOpc(Op.getOpcode());
2375   unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
2376 
2377   bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
2378                    Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
2379 
2380   if (Src.getValueType() == MVT::i32) {
2381     // (ctlz hi:lo) -> (umin (ffbh src), 32)
2382     // (cttz hi:lo) -> (umin (ffbl src), 32)
2383     // (ctlz_zero_undef src) -> (ffbh src)
2384     // (cttz_zero_undef src) -> (ffbl src)
2385     SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
2386     if (!ZeroUndef) {
2387       const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
2388       NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const32);
2389     }
2390     return NewOpr;
2391   }
2392 
2393   SDValue Lo, Hi;
2394   std::tie(Lo, Hi) = split64BitValue(Src, DAG);
2395 
2396   SDValue OprLo = DAG.getNode(NewOpc, SL, MVT::i32, Lo);
2397   SDValue OprHi = DAG.getNode(NewOpc, SL, MVT::i32, Hi);
2398 
2399   // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
2400   // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
2401   // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
2402   // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
2403 
2404   unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
2405   const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
2406   if (Ctlz)
2407     OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32);
2408   else
2409     OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32);
2410 
2411   SDValue NewOpr;
2412   NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, OprLo, OprHi);
2413   if (!ZeroUndef) {
2414     const SDValue Const64 = DAG.getConstant(64, SL, MVT::i32);
2415     NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const64);
2416   }
2417 
2418   return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
2419 }
2420 
2421 SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
2422                                                bool Signed) const {
2423   // The regular method converting a 64-bit integer to float roughly consists of
2424   // 2 steps: normalization and rounding. In fact, after normalization, the
2425   // conversion from a 64-bit integer to a float is essentially the same as the
2426   // one from a 32-bit integer. The only difference is that it has more
2427   // trailing bits to be rounded. To leverage the native 32-bit conversion, a
2428   // 64-bit integer could be preprocessed and fit into a 32-bit integer then
2429   // converted into the correct float number. The basic steps for the unsigned
2430   // conversion are illustrated in the following pseudo code:
2431   //
2432   // f32 uitofp(i64 u) {
2433   //   i32 hi, lo = split(u);
2434   //   // Only count the leading zeros in hi as we have native support of the
2435   //   // conversion from i32 to f32. If hi is all 0s, the conversion is
2436   //   // reduced to a 32-bit one automatically.
2437   //   i32 shamt = clz(hi); // Return 32 if hi is all 0s.
2438   //   u <<= shamt;
2439   //   hi, lo = split(u);
2440   //   hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
2441   //   // convert it as a 32-bit integer and scale the result back.
2442   //   return uitofp(hi) * 2^(32 - shamt);
2443   // }
2444   //
2445   // The signed one follows the same principle but uses 'ffbh_i32' to count its
2446   // sign bits instead. If 'ffbh_i32' is not available, its absolute value is
2447   // converted instead followed by negation based its sign bit.
2448 
2449   SDLoc SL(Op);
2450   SDValue Src = Op.getOperand(0);
2451 
2452   SDValue Lo, Hi;
2453   std::tie(Lo, Hi) = split64BitValue(Src, DAG);
2454   SDValue Sign;
2455   SDValue ShAmt;
2456   if (Signed && Subtarget->isGCN()) {
2457     // We also need to consider the sign bit in Lo if Hi has just sign bits,
2458     // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
2459     // account. That is, the maximal shift is
2460     // - 32 if Lo and Hi have opposite signs;
2461     // - 33 if Lo and Hi have the same sign.
2462     //
2463     // Or, MaxShAmt = 33 + OppositeSign, where
2464     //
2465     // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
2466     // - -1 if Lo and Hi have opposite signs; and
2467     // -  0 otherwise.
2468     //
2469     // All in all, ShAmt is calculated as
2470     //
2471     //  umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
2472     //
2473     // or
2474     //
2475     //  umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
2476     //
2477     // to reduce the critical path.
2478     SDValue OppositeSign = DAG.getNode(
2479         ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
2480         DAG.getConstant(31, SL, MVT::i32));
2481     SDValue MaxShAmt =
2482         DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
2483                     OppositeSign);
2484     // Count the leading sign bits.
2485     ShAmt = DAG.getNode(AMDGPUISD::FFBH_I32, SL, MVT::i32, Hi);
2486     // Different from unsigned conversion, the shift should be one bit less to
2487     // preserve the sign bit.
2488     ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
2489                         DAG.getConstant(1, SL, MVT::i32));
2490     ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt);
2491   } else {
2492     if (Signed) {
2493       // Without 'ffbh_i32', only leading zeros could be counted. Take the
2494       // absolute value first.
2495       Sign = DAG.getNode(ISD::SRA, SL, MVT::i64, Src,
2496                          DAG.getConstant(63, SL, MVT::i64));
2497       SDValue Abs =
2498           DAG.getNode(ISD::XOR, SL, MVT::i64,
2499                       DAG.getNode(ISD::ADD, SL, MVT::i64, Src, Sign), Sign);
2500       std::tie(Lo, Hi) = split64BitValue(Abs, DAG);
2501     }
2502     // Count the leading zeros.
2503     ShAmt = DAG.getNode(ISD::CTLZ, SL, MVT::i32, Hi);
2504     // The shift amount for signed integers is [0, 32].
2505   }
2506   // Normalize the given 64-bit integer.
2507   SDValue Norm = DAG.getNode(ISD::SHL, SL, MVT::i64, Src, ShAmt);
2508   // Split it again.
2509   std::tie(Lo, Hi) = split64BitValue(Norm, DAG);
2510   // Calculate the adjust bit for rounding.
2511   // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
2512   SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32,
2513                                DAG.getConstant(1, SL, MVT::i32), Lo);
2514   // Get the 32-bit normalized integer.
2515   Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
2516   // Convert the normalized 32-bit integer into f32.
2517   unsigned Opc =
2518       (Signed && Subtarget->isGCN()) ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
2519   SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm);
2520 
2521   // Finally, need to scale back the converted floating number as the original
2522   // 64-bit integer is converted as a 32-bit one.
2523   ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
2524                       ShAmt);
2525   // On GCN, use LDEXP directly.
2526   if (Subtarget->isGCN())
2527     return DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f32, FVal, ShAmt);
2528 
2529   // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent
2530   // part directly to emulate the multiplication of 2^ShAmt. That 8-bit
2531   // exponent is enough to avoid overflowing into the sign bit.
2532   SDValue Exp = DAG.getNode(ISD::SHL, SL, MVT::i32, ShAmt,
2533                             DAG.getConstant(23, SL, MVT::i32));
2534   SDValue IVal =
2535       DAG.getNode(ISD::ADD, SL, MVT::i32,
2536                   DAG.getNode(ISD::BITCAST, SL, MVT::i32, FVal), Exp);
2537   if (Signed) {
2538     // Set the sign bit.
2539     Sign = DAG.getNode(ISD::SHL, SL, MVT::i32,
2540                        DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Sign),
2541                        DAG.getConstant(31, SL, MVT::i32));
2542     IVal = DAG.getNode(ISD::OR, SL, MVT::i32, IVal, Sign);
2543   }
2544   return DAG.getNode(ISD::BITCAST, SL, MVT::f32, IVal);
2545 }
2546 
2547 SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
2548                                                bool Signed) const {
2549   SDLoc SL(Op);
2550   SDValue Src = Op.getOperand(0);
2551 
2552   SDValue Lo, Hi;
2553   std::tie(Lo, Hi) = split64BitValue(Src, DAG);
2554 
2555   SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
2556                               SL, MVT::f64, Hi);
2557 
2558   SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
2559 
2560   SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi,
2561                               DAG.getConstant(32, SL, MVT::i32));
2562   // TODO: Should this propagate fast-math-flags?
2563   return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
2564 }
2565 
2566 SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
2567                                                SelectionDAG &DAG) const {
2568   // TODO: Factor out code common with LowerSINT_TO_FP.
2569   EVT DestVT = Op.getValueType();
2570   SDValue Src = Op.getOperand(0);
2571   EVT SrcVT = Src.getValueType();
2572 
2573   if (SrcVT == MVT::i16) {
2574     if (DestVT == MVT::f16)
2575       return Op;
2576     SDLoc DL(Op);
2577 
2578     // Promote src to i32
2579     SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
2580     return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
2581   }
2582 
2583   assert(SrcVT == MVT::i64 && "operation should be legal");
2584 
2585   if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
2586     SDLoc DL(Op);
2587 
2588     SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
2589     SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
2590     SDValue FPRound =
2591         DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
2592 
2593     return FPRound;
2594   }
2595 
2596   if (DestVT == MVT::f32)
2597     return LowerINT_TO_FP32(Op, DAG, false);
2598 
2599   assert(DestVT == MVT::f64);
2600   return LowerINT_TO_FP64(Op, DAG, false);
2601 }
2602 
2603 SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
2604                                               SelectionDAG &DAG) const {
2605   EVT DestVT = Op.getValueType();
2606 
2607   SDValue Src = Op.getOperand(0);
2608   EVT SrcVT = Src.getValueType();
2609 
2610   if (SrcVT == MVT::i16) {
2611     if (DestVT == MVT::f16)
2612       return Op;
2613 
2614     SDLoc DL(Op);
2615     // Promote src to i32
2616     SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
2617     return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
2618   }
2619 
2620   assert(SrcVT == MVT::i64 && "operation should be legal");
2621 
2622   // TODO: Factor out code common with LowerUINT_TO_FP.
2623 
2624   if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
2625     SDLoc DL(Op);
2626     SDValue Src = Op.getOperand(0);
2627 
2628     SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
2629     SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
2630     SDValue FPRound =
2631         DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
2632 
2633     return FPRound;
2634   }
2635 
2636   if (DestVT == MVT::f32)
2637     return LowerINT_TO_FP32(Op, DAG, true);
2638 
2639   assert(DestVT == MVT::f64);
2640   return LowerINT_TO_FP64(Op, DAG, true);
2641 }
2642 
2643 SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
2644                                                bool Signed) const {
2645   SDLoc SL(Op);
2646 
2647   SDValue Src = Op.getOperand(0);
2648   EVT SrcVT = Src.getValueType();
2649 
2650   assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
2651 
2652   // The basic idea of converting a floating point number into a pair of 32-bit
2653   // integers is illustrated as follows:
2654   //
2655   //     tf := trunc(val);
2656   //    hif := floor(tf * 2^-32);
2657   //    lof := tf - hif * 2^32; // lof is always positive due to floor.
2658   //     hi := fptoi(hif);
2659   //     lo := fptoi(lof);
2660   //
2661   SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src);
2662   SDValue Sign;
2663   if (Signed && SrcVT == MVT::f32) {
2664     // However, a 32-bit floating point number has only 23 bits mantissa and
2665     // it's not enough to hold all the significant bits of `lof` if val is
2666     // negative. To avoid the loss of precision, We need to take the absolute
2667     // value after truncating and flip the result back based on the original
2668     // signedness.
2669     Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
2670                        DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
2671                        DAG.getConstant(31, SL, MVT::i32));
2672     Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc);
2673   }
2674 
2675   SDValue K0, K1;
2676   if (SrcVT == MVT::f64) {
2677     K0 = DAG.getConstantFP(BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)),
2678                            SL, SrcVT);
2679     K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)),
2680                            SL, SrcVT);
2681   } else {
2682     K0 = DAG.getConstantFP(BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)), SL,
2683                            SrcVT);
2684     K1 = DAG.getConstantFP(BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)), SL,
2685                            SrcVT);
2686   }
2687   // TODO: Should this propagate fast-math-flags?
2688   SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0);
2689 
2690   SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul);
2691 
2692   SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc);
2693 
2694   SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
2695                                                          : ISD::FP_TO_UINT,
2696                            SL, MVT::i32, FloorMul);
2697   SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
2698 
2699   SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
2700                                DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));
2701 
2702   if (Signed && SrcVT == MVT::f32) {
2703     assert(Sign);
2704     // Flip the result based on the signedness, which is either all 0s or 1s.
2705     Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
2706                        DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign}));
2707     // r := xor(r, sign) - sign;
2708     Result =
2709         DAG.getNode(ISD::SUB, SL, MVT::i64,
2710                     DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign);
2711   }
2712 
2713   return Result;
2714 }
2715 
2716 SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
2717   SDLoc DL(Op);
2718   SDValue N0 = Op.getOperand(0);
2719 
2720   // Convert to target node to get known bits
2721   if (N0.getValueType() == MVT::f32)
2722     return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
2723 
2724   if (getTargetMachine().Options.UnsafeFPMath) {
2725     // There is a generic expand for FP_TO_FP16 with unsafe fast math.
2726     return SDValue();
2727   }
2728 
2729   assert(N0.getSimpleValueType() == MVT::f64);
2730 
2731   // f64 -> f16 conversion using round-to-nearest-even rounding mode.
2732   const unsigned ExpMask = 0x7ff;
2733   const unsigned ExpBiasf64 = 1023;
2734   const unsigned ExpBiasf16 = 15;
2735   SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
2736   SDValue One = DAG.getConstant(1, DL, MVT::i32);
2737   SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
2738   SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
2739                            DAG.getConstant(32, DL, MVT::i64));
2740   UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
2741   U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
2742   SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2743                           DAG.getConstant(20, DL, MVT::i64));
2744   E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
2745                   DAG.getConstant(ExpMask, DL, MVT::i32));
2746   // Subtract the fp64 exponent bias (1023) to get the real exponent and
2747   // add the f16 bias (15) to get the biased exponent for the f16 format.
2748   E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
2749                   DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
2750 
2751   SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2752                           DAG.getConstant(8, DL, MVT::i32));
2753   M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
2754                   DAG.getConstant(0xffe, DL, MVT::i32));
2755 
2756   SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
2757                                   DAG.getConstant(0x1ff, DL, MVT::i32));
2758   MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
2759 
2760   SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
2761   M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
2762 
2763   // (M != 0 ? 0x0200 : 0) | 0x7c00;
2764   SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
2765       DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
2766                       Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
2767 
2768   // N = M | (E << 12);
2769   SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2770       DAG.getNode(ISD::SHL, DL, MVT::i32, E,
2771                   DAG.getConstant(12, DL, MVT::i32)));
2772 
2773   // B = clamp(1-E, 0, 13);
2774   SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
2775                                   One, E);
2776   SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
2777   B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
2778                   DAG.getConstant(13, DL, MVT::i32));
2779 
2780   SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2781                                    DAG.getConstant(0x1000, DL, MVT::i32));
2782 
2783   SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
2784   SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
2785   SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
2786   D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
2787 
2788   SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
2789   SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
2790                               DAG.getConstant(0x7, DL, MVT::i32));
2791   V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
2792                   DAG.getConstant(2, DL, MVT::i32));
2793   SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
2794                                One, Zero, ISD::SETEQ);
2795   SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
2796                                One, Zero, ISD::SETGT);
2797   V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
2798   V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
2799 
2800   V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
2801                       DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
2802   V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
2803                       I, V, ISD::SETEQ);
2804 
2805   // Extract the sign bit.
2806   SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2807                             DAG.getConstant(16, DL, MVT::i32));
2808   Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
2809                      DAG.getConstant(0x8000, DL, MVT::i32));
2810 
2811   V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
2812   return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
2813 }
2814 
2815 SDValue AMDGPUTargetLowering::LowerFP_TO_INT(SDValue Op,
2816                                              SelectionDAG &DAG) const {
2817   SDValue Src = Op.getOperand(0);
2818   unsigned OpOpcode = Op.getOpcode();
2819   EVT SrcVT = Src.getValueType();
2820   EVT DestVT = Op.getValueType();
2821 
2822   // Will be selected natively
2823   if (SrcVT == MVT::f16 && DestVT == MVT::i16)
2824     return Op;
2825 
2826   // Promote i16 to i32
2827   if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
2828     SDLoc DL(Op);
2829 
2830     SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
2831     return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
2832   }
2833 
2834   if (SrcVT == MVT::f16 ||
2835       (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
2836     SDLoc DL(Op);
2837 
2838     SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
2839     unsigned Ext =
2840         OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
2841     return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
2842   }
2843 
2844   if (DestVT == MVT::i64 && (SrcVT == MVT::f32 || SrcVT == MVT::f64))
2845     return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT);
2846 
2847   return SDValue();
2848 }
2849 
2850 SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
2851                                                      SelectionDAG &DAG) const {
2852   EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
2853   MVT VT = Op.getSimpleValueType();
2854   MVT ScalarVT = VT.getScalarType();
2855 
2856   assert(VT.isVector());
2857 
2858   SDValue Src = Op.getOperand(0);
2859   SDLoc DL(Op);
2860 
2861   // TODO: Don't scalarize on Evergreen?
2862   unsigned NElts = VT.getVectorNumElements();
2863   SmallVector<SDValue, 8> Args;
2864   DAG.ExtractVectorElements(Src, Args, 0, NElts);
2865 
2866   SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
2867   for (unsigned I = 0; I < NElts; ++I)
2868     Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
2869 
2870   return DAG.getBuildVector(VT, DL, Args);
2871 }
2872 
2873 //===----------------------------------------------------------------------===//
2874 // Custom DAG optimizations
2875 //===----------------------------------------------------------------------===//
2876 
2877 static bool isU24(SDValue Op, SelectionDAG &DAG) {
2878   return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
2879 }
2880 
2881 static bool isI24(SDValue Op, SelectionDAG &DAG) {
2882   EVT VT = Op.getValueType();
2883   return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
2884                                      // as unsigned 24-bit values.
2885          AMDGPUTargetLowering::numBitsSigned(Op, DAG) <= 24;
2886 }
2887 
2888 static SDValue simplifyMul24(SDNode *Node24,
2889                              TargetLowering::DAGCombinerInfo &DCI) {
2890   SelectionDAG &DAG = DCI.DAG;
2891   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
2892   bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
2893 
2894   SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
2895   SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
2896   unsigned NewOpcode = Node24->getOpcode();
2897   if (IsIntrin) {
2898     unsigned IID = cast<ConstantSDNode>(Node24->getOperand(0))->getZExtValue();
2899     switch (IID) {
2900     case Intrinsic::amdgcn_mul_i24:
2901       NewOpcode = AMDGPUISD::MUL_I24;
2902       break;
2903     case Intrinsic::amdgcn_mul_u24:
2904       NewOpcode = AMDGPUISD::MUL_U24;
2905       break;
2906     case Intrinsic::amdgcn_mulhi_i24:
2907       NewOpcode = AMDGPUISD::MULHI_I24;
2908       break;
2909     case Intrinsic::amdgcn_mulhi_u24:
2910       NewOpcode = AMDGPUISD::MULHI_U24;
2911       break;
2912     default:
2913       llvm_unreachable("Expected 24-bit mul intrinsic");
2914     }
2915   }
2916 
2917   APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
2918 
2919   // First try to simplify using SimplifyMultipleUseDemandedBits which allows
2920   // the operands to have other uses, but will only perform simplifications that
2921   // involve bypassing some nodes for this user.
2922   SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
2923   SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
2924   if (DemandedLHS || DemandedRHS)
2925     return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
2926                        DemandedLHS ? DemandedLHS : LHS,
2927                        DemandedRHS ? DemandedRHS : RHS);
2928 
2929   // Now try SimplifyDemandedBits which can simplify the nodes used by our
2930   // operands if this node is the only user.
2931   if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
2932     return SDValue(Node24, 0);
2933   if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
2934     return SDValue(Node24, 0);
2935 
2936   return SDValue();
2937 }
2938 
2939 template <typename IntTy>
2940 static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
2941                                uint32_t Width, const SDLoc &DL) {
2942   if (Width + Offset < 32) {
2943     uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
2944     IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
2945     return DAG.getConstant(Result, DL, MVT::i32);
2946   }
2947 
2948   return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
2949 }
2950 
2951 static bool hasVolatileUser(SDNode *Val) {
2952   for (SDNode *U : Val->uses()) {
2953     if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
2954       if (M->isVolatile())
2955         return true;
2956     }
2957   }
2958 
2959   return false;
2960 }
2961 
2962 bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
2963   // i32 vectors are the canonical memory type.
2964   if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
2965     return false;
2966 
2967   if (!VT.isByteSized())
2968     return false;
2969 
2970   unsigned Size = VT.getStoreSize();
2971 
2972   if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
2973     return false;
2974 
2975   if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
2976     return false;
2977 
2978   return true;
2979 }
2980 
2981 // Replace load of an illegal type with a store of a bitcast to a friendlier
2982 // type.
2983 SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
2984                                                  DAGCombinerInfo &DCI) const {
2985   if (!DCI.isBeforeLegalize())
2986     return SDValue();
2987 
2988   LoadSDNode *LN = cast<LoadSDNode>(N);
2989   if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
2990     return SDValue();
2991 
2992   SDLoc SL(N);
2993   SelectionDAG &DAG = DCI.DAG;
2994   EVT VT = LN->getMemoryVT();
2995 
2996   unsigned Size = VT.getStoreSize();
2997   Align Alignment = LN->getAlign();
2998   if (Alignment < Size && isTypeLegal(VT)) {
2999     bool IsFast;
3000     unsigned AS = LN->getAddressSpace();
3001 
3002     // Expand unaligned loads earlier than legalization. Due to visitation order
3003     // problems during legalization, the emitted instructions to pack and unpack
3004     // the bytes again are not eliminated in the case of an unaligned copy.
3005     if (!allowsMisalignedMemoryAccesses(
3006             VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
3007       SDValue Ops[2];
3008 
3009       if (VT.isVector())
3010         std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(LN, DAG);
3011       else
3012         std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
3013 
3014       return DAG.getMergeValues(Ops, SDLoc(N));
3015     }
3016 
3017     if (!IsFast)
3018       return SDValue();
3019   }
3020 
3021   if (!shouldCombineMemoryType(VT))
3022     return SDValue();
3023 
3024   EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3025 
3026   SDValue NewLoad
3027     = DAG.getLoad(NewVT, SL, LN->getChain(),
3028                   LN->getBasePtr(), LN->getMemOperand());
3029 
3030   SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
3031   DCI.CombineTo(N, BC, NewLoad.getValue(1));
3032   return SDValue(N, 0);
3033 }
3034 
3035 // Replace store of an illegal type with a store of a bitcast to a friendlier
3036 // type.
3037 SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
3038                                                   DAGCombinerInfo &DCI) const {
3039   if (!DCI.isBeforeLegalize())
3040     return SDValue();
3041 
3042   StoreSDNode *SN = cast<StoreSDNode>(N);
3043   if (!SN->isSimple() || !ISD::isNormalStore(SN))
3044     return SDValue();
3045 
3046   EVT VT = SN->getMemoryVT();
3047   unsigned Size = VT.getStoreSize();
3048 
3049   SDLoc SL(N);
3050   SelectionDAG &DAG = DCI.DAG;
3051   Align Alignment = SN->getAlign();
3052   if (Alignment < Size && isTypeLegal(VT)) {
3053     bool IsFast;
3054     unsigned AS = SN->getAddressSpace();
3055 
3056     // Expand unaligned stores earlier than legalization. Due to visitation
3057     // order problems during legalization, the emitted instructions to pack and
3058     // unpack the bytes again are not eliminated in the case of an unaligned
3059     // copy.
3060     if (!allowsMisalignedMemoryAccesses(
3061             VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) {
3062       if (VT.isVector())
3063         return scalarizeVectorStore(SN, DAG);
3064 
3065       return expandUnalignedStore(SN, DAG);
3066     }
3067 
3068     if (!IsFast)
3069       return SDValue();
3070   }
3071 
3072   if (!shouldCombineMemoryType(VT))
3073     return SDValue();
3074 
3075   EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3076   SDValue Val = SN->getValue();
3077 
3078   //DCI.AddToWorklist(Val.getNode());
3079 
3080   bool OtherUses = !Val.hasOneUse();
3081   SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
3082   if (OtherUses) {
3083     SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
3084     DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
3085   }
3086 
3087   return DAG.getStore(SN->getChain(), SL, CastVal,
3088                       SN->getBasePtr(), SN->getMemOperand());
3089 }
3090 
3091 // FIXME: This should go in generic DAG combiner with an isTruncateFree check,
3092 // but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
3093 // issues.
3094 SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
3095                                                         DAGCombinerInfo &DCI) const {
3096   SelectionDAG &DAG = DCI.DAG;
3097   SDValue N0 = N->getOperand(0);
3098 
3099   // (vt2 (assertzext (truncate vt0:x), vt1)) ->
3100   //     (vt2 (truncate (assertzext vt0:x, vt1)))
3101   if (N0.getOpcode() == ISD::TRUNCATE) {
3102     SDValue N1 = N->getOperand(1);
3103     EVT ExtVT = cast<VTSDNode>(N1)->getVT();
3104     SDLoc SL(N);
3105 
3106     SDValue Src = N0.getOperand(0);
3107     EVT SrcVT = Src.getValueType();
3108     if (SrcVT.bitsGE(ExtVT)) {
3109       SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
3110       return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
3111     }
3112   }
3113 
3114   return SDValue();
3115 }
3116 
3117 SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
3118   SDNode *N, DAGCombinerInfo &DCI) const {
3119   unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
3120   switch (IID) {
3121   case Intrinsic::amdgcn_mul_i24:
3122   case Intrinsic::amdgcn_mul_u24:
3123   case Intrinsic::amdgcn_mulhi_i24:
3124   case Intrinsic::amdgcn_mulhi_u24:
3125     return simplifyMul24(N, DCI);
3126   case Intrinsic::amdgcn_fract:
3127   case Intrinsic::amdgcn_rsq:
3128   case Intrinsic::amdgcn_rcp_legacy:
3129   case Intrinsic::amdgcn_rsq_legacy:
3130   case Intrinsic::amdgcn_rsq_clamp:
3131   case Intrinsic::amdgcn_ldexp: {
3132     // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
3133     SDValue Src = N->getOperand(1);
3134     return Src.isUndef() ? Src : SDValue();
3135   }
3136   default:
3137     return SDValue();
3138   }
3139 }
3140 
3141 /// Split the 64-bit value \p LHS into two 32-bit components, and perform the
3142 /// binary operation \p Opc to it with the corresponding constant operands.
3143 SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
3144   DAGCombinerInfo &DCI, const SDLoc &SL,
3145   unsigned Opc, SDValue LHS,
3146   uint32_t ValLo, uint32_t ValHi) const {
3147   SelectionDAG &DAG = DCI.DAG;
3148   SDValue Lo, Hi;
3149   std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
3150 
3151   SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
3152   SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
3153 
3154   SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
3155   SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
3156 
3157   // Re-visit the ands. It's possible we eliminated one of them and it could
3158   // simplify the vector.
3159   DCI.AddToWorklist(Lo.getNode());
3160   DCI.AddToWorklist(Hi.getNode());
3161 
3162   SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
3163   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3164 }
3165 
3166 SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
3167                                                 DAGCombinerInfo &DCI) const {
3168   EVT VT = N->getValueType(0);
3169 
3170   ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3171   if (!RHS)
3172     return SDValue();
3173 
3174   SDValue LHS = N->getOperand(0);
3175   unsigned RHSVal = RHS->getZExtValue();
3176   if (!RHSVal)
3177     return LHS;
3178 
3179   SDLoc SL(N);
3180   SelectionDAG &DAG = DCI.DAG;
3181 
3182   switch (LHS->getOpcode()) {
3183   default:
3184     break;
3185   case ISD::ZERO_EXTEND:
3186   case ISD::SIGN_EXTEND:
3187   case ISD::ANY_EXTEND: {
3188     SDValue X = LHS->getOperand(0);
3189 
3190     if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
3191         isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
3192       // Prefer build_vector as the canonical form if packed types are legal.
3193       // (shl ([asz]ext i16:x), 16 -> build_vector 0, x
3194       SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
3195        { DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) });
3196       return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
3197     }
3198 
3199     // shl (ext x) => zext (shl x), if shift does not overflow int
3200     if (VT != MVT::i64)
3201       break;
3202     KnownBits Known = DAG.computeKnownBits(X);
3203     unsigned LZ = Known.countMinLeadingZeros();
3204     if (LZ < RHSVal)
3205       break;
3206     EVT XVT = X.getValueType();
3207     SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
3208     return DAG.getZExtOrTrunc(Shl, SL, VT);
3209   }
3210   }
3211 
3212   if (VT != MVT::i64)
3213     return SDValue();
3214 
3215   // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
3216 
3217   // On some subtargets, 64-bit shift is a quarter rate instruction. In the
3218   // common case, splitting this into a move and a 32-bit shift is faster and
3219   // the same code size.
3220   if (RHSVal < 32)
3221     return SDValue();
3222 
3223   SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
3224 
3225   SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
3226   SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
3227 
3228   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
3229 
3230   SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
3231   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3232 }
3233 
3234 SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
3235                                                 DAGCombinerInfo &DCI) const {
3236   if (N->getValueType(0) != MVT::i64)
3237     return SDValue();
3238 
3239   const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3240   if (!RHS)
3241     return SDValue();
3242 
3243   SelectionDAG &DAG = DCI.DAG;
3244   SDLoc SL(N);
3245   unsigned RHSVal = RHS->getZExtValue();
3246 
3247   // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
3248   if (RHSVal == 32) {
3249     SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
3250     SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
3251                                    DAG.getConstant(31, SL, MVT::i32));
3252 
3253     SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
3254     return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
3255   }
3256 
3257   // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
3258   if (RHSVal == 63) {
3259     SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
3260     SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
3261                                    DAG.getConstant(31, SL, MVT::i32));
3262     SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
3263     return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
3264   }
3265 
3266   return SDValue();
3267 }
3268 
3269 SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
3270                                                 DAGCombinerInfo &DCI) const {
3271   auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3272   if (!RHS)
3273     return SDValue();
3274 
3275   EVT VT = N->getValueType(0);
3276   SDValue LHS = N->getOperand(0);
3277   unsigned ShiftAmt = RHS->getZExtValue();
3278   SelectionDAG &DAG = DCI.DAG;
3279   SDLoc SL(N);
3280 
3281   // fold (srl (and x, c1 << c2), c2) -> (and (srl(x, c2), c1)
3282   // this improves the ability to match BFE patterns in isel.
3283   if (LHS.getOpcode() == ISD::AND) {
3284     if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
3285       unsigned MaskIdx, MaskLen;
3286       if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
3287           MaskIdx == ShiftAmt) {
3288         return DAG.getNode(
3289             ISD::AND, SL, VT,
3290             DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), N->getOperand(1)),
3291             DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1), N->getOperand(1)));
3292       }
3293     }
3294   }
3295 
3296   if (VT != MVT::i64)
3297     return SDValue();
3298 
3299   if (ShiftAmt < 32)
3300     return SDValue();
3301 
3302   // srl i64:x, C for C >= 32
3303   // =>
3304   //   build_pair (srl hi_32(x), C - 32), 0
3305   SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
3306 
3307   SDValue Hi = getHiHalf64(LHS, DAG);
3308 
3309   SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
3310   SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
3311 
3312   SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
3313 
3314   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
3315 }
3316 
3317 SDValue AMDGPUTargetLowering::performTruncateCombine(
3318   SDNode *N, DAGCombinerInfo &DCI) const {
3319   SDLoc SL(N);
3320   SelectionDAG &DAG = DCI.DAG;
3321   EVT VT = N->getValueType(0);
3322   SDValue Src = N->getOperand(0);
3323 
3324   // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
3325   if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
3326     SDValue Vec = Src.getOperand(0);
3327     if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
3328       SDValue Elt0 = Vec.getOperand(0);
3329       EVT EltVT = Elt0.getValueType();
3330       if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
3331         if (EltVT.isFloatingPoint()) {
3332           Elt0 = DAG.getNode(ISD::BITCAST, SL,
3333                              EltVT.changeTypeToInteger(), Elt0);
3334         }
3335 
3336         return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
3337       }
3338     }
3339   }
3340 
3341   // Equivalent of above for accessing the high element of a vector as an
3342   // integer operation.
3343   // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
3344   if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
3345     if (auto K = isConstOrConstSplat(Src.getOperand(1))) {
3346       if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) {
3347         SDValue BV = stripBitcast(Src.getOperand(0));
3348         if (BV.getOpcode() == ISD::BUILD_VECTOR &&
3349             BV.getValueType().getVectorNumElements() == 2) {
3350           SDValue SrcElt = BV.getOperand(1);
3351           EVT SrcEltVT = SrcElt.getValueType();
3352           if (SrcEltVT.isFloatingPoint()) {
3353             SrcElt = DAG.getNode(ISD::BITCAST, SL,
3354                                  SrcEltVT.changeTypeToInteger(), SrcElt);
3355           }
3356 
3357           return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
3358         }
3359       }
3360     }
3361   }
3362 
3363   // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
3364   //
3365   // i16 (trunc (srl i64:x, K)), K <= 16 ->
3366   //     i16 (trunc (srl (i32 (trunc x), K)))
3367   if (VT.getScalarSizeInBits() < 32) {
3368     EVT SrcVT = Src.getValueType();
3369     if (SrcVT.getScalarSizeInBits() > 32 &&
3370         (Src.getOpcode() == ISD::SRL ||
3371          Src.getOpcode() == ISD::SRA ||
3372          Src.getOpcode() == ISD::SHL)) {
3373       SDValue Amt = Src.getOperand(1);
3374       KnownBits Known = DAG.computeKnownBits(Amt);
3375       unsigned Size = VT.getScalarSizeInBits();
3376       if ((Known.isConstant() && Known.getConstant().ule(Size)) ||
3377           (Known.countMaxActiveBits() <= Log2_32(Size))) {
3378         EVT MidVT = VT.isVector() ?
3379           EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3380                            VT.getVectorNumElements()) : MVT::i32;
3381 
3382         EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
3383         SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
3384                                     Src.getOperand(0));
3385         DCI.AddToWorklist(Trunc.getNode());
3386 
3387         if (Amt.getValueType() != NewShiftVT) {
3388           Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
3389           DCI.AddToWorklist(Amt.getNode());
3390         }
3391 
3392         SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
3393                                           Trunc, Amt);
3394         return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
3395       }
3396     }
3397   }
3398 
3399   return SDValue();
3400 }
3401 
3402 // We need to specifically handle i64 mul here to avoid unnecessary conversion
3403 // instructions. If we only match on the legalized i64 mul expansion,
3404 // SimplifyDemandedBits will be unable to remove them because there will be
3405 // multiple uses due to the separate mul + mulh[su].
3406 static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
3407                         SDValue N0, SDValue N1, unsigned Size, bool Signed) {
3408   if (Size <= 32) {
3409     unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
3410     return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
3411   }
3412 
3413   unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
3414   unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
3415 
3416   SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
3417   SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
3418 
3419   return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi);
3420 }
3421 
3422 SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
3423                                                 DAGCombinerInfo &DCI) const {
3424   EVT VT = N->getValueType(0);
3425 
3426   // Don't generate 24-bit multiplies on values that are in SGPRs, since
3427   // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
3428   // unnecessarily). isDivergent() is used as an approximation of whether the
3429   // value is in an SGPR.
3430   if (!N->isDivergent())
3431     return SDValue();
3432 
3433   unsigned Size = VT.getSizeInBits();
3434   if (VT.isVector() || Size > 64)
3435     return SDValue();
3436 
3437   // There are i16 integer mul/mad.
3438   if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
3439     return SDValue();
3440 
3441   SelectionDAG &DAG = DCI.DAG;
3442   SDLoc DL(N);
3443 
3444   SDValue N0 = N->getOperand(0);
3445   SDValue N1 = N->getOperand(1);
3446 
3447   // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
3448   // in the source into any_extends if the result of the mul is truncated. Since
3449   // we can assume the high bits are whatever we want, use the underlying value
3450   // to avoid the unknown high bits from interfering.
3451   if (N0.getOpcode() == ISD::ANY_EXTEND)
3452     N0 = N0.getOperand(0);
3453 
3454   if (N1.getOpcode() == ISD::ANY_EXTEND)
3455     N1 = N1.getOperand(0);
3456 
3457   SDValue Mul;
3458 
3459   if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
3460     N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3461     N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3462     Mul = getMul24(DAG, DL, N0, N1, Size, false);
3463   } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
3464     N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3465     N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3466     Mul = getMul24(DAG, DL, N0, N1, Size, true);
3467   } else {
3468     return SDValue();
3469   }
3470 
3471   // We need to use sext even for MUL_U24, because MUL_U24 is used
3472   // for signed multiply of 8 and 16-bit types.
3473   return DAG.getSExtOrTrunc(Mul, DL, VT);
3474 }
3475 
3476 SDValue
3477 AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N,
3478                                             DAGCombinerInfo &DCI) const {
3479   if (N->getValueType(0) != MVT::i32)
3480     return SDValue();
3481 
3482   SelectionDAG &DAG = DCI.DAG;
3483   SDLoc DL(N);
3484 
3485   SDValue N0 = N->getOperand(0);
3486   SDValue N1 = N->getOperand(1);
3487 
3488   // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
3489   // in the source into any_extends if the result of the mul is truncated. Since
3490   // we can assume the high bits are whatever we want, use the underlying value
3491   // to avoid the unknown high bits from interfering.
3492   if (N0.getOpcode() == ISD::ANY_EXTEND)
3493     N0 = N0.getOperand(0);
3494   if (N1.getOpcode() == ISD::ANY_EXTEND)
3495     N1 = N1.getOperand(0);
3496 
3497   // Try to use two fast 24-bit multiplies (one for each half of the result)
3498   // instead of one slow extending multiply.
3499   unsigned LoOpcode, HiOpcode;
3500   if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
3501     N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3502     N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3503     LoOpcode = AMDGPUISD::MUL_U24;
3504     HiOpcode = AMDGPUISD::MULHI_U24;
3505   } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
3506     N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3507     N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3508     LoOpcode = AMDGPUISD::MUL_I24;
3509     HiOpcode = AMDGPUISD::MULHI_I24;
3510   } else {
3511     return SDValue();
3512   }
3513 
3514   SDValue Lo = DAG.getNode(LoOpcode, DL, MVT::i32, N0, N1);
3515   SDValue Hi = DAG.getNode(HiOpcode, DL, MVT::i32, N0, N1);
3516   DCI.CombineTo(N, Lo, Hi);
3517   return SDValue(N, 0);
3518 }
3519 
3520 SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
3521                                                   DAGCombinerInfo &DCI) const {
3522   EVT VT = N->getValueType(0);
3523 
3524   if (!Subtarget->hasMulI24() || VT.isVector())
3525     return SDValue();
3526 
3527   // Don't generate 24-bit multiplies on values that are in SGPRs, since
3528   // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
3529   // unnecessarily). isDivergent() is used as an approximation of whether the
3530   // value is in an SGPR.
3531   // This doesn't apply if no s_mul_hi is available (since we'll end up with a
3532   // valu op anyway)
3533   if (Subtarget->hasSMulHi() && !N->isDivergent())
3534     return SDValue();
3535 
3536   SelectionDAG &DAG = DCI.DAG;
3537   SDLoc DL(N);
3538 
3539   SDValue N0 = N->getOperand(0);
3540   SDValue N1 = N->getOperand(1);
3541 
3542   if (!isI24(N0, DAG) || !isI24(N1, DAG))
3543     return SDValue();
3544 
3545   N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3546   N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3547 
3548   SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
3549   DCI.AddToWorklist(Mulhi.getNode());
3550   return DAG.getSExtOrTrunc(Mulhi, DL, VT);
3551 }
3552 
3553 SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
3554                                                   DAGCombinerInfo &DCI) const {
3555   EVT VT = N->getValueType(0);
3556 
3557   if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
3558     return SDValue();
3559 
3560   // Don't generate 24-bit multiplies on values that are in SGPRs, since
3561   // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
3562   // unnecessarily). isDivergent() is used as an approximation of whether the
3563   // value is in an SGPR.
3564   // This doesn't apply if no s_mul_hi is available (since we'll end up with a
3565   // valu op anyway)
3566   if (Subtarget->hasSMulHi() && !N->isDivergent())
3567     return SDValue();
3568 
3569   SelectionDAG &DAG = DCI.DAG;
3570   SDLoc DL(N);
3571 
3572   SDValue N0 = N->getOperand(0);
3573   SDValue N1 = N->getOperand(1);
3574 
3575   if (!isU24(N0, DAG) || !isU24(N1, DAG))
3576     return SDValue();
3577 
3578   N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3579   N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3580 
3581   SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
3582   DCI.AddToWorklist(Mulhi.getNode());
3583   return DAG.getZExtOrTrunc(Mulhi, DL, VT);
3584 }
3585 
3586 static bool isNegativeOne(SDValue Val) {
3587   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val))
3588     return C->isAllOnes();
3589   return false;
3590 }
3591 
3592 SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
3593                                           SDValue Op,
3594                                           const SDLoc &DL,
3595                                           unsigned Opc) const {
3596   EVT VT = Op.getValueType();
3597   EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
3598   if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
3599                               LegalVT != MVT::i16))
3600     return SDValue();
3601 
3602   if (VT != MVT::i32)
3603     Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
3604 
3605   SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
3606   if (VT != MVT::i32)
3607     FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
3608 
3609   return FFBX;
3610 }
3611 
3612 // The native instructions return -1 on 0 input. Optimize out a select that
3613 // produces -1 on 0.
3614 //
3615 // TODO: If zero is not undef, we could also do this if the output is compared
3616 // against the bitwidth.
3617 //
3618 // TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
3619 SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
3620                                                  SDValue LHS, SDValue RHS,
3621                                                  DAGCombinerInfo &DCI) const {
3622   ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
3623   if (!CmpRhs || !CmpRhs->isZero())
3624     return SDValue();
3625 
3626   SelectionDAG &DAG = DCI.DAG;
3627   ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
3628   SDValue CmpLHS = Cond.getOperand(0);
3629 
3630   // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
3631   // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
3632   if (CCOpcode == ISD::SETEQ &&
3633       (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
3634       RHS.getOperand(0) == CmpLHS && isNegativeOne(LHS)) {
3635     unsigned Opc =
3636         isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
3637     return getFFBX_U32(DAG, CmpLHS, SL, Opc);
3638   }
3639 
3640   // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
3641   // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
3642   if (CCOpcode == ISD::SETNE &&
3643       (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
3644       LHS.getOperand(0) == CmpLHS && isNegativeOne(RHS)) {
3645     unsigned Opc =
3646         isCttzOpc(LHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
3647 
3648     return getFFBX_U32(DAG, CmpLHS, SL, Opc);
3649   }
3650 
3651   return SDValue();
3652 }
3653 
3654 static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
3655                                          unsigned Op,
3656                                          const SDLoc &SL,
3657                                          SDValue Cond,
3658                                          SDValue N1,
3659                                          SDValue N2) {
3660   SelectionDAG &DAG = DCI.DAG;
3661   EVT VT = N1.getValueType();
3662 
3663   SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
3664                                   N1.getOperand(0), N2.getOperand(0));
3665   DCI.AddToWorklist(NewSelect.getNode());
3666   return DAG.getNode(Op, SL, VT, NewSelect);
3667 }
3668 
3669 // Pull a free FP operation out of a select so it may fold into uses.
3670 //
3671 // select c, (fneg x), (fneg y) -> fneg (select c, x, y)
3672 // select c, (fneg x), k -> fneg (select c, x, (fneg k))
3673 //
3674 // select c, (fabs x), (fabs y) -> fabs (select c, x, y)
3675 // select c, (fabs x), +k -> fabs (select c, x, k)
3676 static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
3677                                     SDValue N) {
3678   SelectionDAG &DAG = DCI.DAG;
3679   SDValue Cond = N.getOperand(0);
3680   SDValue LHS = N.getOperand(1);
3681   SDValue RHS = N.getOperand(2);
3682 
3683   EVT VT = N.getValueType();
3684   if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
3685       (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
3686     return distributeOpThroughSelect(DCI, LHS.getOpcode(),
3687                                      SDLoc(N), Cond, LHS, RHS);
3688   }
3689 
3690   bool Inv = false;
3691   if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
3692     std::swap(LHS, RHS);
3693     Inv = true;
3694   }
3695 
3696   // TODO: Support vector constants.
3697   ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
3698   if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS) {
3699     SDLoc SL(N);
3700     // If one side is an fneg/fabs and the other is a constant, we can push the
3701     // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
3702     SDValue NewLHS = LHS.getOperand(0);
3703     SDValue NewRHS = RHS;
3704 
3705     // Careful: if the neg can be folded up, don't try to pull it back down.
3706     bool ShouldFoldNeg = true;
3707 
3708     if (NewLHS.hasOneUse()) {
3709       unsigned Opc = NewLHS.getOpcode();
3710       if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(Opc))
3711         ShouldFoldNeg = false;
3712       if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
3713         ShouldFoldNeg = false;
3714     }
3715 
3716     if (ShouldFoldNeg) {
3717       if (LHS.getOpcode() == ISD::FNEG)
3718         NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3719       else if (CRHS->isNegative())
3720         return SDValue();
3721 
3722       if (Inv)
3723         std::swap(NewLHS, NewRHS);
3724 
3725       SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
3726                                       Cond, NewLHS, NewRHS);
3727       DCI.AddToWorklist(NewSelect.getNode());
3728       return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
3729     }
3730   }
3731 
3732   return SDValue();
3733 }
3734 
3735 
3736 SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
3737                                                    DAGCombinerInfo &DCI) const {
3738   if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
3739     return Folded;
3740 
3741   SDValue Cond = N->getOperand(0);
3742   if (Cond.getOpcode() != ISD::SETCC)
3743     return SDValue();
3744 
3745   EVT VT = N->getValueType(0);
3746   SDValue LHS = Cond.getOperand(0);
3747   SDValue RHS = Cond.getOperand(1);
3748   SDValue CC = Cond.getOperand(2);
3749 
3750   SDValue True = N->getOperand(1);
3751   SDValue False = N->getOperand(2);
3752 
3753   if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
3754     SelectionDAG &DAG = DCI.DAG;
3755     if (DAG.isConstantValueOfAnyType(True) &&
3756         !DAG.isConstantValueOfAnyType(False)) {
3757       // Swap cmp + select pair to move constant to false input.
3758       // This will allow using VOPC cndmasks more often.
3759       // select (setcc x, y), k, x -> select (setccinv x, y), x, k
3760 
3761       SDLoc SL(N);
3762       ISD::CondCode NewCC =
3763           getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), LHS.getValueType());
3764 
3765       SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
3766       return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
3767     }
3768 
3769     if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
3770       SDValue MinMax
3771         = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
3772       // Revisit this node so we can catch min3/max3/med3 patterns.
3773       //DCI.AddToWorklist(MinMax.getNode());
3774       return MinMax;
3775     }
3776   }
3777 
3778   // There's no reason to not do this if the condition has other uses.
3779   return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
3780 }
3781 
3782 static bool isInv2Pi(const APFloat &APF) {
3783   static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
3784   static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
3785   static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
3786 
3787   return APF.bitwiseIsEqual(KF16) ||
3788          APF.bitwiseIsEqual(KF32) ||
3789          APF.bitwiseIsEqual(KF64);
3790 }
3791 
3792 // 0 and 1.0 / (0.5 * pi) do not have inline immmediates, so there is an
3793 // additional cost to negate them.
3794 bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const {
3795   if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N)) {
3796     if (C->isZero() && !C->isNegative())
3797       return true;
3798 
3799     if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
3800       return true;
3801   }
3802 
3803   return false;
3804 }
3805 
3806 static unsigned inverseMinMax(unsigned Opc) {
3807   switch (Opc) {
3808   case ISD::FMAXNUM:
3809     return ISD::FMINNUM;
3810   case ISD::FMINNUM:
3811     return ISD::FMAXNUM;
3812   case ISD::FMAXNUM_IEEE:
3813     return ISD::FMINNUM_IEEE;
3814   case ISD::FMINNUM_IEEE:
3815     return ISD::FMAXNUM_IEEE;
3816   case AMDGPUISD::FMAX_LEGACY:
3817     return AMDGPUISD::FMIN_LEGACY;
3818   case AMDGPUISD::FMIN_LEGACY:
3819     return  AMDGPUISD::FMAX_LEGACY;
3820   default:
3821     llvm_unreachable("invalid min/max opcode");
3822   }
3823 }
3824 
3825 SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
3826                                                  DAGCombinerInfo &DCI) const {
3827   SelectionDAG &DAG = DCI.DAG;
3828   SDValue N0 = N->getOperand(0);
3829   EVT VT = N->getValueType(0);
3830 
3831   unsigned Opc = N0.getOpcode();
3832 
3833   // If the input has multiple uses and we can either fold the negate down, or
3834   // the other uses cannot, give up. This both prevents unprofitable
3835   // transformations and infinite loops: we won't repeatedly try to fold around
3836   // a negate that has no 'good' form.
3837   if (N0.hasOneUse()) {
3838     // This may be able to fold into the source, but at a code size cost. Don't
3839     // fold if the fold into the user is free.
3840     if (allUsesHaveSourceMods(N, 0))
3841       return SDValue();
3842   } else {
3843     if (fnegFoldsIntoOp(Opc) &&
3844         (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode())))
3845       return SDValue();
3846   }
3847 
3848   SDLoc SL(N);
3849   switch (Opc) {
3850   case ISD::FADD: {
3851     if (!mayIgnoreSignedZero(N0))
3852       return SDValue();
3853 
3854     // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
3855     SDValue LHS = N0.getOperand(0);
3856     SDValue RHS = N0.getOperand(1);
3857 
3858     if (LHS.getOpcode() != ISD::FNEG)
3859       LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
3860     else
3861       LHS = LHS.getOperand(0);
3862 
3863     if (RHS.getOpcode() != ISD::FNEG)
3864       RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3865     else
3866       RHS = RHS.getOperand(0);
3867 
3868     SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
3869     if (Res.getOpcode() != ISD::FADD)
3870       return SDValue(); // Op got folded away.
3871     if (!N0.hasOneUse())
3872       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3873     return Res;
3874   }
3875   case ISD::FMUL:
3876   case AMDGPUISD::FMUL_LEGACY: {
3877     // (fneg (fmul x, y)) -> (fmul x, (fneg y))
3878     // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
3879     SDValue LHS = N0.getOperand(0);
3880     SDValue RHS = N0.getOperand(1);
3881 
3882     if (LHS.getOpcode() == ISD::FNEG)
3883       LHS = LHS.getOperand(0);
3884     else if (RHS.getOpcode() == ISD::FNEG)
3885       RHS = RHS.getOperand(0);
3886     else
3887       RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3888 
3889     SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
3890     if (Res.getOpcode() != Opc)
3891       return SDValue(); // Op got folded away.
3892     if (!N0.hasOneUse())
3893       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3894     return Res;
3895   }
3896   case ISD::FMA:
3897   case ISD::FMAD: {
3898     // TODO: handle llvm.amdgcn.fma.legacy
3899     if (!mayIgnoreSignedZero(N0))
3900       return SDValue();
3901 
3902     // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
3903     SDValue LHS = N0.getOperand(0);
3904     SDValue MHS = N0.getOperand(1);
3905     SDValue RHS = N0.getOperand(2);
3906 
3907     if (LHS.getOpcode() == ISD::FNEG)
3908       LHS = LHS.getOperand(0);
3909     else if (MHS.getOpcode() == ISD::FNEG)
3910       MHS = MHS.getOperand(0);
3911     else
3912       MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
3913 
3914     if (RHS.getOpcode() != ISD::FNEG)
3915       RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3916     else
3917       RHS = RHS.getOperand(0);
3918 
3919     SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
3920     if (Res.getOpcode() != Opc)
3921       return SDValue(); // Op got folded away.
3922     if (!N0.hasOneUse())
3923       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3924     return Res;
3925   }
3926   case ISD::FMAXNUM:
3927   case ISD::FMINNUM:
3928   case ISD::FMAXNUM_IEEE:
3929   case ISD::FMINNUM_IEEE:
3930   case AMDGPUISD::FMAX_LEGACY:
3931   case AMDGPUISD::FMIN_LEGACY: {
3932     // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
3933     // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
3934     // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
3935     // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
3936 
3937     SDValue LHS = N0.getOperand(0);
3938     SDValue RHS = N0.getOperand(1);
3939 
3940     // 0 doesn't have a negated inline immediate.
3941     // TODO: This constant check should be generalized to other operations.
3942     if (isConstantCostlierToNegate(RHS))
3943       return SDValue();
3944 
3945     SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
3946     SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3947     unsigned Opposite = inverseMinMax(Opc);
3948 
3949     SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
3950     if (Res.getOpcode() != Opposite)
3951       return SDValue(); // Op got folded away.
3952     if (!N0.hasOneUse())
3953       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3954     return Res;
3955   }
3956   case AMDGPUISD::FMED3: {
3957     SDValue Ops[3];
3958     for (unsigned I = 0; I < 3; ++I)
3959       Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags());
3960 
3961     SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags());
3962     if (Res.getOpcode() != AMDGPUISD::FMED3)
3963       return SDValue(); // Op got folded away.
3964 
3965     if (!N0.hasOneUse()) {
3966       SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Res);
3967       DAG.ReplaceAllUsesWith(N0, Neg);
3968 
3969       for (SDNode *U : Neg->uses())
3970         DCI.AddToWorklist(U);
3971     }
3972 
3973     return Res;
3974   }
3975   case ISD::FP_EXTEND:
3976   case ISD::FTRUNC:
3977   case ISD::FRINT:
3978   case ISD::FNEARBYINT: // XXX - Should fround be handled?
3979   case ISD::FSIN:
3980   case ISD::FCANONICALIZE:
3981   case AMDGPUISD::RCP:
3982   case AMDGPUISD::RCP_LEGACY:
3983   case AMDGPUISD::RCP_IFLAG:
3984   case AMDGPUISD::SIN_HW: {
3985     SDValue CvtSrc = N0.getOperand(0);
3986     if (CvtSrc.getOpcode() == ISD::FNEG) {
3987       // (fneg (fp_extend (fneg x))) -> (fp_extend x)
3988       // (fneg (rcp (fneg x))) -> (rcp x)
3989       return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
3990     }
3991 
3992     if (!N0.hasOneUse())
3993       return SDValue();
3994 
3995     // (fneg (fp_extend x)) -> (fp_extend (fneg x))
3996     // (fneg (rcp x)) -> (rcp (fneg x))
3997     SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
3998     return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
3999   }
4000   case ISD::FP_ROUND: {
4001     SDValue CvtSrc = N0.getOperand(0);
4002 
4003     if (CvtSrc.getOpcode() == ISD::FNEG) {
4004       // (fneg (fp_round (fneg x))) -> (fp_round x)
4005       return DAG.getNode(ISD::FP_ROUND, SL, VT,
4006                          CvtSrc.getOperand(0), N0.getOperand(1));
4007     }
4008 
4009     if (!N0.hasOneUse())
4010       return SDValue();
4011 
4012     // (fneg (fp_round x)) -> (fp_round (fneg x))
4013     SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
4014     return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
4015   }
4016   case ISD::FP16_TO_FP: {
4017     // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
4018     // f16, but legalization of f16 fneg ends up pulling it out of the source.
4019     // Put the fneg back as a legal source operation that can be matched later.
4020     SDLoc SL(N);
4021 
4022     SDValue Src = N0.getOperand(0);
4023     EVT SrcVT = Src.getValueType();
4024 
4025     // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
4026     SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
4027                                   DAG.getConstant(0x8000, SL, SrcVT));
4028     return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
4029   }
4030   default:
4031     return SDValue();
4032   }
4033 }
4034 
4035 SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
4036                                                  DAGCombinerInfo &DCI) const {
4037   SelectionDAG &DAG = DCI.DAG;
4038   SDValue N0 = N->getOperand(0);
4039 
4040   if (!N0.hasOneUse())
4041     return SDValue();
4042 
4043   switch (N0.getOpcode()) {
4044   case ISD::FP16_TO_FP: {
4045     assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
4046     SDLoc SL(N);
4047     SDValue Src = N0.getOperand(0);
4048     EVT SrcVT = Src.getValueType();
4049 
4050     // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
4051     SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
4052                                   DAG.getConstant(0x7fff, SL, SrcVT));
4053     return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
4054   }
4055   default:
4056     return SDValue();
4057   }
4058 }
4059 
4060 SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
4061                                                 DAGCombinerInfo &DCI) const {
4062   const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
4063   if (!CFP)
4064     return SDValue();
4065 
4066   // XXX - Should this flush denormals?
4067   const APFloat &Val = CFP->getValueAPF();
4068   APFloat One(Val.getSemantics(), "1.0");
4069   return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
4070 }
4071 
4072 SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
4073                                                 DAGCombinerInfo &DCI) const {
4074   SelectionDAG &DAG = DCI.DAG;
4075   SDLoc DL(N);
4076 
4077   switch(N->getOpcode()) {
4078   default:
4079     break;
4080   case ISD::BITCAST: {
4081     EVT DestVT = N->getValueType(0);
4082 
4083     // Push casts through vector builds. This helps avoid emitting a large
4084     // number of copies when materializing floating point vector constants.
4085     //
4086     // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
4087     //   vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
4088     if (DestVT.isVector()) {
4089       SDValue Src = N->getOperand(0);
4090       if (Src.getOpcode() == ISD::BUILD_VECTOR) {
4091         EVT SrcVT = Src.getValueType();
4092         unsigned NElts = DestVT.getVectorNumElements();
4093 
4094         if (SrcVT.getVectorNumElements() == NElts) {
4095           EVT DestEltVT = DestVT.getVectorElementType();
4096 
4097           SmallVector<SDValue, 8> CastedElts;
4098           SDLoc SL(N);
4099           for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
4100             SDValue Elt = Src.getOperand(I);
4101             CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
4102           }
4103 
4104           return DAG.getBuildVector(DestVT, SL, CastedElts);
4105         }
4106       }
4107     }
4108 
4109     if (DestVT.getSizeInBits() != 64 || !DestVT.isVector())
4110       break;
4111 
4112     // Fold bitcasts of constants.
4113     //
4114     // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
4115     // TODO: Generalize and move to DAGCombiner
4116     SDValue Src = N->getOperand(0);
4117     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
4118       SDLoc SL(N);
4119       uint64_t CVal = C->getZExtValue();
4120       SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
4121                                DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
4122                                DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
4123       return DAG.getNode(ISD::BITCAST, SL, DestVT, BV);
4124     }
4125 
4126     if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
4127       const APInt &Val = C->getValueAPF().bitcastToAPInt();
4128       SDLoc SL(N);
4129       uint64_t CVal = Val.getZExtValue();
4130       SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
4131                                 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
4132                                 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
4133 
4134       return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
4135     }
4136 
4137     break;
4138   }
4139   case ISD::SHL: {
4140     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
4141       break;
4142 
4143     return performShlCombine(N, DCI);
4144   }
4145   case ISD::SRL: {
4146     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
4147       break;
4148 
4149     return performSrlCombine(N, DCI);
4150   }
4151   case ISD::SRA: {
4152     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
4153       break;
4154 
4155     return performSraCombine(N, DCI);
4156   }
4157   case ISD::TRUNCATE:
4158     return performTruncateCombine(N, DCI);
4159   case ISD::MUL:
4160     return performMulCombine(N, DCI);
4161   case ISD::SMUL_LOHI:
4162   case ISD::UMUL_LOHI:
4163     return performMulLoHiCombine(N, DCI);
4164   case ISD::MULHS:
4165     return performMulhsCombine(N, DCI);
4166   case ISD::MULHU:
4167     return performMulhuCombine(N, DCI);
4168   case AMDGPUISD::MUL_I24:
4169   case AMDGPUISD::MUL_U24:
4170   case AMDGPUISD::MULHI_I24:
4171   case AMDGPUISD::MULHI_U24:
4172     return simplifyMul24(N, DCI);
4173   case ISD::SELECT:
4174     return performSelectCombine(N, DCI);
4175   case ISD::FNEG:
4176     return performFNegCombine(N, DCI);
4177   case ISD::FABS:
4178     return performFAbsCombine(N, DCI);
4179   case AMDGPUISD::BFE_I32:
4180   case AMDGPUISD::BFE_U32: {
4181     assert(!N->getValueType(0).isVector() &&
4182            "Vector handling of BFE not implemented");
4183     ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
4184     if (!Width)
4185       break;
4186 
4187     uint32_t WidthVal = Width->getZExtValue() & 0x1f;
4188     if (WidthVal == 0)
4189       return DAG.getConstant(0, DL, MVT::i32);
4190 
4191     ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
4192     if (!Offset)
4193       break;
4194 
4195     SDValue BitsFrom = N->getOperand(0);
4196     uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
4197 
4198     bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
4199 
4200     if (OffsetVal == 0) {
4201       // This is already sign / zero extended, so try to fold away extra BFEs.
4202       unsigned SignBits =  Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
4203 
4204       unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
4205       if (OpSignBits >= SignBits)
4206         return BitsFrom;
4207 
4208       EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
4209       if (Signed) {
4210         // This is a sign_extend_inreg. Replace it to take advantage of existing
4211         // DAG Combines. If not eliminated, we will match back to BFE during
4212         // selection.
4213 
4214         // TODO: The sext_inreg of extended types ends, although we can could
4215         // handle them in a single BFE.
4216         return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
4217                            DAG.getValueType(SmallVT));
4218       }
4219 
4220       return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
4221     }
4222 
4223     if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
4224       if (Signed) {
4225         return constantFoldBFE<int32_t>(DAG,
4226                                         CVal->getSExtValue(),
4227                                         OffsetVal,
4228                                         WidthVal,
4229                                         DL);
4230       }
4231 
4232       return constantFoldBFE<uint32_t>(DAG,
4233                                        CVal->getZExtValue(),
4234                                        OffsetVal,
4235                                        WidthVal,
4236                                        DL);
4237     }
4238 
4239     if ((OffsetVal + WidthVal) >= 32 &&
4240         !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
4241       SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
4242       return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
4243                          BitsFrom, ShiftVal);
4244     }
4245 
4246     if (BitsFrom.hasOneUse()) {
4247       APInt Demanded = APInt::getBitsSet(32,
4248                                          OffsetVal,
4249                                          OffsetVal + WidthVal);
4250 
4251       KnownBits Known;
4252       TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
4253                                             !DCI.isBeforeLegalizeOps());
4254       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
4255       if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
4256           TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
4257         DCI.CommitTargetLoweringOpt(TLO);
4258       }
4259     }
4260 
4261     break;
4262   }
4263   case ISD::LOAD:
4264     return performLoadCombine(N, DCI);
4265   case ISD::STORE:
4266     return performStoreCombine(N, DCI);
4267   case AMDGPUISD::RCP:
4268   case AMDGPUISD::RCP_IFLAG:
4269     return performRcpCombine(N, DCI);
4270   case ISD::AssertZext:
4271   case ISD::AssertSext:
4272     return performAssertSZExtCombine(N, DCI);
4273   case ISD::INTRINSIC_WO_CHAIN:
4274     return performIntrinsicWOChainCombine(N, DCI);
4275   }
4276   return SDValue();
4277 }
4278 
4279 //===----------------------------------------------------------------------===//
4280 // Helper functions
4281 //===----------------------------------------------------------------------===//
4282 
4283 SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
4284                                                    const TargetRegisterClass *RC,
4285                                                    Register Reg, EVT VT,
4286                                                    const SDLoc &SL,
4287                                                    bool RawReg) const {
4288   MachineFunction &MF = DAG.getMachineFunction();
4289   MachineRegisterInfo &MRI = MF.getRegInfo();
4290   Register VReg;
4291 
4292   if (!MRI.isLiveIn(Reg)) {
4293     VReg = MRI.createVirtualRegister(RC);
4294     MRI.addLiveIn(Reg, VReg);
4295   } else {
4296     VReg = MRI.getLiveInVirtReg(Reg);
4297   }
4298 
4299   if (RawReg)
4300     return DAG.getRegister(VReg, VT);
4301 
4302   return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
4303 }
4304 
4305 // This may be called multiple times, and nothing prevents creating multiple
4306 // objects at the same offset. See if we already defined this object.
4307 static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size,
4308                                        int64_t Offset) {
4309   for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
4310     if (MFI.getObjectOffset(I) == Offset) {
4311       assert(MFI.getObjectSize(I) == Size);
4312       return I;
4313     }
4314   }
4315 
4316   return MFI.CreateFixedObject(Size, Offset, true);
4317 }
4318 
4319 SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
4320                                                   EVT VT,
4321                                                   const SDLoc &SL,
4322                                                   int64_t Offset) const {
4323   MachineFunction &MF = DAG.getMachineFunction();
4324   MachineFrameInfo &MFI = MF.getFrameInfo();
4325   int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset);
4326 
4327   auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
4328   SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
4329 
4330   return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, Align(4),
4331                      MachineMemOperand::MODereferenceable |
4332                          MachineMemOperand::MOInvariant);
4333 }
4334 
4335 SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
4336                                                    const SDLoc &SL,
4337                                                    SDValue Chain,
4338                                                    SDValue ArgVal,
4339                                                    int64_t Offset) const {
4340   MachineFunction &MF = DAG.getMachineFunction();
4341   MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
4342   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4343 
4344   SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
4345   // Stores to the argument stack area are relative to the stack pointer.
4346   SDValue SP =
4347       DAG.getCopyFromReg(Chain, SL, Info->getStackPtrOffsetReg(), MVT::i32);
4348   Ptr = DAG.getNode(ISD::ADD, SL, MVT::i32, SP, Ptr);
4349   SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, Align(4),
4350                                MachineMemOperand::MODereferenceable);
4351   return Store;
4352 }
4353 
4354 SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
4355                                              const TargetRegisterClass *RC,
4356                                              EVT VT, const SDLoc &SL,
4357                                              const ArgDescriptor &Arg) const {
4358   assert(Arg && "Attempting to load missing argument");
4359 
4360   SDValue V = Arg.isRegister() ?
4361     CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) :
4362     loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
4363 
4364   if (!Arg.isMasked())
4365     return V;
4366 
4367   unsigned Mask = Arg.getMask();
4368   unsigned Shift = countTrailingZeros<unsigned>(Mask);
4369   V = DAG.getNode(ISD::SRL, SL, VT, V,
4370                   DAG.getShiftAmountConstant(Shift, VT, SL));
4371   return DAG.getNode(ISD::AND, SL, VT, V,
4372                      DAG.getConstant(Mask >> Shift, SL, VT));
4373 }
4374 
4375 uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
4376     const MachineFunction &MF, const ImplicitParameter Param) const {
4377   const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
4378   const AMDGPUSubtarget &ST =
4379       AMDGPUSubtarget::get(getTargetMachine(), MF.getFunction());
4380   unsigned ExplicitArgOffset = ST.getExplicitKernelArgOffset(MF.getFunction());
4381   const Align Alignment = ST.getAlignmentForImplicitArgPtr();
4382   uint64_t ArgOffset = alignTo(MFI->getExplicitKernArgSize(), Alignment) +
4383                        ExplicitArgOffset;
4384   switch (Param) {
4385   case GRID_DIM:
4386     return ArgOffset;
4387   case GRID_OFFSET:
4388     return ArgOffset + 4;
4389   }
4390   llvm_unreachable("unexpected implicit parameter type");
4391 }
4392 
4393 #define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
4394 
4395 const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
4396   switch ((AMDGPUISD::NodeType)Opcode) {
4397   case AMDGPUISD::FIRST_NUMBER: break;
4398   // AMDIL DAG nodes
4399   NODE_NAME_CASE(UMUL);
4400   NODE_NAME_CASE(BRANCH_COND);
4401 
4402   // AMDGPU DAG nodes
4403   NODE_NAME_CASE(IF)
4404   NODE_NAME_CASE(ELSE)
4405   NODE_NAME_CASE(LOOP)
4406   NODE_NAME_CASE(CALL)
4407   NODE_NAME_CASE(TC_RETURN)
4408   NODE_NAME_CASE(TRAP)
4409   NODE_NAME_CASE(RET_FLAG)
4410   NODE_NAME_CASE(RETURN_TO_EPILOG)
4411   NODE_NAME_CASE(ENDPGM)
4412   NODE_NAME_CASE(DWORDADDR)
4413   NODE_NAME_CASE(FRACT)
4414   NODE_NAME_CASE(SETCC)
4415   NODE_NAME_CASE(SETREG)
4416   NODE_NAME_CASE(DENORM_MODE)
4417   NODE_NAME_CASE(FMA_W_CHAIN)
4418   NODE_NAME_CASE(FMUL_W_CHAIN)
4419   NODE_NAME_CASE(CLAMP)
4420   NODE_NAME_CASE(COS_HW)
4421   NODE_NAME_CASE(SIN_HW)
4422   NODE_NAME_CASE(FMAX_LEGACY)
4423   NODE_NAME_CASE(FMIN_LEGACY)
4424   NODE_NAME_CASE(FMAX3)
4425   NODE_NAME_CASE(SMAX3)
4426   NODE_NAME_CASE(UMAX3)
4427   NODE_NAME_CASE(FMIN3)
4428   NODE_NAME_CASE(SMIN3)
4429   NODE_NAME_CASE(UMIN3)
4430   NODE_NAME_CASE(FMED3)
4431   NODE_NAME_CASE(SMED3)
4432   NODE_NAME_CASE(UMED3)
4433   NODE_NAME_CASE(FDOT2)
4434   NODE_NAME_CASE(URECIP)
4435   NODE_NAME_CASE(DIV_SCALE)
4436   NODE_NAME_CASE(DIV_FMAS)
4437   NODE_NAME_CASE(DIV_FIXUP)
4438   NODE_NAME_CASE(FMAD_FTZ)
4439   NODE_NAME_CASE(RCP)
4440   NODE_NAME_CASE(RSQ)
4441   NODE_NAME_CASE(RCP_LEGACY)
4442   NODE_NAME_CASE(RCP_IFLAG)
4443   NODE_NAME_CASE(FMUL_LEGACY)
4444   NODE_NAME_CASE(RSQ_CLAMP)
4445   NODE_NAME_CASE(LDEXP)
4446   NODE_NAME_CASE(FP_CLASS)
4447   NODE_NAME_CASE(DOT4)
4448   NODE_NAME_CASE(CARRY)
4449   NODE_NAME_CASE(BORROW)
4450   NODE_NAME_CASE(BFE_U32)
4451   NODE_NAME_CASE(BFE_I32)
4452   NODE_NAME_CASE(BFI)
4453   NODE_NAME_CASE(BFM)
4454   NODE_NAME_CASE(FFBH_U32)
4455   NODE_NAME_CASE(FFBH_I32)
4456   NODE_NAME_CASE(FFBL_B32)
4457   NODE_NAME_CASE(MUL_U24)
4458   NODE_NAME_CASE(MUL_I24)
4459   NODE_NAME_CASE(MULHI_U24)
4460   NODE_NAME_CASE(MULHI_I24)
4461   NODE_NAME_CASE(MAD_U24)
4462   NODE_NAME_CASE(MAD_I24)
4463   NODE_NAME_CASE(MAD_I64_I32)
4464   NODE_NAME_CASE(MAD_U64_U32)
4465   NODE_NAME_CASE(PERM)
4466   NODE_NAME_CASE(TEXTURE_FETCH)
4467   NODE_NAME_CASE(R600_EXPORT)
4468   NODE_NAME_CASE(CONST_ADDRESS)
4469   NODE_NAME_CASE(REGISTER_LOAD)
4470   NODE_NAME_CASE(REGISTER_STORE)
4471   NODE_NAME_CASE(SAMPLE)
4472   NODE_NAME_CASE(SAMPLEB)
4473   NODE_NAME_CASE(SAMPLED)
4474   NODE_NAME_CASE(SAMPLEL)
4475   NODE_NAME_CASE(CVT_F32_UBYTE0)
4476   NODE_NAME_CASE(CVT_F32_UBYTE1)
4477   NODE_NAME_CASE(CVT_F32_UBYTE2)
4478   NODE_NAME_CASE(CVT_F32_UBYTE3)
4479   NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
4480   NODE_NAME_CASE(CVT_PKNORM_I16_F32)
4481   NODE_NAME_CASE(CVT_PKNORM_U16_F32)
4482   NODE_NAME_CASE(CVT_PK_I16_I32)
4483   NODE_NAME_CASE(CVT_PK_U16_U32)
4484   NODE_NAME_CASE(FP_TO_FP16)
4485   NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
4486   NODE_NAME_CASE(CONST_DATA_PTR)
4487   NODE_NAME_CASE(PC_ADD_REL_OFFSET)
4488   NODE_NAME_CASE(LDS)
4489   NODE_NAME_CASE(FPTRUNC_ROUND_UPWARD)
4490   NODE_NAME_CASE(FPTRUNC_ROUND_DOWNWARD)
4491   NODE_NAME_CASE(DUMMY_CHAIN)
4492   case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
4493   NODE_NAME_CASE(LOAD_D16_HI)
4494   NODE_NAME_CASE(LOAD_D16_LO)
4495   NODE_NAME_CASE(LOAD_D16_HI_I8)
4496   NODE_NAME_CASE(LOAD_D16_HI_U8)
4497   NODE_NAME_CASE(LOAD_D16_LO_I8)
4498   NODE_NAME_CASE(LOAD_D16_LO_U8)
4499   NODE_NAME_CASE(STORE_MSKOR)
4500   NODE_NAME_CASE(LOAD_CONSTANT)
4501   NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
4502   NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16)
4503   NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
4504   NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
4505   NODE_NAME_CASE(DS_ORDERED_COUNT)
4506   NODE_NAME_CASE(ATOMIC_CMP_SWAP)
4507   NODE_NAME_CASE(ATOMIC_INC)
4508   NODE_NAME_CASE(ATOMIC_DEC)
4509   NODE_NAME_CASE(ATOMIC_LOAD_FMIN)
4510   NODE_NAME_CASE(ATOMIC_LOAD_FMAX)
4511   NODE_NAME_CASE(BUFFER_LOAD)
4512   NODE_NAME_CASE(BUFFER_LOAD_UBYTE)
4513   NODE_NAME_CASE(BUFFER_LOAD_USHORT)
4514   NODE_NAME_CASE(BUFFER_LOAD_BYTE)
4515   NODE_NAME_CASE(BUFFER_LOAD_SHORT)
4516   NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
4517   NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
4518   NODE_NAME_CASE(SBUFFER_LOAD)
4519   NODE_NAME_CASE(BUFFER_STORE)
4520   NODE_NAME_CASE(BUFFER_STORE_BYTE)
4521   NODE_NAME_CASE(BUFFER_STORE_SHORT)
4522   NODE_NAME_CASE(BUFFER_STORE_FORMAT)
4523   NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)
4524   NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)
4525   NODE_NAME_CASE(BUFFER_ATOMIC_ADD)
4526   NODE_NAME_CASE(BUFFER_ATOMIC_SUB)
4527   NODE_NAME_CASE(BUFFER_ATOMIC_SMIN)
4528   NODE_NAME_CASE(BUFFER_ATOMIC_UMIN)
4529   NODE_NAME_CASE(BUFFER_ATOMIC_SMAX)
4530   NODE_NAME_CASE(BUFFER_ATOMIC_UMAX)
4531   NODE_NAME_CASE(BUFFER_ATOMIC_AND)
4532   NODE_NAME_CASE(BUFFER_ATOMIC_OR)
4533   NODE_NAME_CASE(BUFFER_ATOMIC_XOR)
4534   NODE_NAME_CASE(BUFFER_ATOMIC_INC)
4535   NODE_NAME_CASE(BUFFER_ATOMIC_DEC)
4536   NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
4537   NODE_NAME_CASE(BUFFER_ATOMIC_CSUB)
4538   NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
4539   NODE_NAME_CASE(BUFFER_ATOMIC_FMIN)
4540   NODE_NAME_CASE(BUFFER_ATOMIC_FMAX)
4541 
4542   case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
4543   }
4544   return nullptr;
4545 }
4546 
4547 SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
4548                                               SelectionDAG &DAG, int Enabled,
4549                                               int &RefinementSteps,
4550                                               bool &UseOneConstNR,
4551                                               bool Reciprocal) const {
4552   EVT VT = Operand.getValueType();
4553 
4554   if (VT == MVT::f32) {
4555     RefinementSteps = 0;
4556     return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
4557   }
4558 
4559   // TODO: There is also f64 rsq instruction, but the documentation is less
4560   // clear on its precision.
4561 
4562   return SDValue();
4563 }
4564 
4565 SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
4566                                                SelectionDAG &DAG, int Enabled,
4567                                                int &RefinementSteps) const {
4568   EVT VT = Operand.getValueType();
4569 
4570   if (VT == MVT::f32) {
4571     // Reciprocal, < 1 ulp error.
4572     //
4573     // This reciprocal approximation converges to < 0.5 ulp error with one
4574     // newton rhapson performed with two fused multiple adds (FMAs).
4575 
4576     RefinementSteps = 0;
4577     return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
4578   }
4579 
4580   // TODO: There is also f64 rcp instruction, but the documentation is less
4581   // clear on its precision.
4582 
4583   return SDValue();
4584 }
4585 
4586 void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
4587     const SDValue Op, KnownBits &Known,
4588     const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
4589 
4590   Known.resetAll(); // Don't know anything.
4591 
4592   unsigned Opc = Op.getOpcode();
4593 
4594   switch (Opc) {
4595   default:
4596     break;
4597   case AMDGPUISD::CARRY:
4598   case AMDGPUISD::BORROW: {
4599     Known.Zero = APInt::getHighBitsSet(32, 31);
4600     break;
4601   }
4602 
4603   case AMDGPUISD::BFE_I32:
4604   case AMDGPUISD::BFE_U32: {
4605     ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4606     if (!CWidth)
4607       return;
4608 
4609     uint32_t Width = CWidth->getZExtValue() & 0x1f;
4610 
4611     if (Opc == AMDGPUISD::BFE_U32)
4612       Known.Zero = APInt::getHighBitsSet(32, 32 - Width);
4613 
4614     break;
4615   }
4616   case AMDGPUISD::FP_TO_FP16: {
4617     unsigned BitWidth = Known.getBitWidth();
4618 
4619     // High bits are zero.
4620     Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
4621     break;
4622   }
4623   case AMDGPUISD::MUL_U24:
4624   case AMDGPUISD::MUL_I24: {
4625     KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
4626     KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
4627     unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
4628                       RHSKnown.countMinTrailingZeros();
4629     Known.Zero.setLowBits(std::min(TrailZ, 32u));
4630     // Skip extra check if all bits are known zeros.
4631     if (TrailZ >= 32)
4632       break;
4633 
4634     // Truncate to 24 bits.
4635     LHSKnown = LHSKnown.trunc(24);
4636     RHSKnown = RHSKnown.trunc(24);
4637 
4638     if (Opc == AMDGPUISD::MUL_I24) {
4639       unsigned LHSValBits = LHSKnown.countMaxSignificantBits();
4640       unsigned RHSValBits = RHSKnown.countMaxSignificantBits();
4641       unsigned MaxValBits = LHSValBits + RHSValBits;
4642       if (MaxValBits > 32)
4643         break;
4644       unsigned SignBits = 32 - MaxValBits + 1;
4645       bool LHSNegative = LHSKnown.isNegative();
4646       bool LHSNonNegative = LHSKnown.isNonNegative();
4647       bool LHSPositive = LHSKnown.isStrictlyPositive();
4648       bool RHSNegative = RHSKnown.isNegative();
4649       bool RHSNonNegative = RHSKnown.isNonNegative();
4650       bool RHSPositive = RHSKnown.isStrictlyPositive();
4651 
4652       if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative))
4653         Known.Zero.setHighBits(SignBits);
4654       else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative))
4655         Known.One.setHighBits(SignBits);
4656     } else {
4657       unsigned LHSValBits = LHSKnown.countMaxActiveBits();
4658       unsigned RHSValBits = RHSKnown.countMaxActiveBits();
4659       unsigned MaxValBits = LHSValBits + RHSValBits;
4660       if (MaxValBits >= 32)
4661         break;
4662       Known.Zero.setBitsFrom(MaxValBits);
4663     }
4664     break;
4665   }
4666   case AMDGPUISD::PERM: {
4667     ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4668     if (!CMask)
4669       return;
4670 
4671     KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
4672     KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
4673     unsigned Sel = CMask->getZExtValue();
4674 
4675     for (unsigned I = 0; I < 32; I += 8) {
4676       unsigned SelBits = Sel & 0xff;
4677       if (SelBits < 4) {
4678         SelBits *= 8;
4679         Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
4680         Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
4681       } else if (SelBits < 7) {
4682         SelBits = (SelBits & 3) * 8;
4683         Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
4684         Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
4685       } else if (SelBits == 0x0c) {
4686         Known.Zero |= 0xFFull << I;
4687       } else if (SelBits > 0x0c) {
4688         Known.One |= 0xFFull << I;
4689       }
4690       Sel >>= 8;
4691     }
4692     break;
4693   }
4694   case AMDGPUISD::BUFFER_LOAD_UBYTE:  {
4695     Known.Zero.setHighBits(24);
4696     break;
4697   }
4698   case AMDGPUISD::BUFFER_LOAD_USHORT: {
4699     Known.Zero.setHighBits(16);
4700     break;
4701   }
4702   case AMDGPUISD::LDS: {
4703     auto GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());
4704     Align Alignment = GA->getGlobal()->getPointerAlignment(DAG.getDataLayout());
4705 
4706     Known.Zero.setHighBits(16);
4707     Known.Zero.setLowBits(Log2(Alignment));
4708     break;
4709   }
4710   case ISD::INTRINSIC_WO_CHAIN: {
4711     unsigned IID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
4712     switch (IID) {
4713     case Intrinsic::amdgcn_mbcnt_lo:
4714     case Intrinsic::amdgcn_mbcnt_hi: {
4715       const GCNSubtarget &ST =
4716           DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
4717       // These return at most the wavefront size - 1.
4718       unsigned Size = Op.getValueType().getSizeInBits();
4719       Known.Zero.setHighBits(Size - ST.getWavefrontSizeLog2());
4720       break;
4721     }
4722     default:
4723       break;
4724     }
4725   }
4726   }
4727 }
4728 
4729 unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
4730     SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
4731     unsigned Depth) const {
4732   switch (Op.getOpcode()) {
4733   case AMDGPUISD::BFE_I32: {
4734     ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4735     if (!Width)
4736       return 1;
4737 
4738     unsigned SignBits = 32 - Width->getZExtValue() + 1;
4739     if (!isNullConstant(Op.getOperand(1)))
4740       return SignBits;
4741 
4742     // TODO: Could probably figure something out with non-0 offsets.
4743     unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
4744     return std::max(SignBits, Op0SignBits);
4745   }
4746 
4747   case AMDGPUISD::BFE_U32: {
4748     ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4749     return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
4750   }
4751 
4752   case AMDGPUISD::CARRY:
4753   case AMDGPUISD::BORROW:
4754     return 31;
4755   case AMDGPUISD::BUFFER_LOAD_BYTE:
4756     return 25;
4757   case AMDGPUISD::BUFFER_LOAD_SHORT:
4758     return 17;
4759   case AMDGPUISD::BUFFER_LOAD_UBYTE:
4760     return 24;
4761   case AMDGPUISD::BUFFER_LOAD_USHORT:
4762     return 16;
4763   case AMDGPUISD::FP_TO_FP16:
4764     return 16;
4765   default:
4766     return 1;
4767   }
4768 }
4769 
4770 unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr(
4771   GISelKnownBits &Analysis, Register R,
4772   const APInt &DemandedElts, const MachineRegisterInfo &MRI,
4773   unsigned Depth) const {
4774   const MachineInstr *MI = MRI.getVRegDef(R);
4775   if (!MI)
4776     return 1;
4777 
4778   // TODO: Check range metadata on MMO.
4779   switch (MI->getOpcode()) {
4780   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
4781     return 25;
4782   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
4783     return 17;
4784   case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
4785     return 24;
4786   case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
4787     return 16;
4788   default:
4789     return 1;
4790   }
4791 }
4792 
4793 bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
4794                                                         const SelectionDAG &DAG,
4795                                                         bool SNaN,
4796                                                         unsigned Depth) const {
4797   unsigned Opcode = Op.getOpcode();
4798   switch (Opcode) {
4799   case AMDGPUISD::FMIN_LEGACY:
4800   case AMDGPUISD::FMAX_LEGACY: {
4801     if (SNaN)
4802       return true;
4803 
4804     // TODO: Can check no nans on one of the operands for each one, but which
4805     // one?
4806     return false;
4807   }
4808   case AMDGPUISD::FMUL_LEGACY:
4809   case AMDGPUISD::CVT_PKRTZ_F16_F32: {
4810     if (SNaN)
4811       return true;
4812     return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
4813            DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
4814   }
4815   case AMDGPUISD::FMED3:
4816   case AMDGPUISD::FMIN3:
4817   case AMDGPUISD::FMAX3:
4818   case AMDGPUISD::FMAD_FTZ: {
4819     if (SNaN)
4820       return true;
4821     return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
4822            DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
4823            DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
4824   }
4825   case AMDGPUISD::CVT_F32_UBYTE0:
4826   case AMDGPUISD::CVT_F32_UBYTE1:
4827   case AMDGPUISD::CVT_F32_UBYTE2:
4828   case AMDGPUISD::CVT_F32_UBYTE3:
4829     return true;
4830 
4831   case AMDGPUISD::RCP:
4832   case AMDGPUISD::RSQ:
4833   case AMDGPUISD::RCP_LEGACY:
4834   case AMDGPUISD::RSQ_CLAMP: {
4835     if (SNaN)
4836       return true;
4837 
4838     // TODO: Need is known positive check.
4839     return false;
4840   }
4841   case AMDGPUISD::LDEXP:
4842   case AMDGPUISD::FRACT: {
4843     if (SNaN)
4844       return true;
4845     return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
4846   }
4847   case AMDGPUISD::DIV_SCALE:
4848   case AMDGPUISD::DIV_FMAS:
4849   case AMDGPUISD::DIV_FIXUP:
4850     // TODO: Refine on operands.
4851     return SNaN;
4852   case AMDGPUISD::SIN_HW:
4853   case AMDGPUISD::COS_HW: {
4854     // TODO: Need check for infinity
4855     return SNaN;
4856   }
4857   case ISD::INTRINSIC_WO_CHAIN: {
4858     unsigned IntrinsicID
4859       = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
4860     // TODO: Handle more intrinsics
4861     switch (IntrinsicID) {
4862     case Intrinsic::amdgcn_cubeid:
4863       return true;
4864 
4865     case Intrinsic::amdgcn_frexp_mant: {
4866       if (SNaN)
4867         return true;
4868       return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
4869     }
4870     case Intrinsic::amdgcn_cvt_pkrtz: {
4871       if (SNaN)
4872         return true;
4873       return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
4874              DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
4875     }
4876     case Intrinsic::amdgcn_rcp:
4877     case Intrinsic::amdgcn_rsq:
4878     case Intrinsic::amdgcn_rcp_legacy:
4879     case Intrinsic::amdgcn_rsq_legacy:
4880     case Intrinsic::amdgcn_rsq_clamp: {
4881       if (SNaN)
4882         return true;
4883 
4884       // TODO: Need is known positive check.
4885       return false;
4886     }
4887     case Intrinsic::amdgcn_trig_preop:
4888     case Intrinsic::amdgcn_fdot2:
4889       // TODO: Refine on operand
4890       return SNaN;
4891     case Intrinsic::amdgcn_fma_legacy:
4892       if (SNaN)
4893         return true;
4894       return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
4895              DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1) &&
4896              DAG.isKnownNeverNaN(Op.getOperand(3), SNaN, Depth + 1);
4897     default:
4898       return false;
4899     }
4900   }
4901   default:
4902     return false;
4903   }
4904 }
4905 
4906 TargetLowering::AtomicExpansionKind
4907 AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
4908   switch (RMW->getOperation()) {
4909   case AtomicRMWInst::Nand:
4910   case AtomicRMWInst::FAdd:
4911   case AtomicRMWInst::FSub:
4912     return AtomicExpansionKind::CmpXChg;
4913   default:
4914     return AtomicExpansionKind::None;
4915   }
4916 }
4917 
4918 bool AMDGPUTargetLowering::isConstantUnsignedBitfieldExtractLegal(
4919     unsigned Opc, LLT Ty1, LLT Ty2) const {
4920   return (Ty1 == LLT::scalar(32) || Ty1 == LLT::scalar(64)) &&
4921          Ty2 == LLT::scalar(32);
4922 }
4923