1 //===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This is the parent TargetLowering class for hardware code gen
11 /// targets.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "AMDGPUISelLowering.h"
16 #include "AMDGPU.h"
17 #include "AMDGPUInstrInfo.h"
18 #include "AMDGPUMachineFunction.h"
19 #include "GCNSubtarget.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "llvm/CodeGen/Analysis.h"
22 #include "llvm/CodeGen/MachineFrameInfo.h"
23 #include "llvm/IR/DiagnosticInfo.h"
24 #include "llvm/IR/IntrinsicsAMDGPU.h"
25 #include "llvm/Support/CommandLine.h"
26 #include "llvm/Support/KnownBits.h"
27 #include "llvm/Target/TargetMachine.h"
28 
29 using namespace llvm;
30 
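// Pull in the TableGen-generated calling convention functions (CC_AMDGPU,
// CC_AMDGPU_Func, RetCC_SI_Shader, etc.) that are referenced below.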
31 #include "AMDGPUGenCallingConv.inc"
32 
33 static cl::opt<bool> AMDGPUBypassSlowDiv(
34   "amdgpu-bypass-slow-div",
35   cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
36   cl::init(true));
37 
38 // Find a larger type to do a load / store of a vector with.
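// For example, a 64-bit v4f16 is handled as v2i32 and a 96-bit v3f32 as v3i32.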
39 EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
40   unsigned StoreSize = VT.getStoreSizeInBits();
41   if (StoreSize <= 32)
42     return EVT::getIntegerVT(Ctx, StoreSize);
43 
44   assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
45   return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
46 }
47 
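// Return an upper bound on the number of bits needed to represent Op as an
// unsigned value, based on known bits.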
48 unsigned AMDGPUTargetLowering::numBitsUnsigned(SDValue Op, SelectionDAG &DAG) {
49   return DAG.computeKnownBits(Op).countMaxActiveBits();
50 }
51 
52 unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
  // In order for this to be a signed 24-bit value, bit 23 must be
  // a sign bit.
55   return DAG.ComputeMaxSignificantBits(Op);
56 }
57 
58 AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
59                                            const AMDGPUSubtarget &STI)
60     : TargetLowering(TM), Subtarget(&STI) {
61   // Lower floating point store/load to integer store/load to reduce the number
62   // of patterns in tablegen.
63   setOperationAction(ISD::LOAD, MVT::f32, Promote);
64   AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
65 
66   setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
67   AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
68 
69   setOperationAction(ISD::LOAD, MVT::v3f32, Promote);
70   AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);
71 
72   setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
73   AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
74 
75   setOperationAction(ISD::LOAD, MVT::v5f32, Promote);
76   AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);
77 
78   setOperationAction(ISD::LOAD, MVT::v6f32, Promote);
79   AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32);
80 
81   setOperationAction(ISD::LOAD, MVT::v7f32, Promote);
82   AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32);
83 
84   setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
85   AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
86 
87   setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
88   AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
89 
90   setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
91   AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);
92 
93   setOperationAction(ISD::LOAD, MVT::i64, Promote);
94   AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
95 
96   setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
97   AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
98 
99   setOperationAction(ISD::LOAD, MVT::f64, Promote);
100   AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
101 
102   setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
103   AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
104 
105   setOperationAction(ISD::LOAD, MVT::v3i64, Promote);
106   AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32);
107 
108   setOperationAction(ISD::LOAD, MVT::v4i64, Promote);
109   AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);
110 
111   setOperationAction(ISD::LOAD, MVT::v3f64, Promote);
112   AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32);
113 
114   setOperationAction(ISD::LOAD, MVT::v4f64, Promote);
115   AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);
116 
117   setOperationAction(ISD::LOAD, MVT::v8i64, Promote);
118   AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32);
119 
120   setOperationAction(ISD::LOAD, MVT::v8f64, Promote);
121   AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32);
122 
123   setOperationAction(ISD::LOAD, MVT::v16i64, Promote);
124   AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32);
125 
126   setOperationAction(ISD::LOAD, MVT::v16f64, Promote);
127   AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32);
128 
129   // There are no 64-bit extloads. These should be done as a 32-bit extload and
130   // an extension to 64-bit.
131   for (MVT VT : MVT::integer_valuetypes())
132     setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::i64, VT,
133                      Expand);
134 
135   for (MVT VT : MVT::integer_valuetypes()) {
136     if (VT == MVT::i64)
137       continue;
138 
139     for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) {
140       setLoadExtAction(Op, VT, MVT::i1, Promote);
141       setLoadExtAction(Op, VT, MVT::i8, Legal);
142       setLoadExtAction(Op, VT, MVT::i16, Legal);
143       setLoadExtAction(Op, VT, MVT::i32, Expand);
144     }
145   }
146 
147   for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
148     for (auto MemVT :
149          {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16})
150       setLoadExtAction({ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}, VT, MemVT,
151                        Expand);
152 
153   setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
154   setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
155   setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
156   setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
157   setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
158   setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
159   setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
160 
161   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
162   setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
163   setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand);
164   setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
165   setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
166   setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);
167 
168   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
169   setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
170   setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
171   setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
172   setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
173   setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
174 
175   setOperationAction(ISD::STORE, MVT::f32, Promote);
176   AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
177 
178   setOperationAction(ISD::STORE, MVT::v2f32, Promote);
179   AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
180 
181   setOperationAction(ISD::STORE, MVT::v3f32, Promote);
182   AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);
183 
184   setOperationAction(ISD::STORE, MVT::v4f32, Promote);
185   AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
186 
187   setOperationAction(ISD::STORE, MVT::v5f32, Promote);
188   AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);
189 
190   setOperationAction(ISD::STORE, MVT::v6f32, Promote);
191   AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32);
192 
193   setOperationAction(ISD::STORE, MVT::v7f32, Promote);
194   AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32);
195 
196   setOperationAction(ISD::STORE, MVT::v8f32, Promote);
197   AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
198 
199   setOperationAction(ISD::STORE, MVT::v16f32, Promote);
200   AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
201 
202   setOperationAction(ISD::STORE, MVT::v32f32, Promote);
203   AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);
204 
205   setOperationAction(ISD::STORE, MVT::i64, Promote);
206   AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
207 
208   setOperationAction(ISD::STORE, MVT::v2i64, Promote);
209   AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
210 
211   setOperationAction(ISD::STORE, MVT::f64, Promote);
212   AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
213 
214   setOperationAction(ISD::STORE, MVT::v2f64, Promote);
215   AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
216 
217   setOperationAction(ISD::STORE, MVT::v3i64, Promote);
218   AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32);
219 
220   setOperationAction(ISD::STORE, MVT::v3f64, Promote);
221   AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32);
222 
223   setOperationAction(ISD::STORE, MVT::v4i64, Promote);
224   AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);
225 
226   setOperationAction(ISD::STORE, MVT::v4f64, Promote);
227   AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32);
228 
229   setOperationAction(ISD::STORE, MVT::v8i64, Promote);
230   AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32);
231 
232   setOperationAction(ISD::STORE, MVT::v8f64, Promote);
233   AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32);
234 
235   setOperationAction(ISD::STORE, MVT::v16i64, Promote);
236   AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32);
237 
238   setOperationAction(ISD::STORE, MVT::v16f64, Promote);
239   AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32);
240 
241   setTruncStoreAction(MVT::i64, MVT::i1, Expand);
242   setTruncStoreAction(MVT::i64, MVT::i8, Expand);
243   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
244   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
245 
246   setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
247   setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
248   setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
249   setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
250 
251   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
252   setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
253   setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
254   setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
255   setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
256   setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
257   setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);
258 
259   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
260   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
261 
262   setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
263   setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
264 
265   setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
266   setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
267   setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
268   setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);
269 
270   setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
271   setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
272   setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
273   setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
274 
275   setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
276   setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
277 
278   setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
279   setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
  setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
  setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
  setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);
285 
286   setOperationAction(ISD::Constant, {MVT::i32, MVT::i64}, Legal);
287   setOperationAction(ISD::ConstantFP, {MVT::f32, MVT::f64}, Legal);
288 
289   setOperationAction({ISD::BR_JT, ISD::BRIND}, MVT::Other, Expand);
290 
291   // This is totally unsupported, just custom lower to produce an error.
292   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
293 
294   // Library functions.  These default to Expand, but we have instructions
295   // for them.
296   setOperationAction({ISD::FCEIL, ISD::FEXP2, ISD::FPOW, ISD::FLOG2, ISD::FABS,
297                       ISD::FFLOOR, ISD::FRINT, ISD::FTRUNC, ISD::FMINNUM,
298                       ISD::FMAXNUM},
299                      MVT::f32, Legal);
300 
301   setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);
302 
303   setOperationAction({ISD::FLOG, ISD::FLOG10, ISD::FEXP}, MVT::f32, Custom);
304 
305   setOperationAction(ISD::FNEARBYINT, {MVT::f32, MVT::f64}, Custom);
306 
307   setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Custom);
308 
309   // Expand to fneg + fadd.
310   setOperationAction(ISD::FSUB, MVT::f64, Expand);
311 
312   setOperationAction(ISD::CONCAT_VECTORS,
313                      {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32,
314                       MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
315                       MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32},
316                      Custom);
317   setOperationAction(ISD::EXTRACT_SUBVECTOR,
318                      {MVT::v2f16,  MVT::v2i16,  MVT::v4f16,  MVT::v4i16,
319                       MVT::v2f32,  MVT::v2i32,  MVT::v3f32,  MVT::v3i32,
320                       MVT::v4f32,  MVT::v4i32,  MVT::v5f32,  MVT::v5i32,
321                       MVT::v6f32,  MVT::v6i32,  MVT::v7f32,  MVT::v7i32,
322                       MVT::v8f32,  MVT::v8i32,  MVT::v16f32, MVT::v16i32,
323                       MVT::v32f32, MVT::v32i32, MVT::v2f64,  MVT::v2i64,
324                       MVT::v3f64,  MVT::v3i64,  MVT::v4f64,  MVT::v4i64,
325                       MVT::v8f64,  MVT::v8i64,  MVT::v16f64, MVT::v16i64},
326                      Custom);
327 
328   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
329   setOperationAction(ISD::FP_TO_FP16, {MVT::f64, MVT::f32}, Custom);
330 
331   const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
332   for (MVT VT : ScalarIntVTs) {
333     // These should use [SU]DIVREM, so set them to expand
334     setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SREM, ISD::UREM}, VT,
335                        Expand);
336 
    // The GPU does not have a divrem function for signed or unsigned division.
338     setOperationAction({ISD::SDIVREM, ISD::UDIVREM}, VT, Custom);
339 
340     // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
341     setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, VT, Expand);
342 
343     setOperationAction({ISD::BSWAP, ISD::CTTZ, ISD::CTLZ}, VT, Expand);
344 
345     // AMDGPU uses ADDC/SUBC/ADDE/SUBE
346     setOperationAction({ISD::ADDC, ISD::SUBC, ISD::ADDE, ISD::SUBE}, VT, Legal);
347   }
348 
349   // The hardware supports 32-bit FSHR, but not FSHL.
350   setOperationAction(ISD::FSHR, MVT::i32, Legal);
351 
352   // The hardware supports 32-bit ROTR, but not ROTL.
353   setOperationAction(ISD::ROTL, {MVT::i32, MVT::i64}, Expand);
354   setOperationAction(ISD::ROTR, MVT::i64, Expand);
355 
356   setOperationAction({ISD::MULHU, ISD::MULHS}, MVT::i16, Expand);
357 
358   setOperationAction({ISD::MUL, ISD::MULHU, ISD::MULHS}, MVT::i64, Expand);
359   setOperationAction(
360       {ISD::UINT_TO_FP, ISD::SINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT},
361       MVT::i64, Custom);
362   setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
363 
364   setOperationAction({ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX}, MVT::i32,
365                      Legal);
366 
367   setOperationAction(
368       {ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF},
369       MVT::i64, Custom);
370 
371   static const MVT::SimpleValueType VectorIntTypes[] = {
372       MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32};
373 
374   for (MVT VT : VectorIntTypes) {
375     // Expand the following operations for the current type by default.
376     setOperationAction({ISD::ADD,        ISD::AND,     ISD::FP_TO_SINT,
377                         ISD::FP_TO_UINT, ISD::MUL,     ISD::MULHU,
378                         ISD::MULHS,      ISD::OR,      ISD::SHL,
379                         ISD::SRA,        ISD::SRL,     ISD::ROTL,
380                         ISD::ROTR,       ISD::SUB,     ISD::SINT_TO_FP,
381                         ISD::UINT_TO_FP, ISD::SDIV,    ISD::UDIV,
382                         ISD::SREM,       ISD::UREM,    ISD::SMUL_LOHI,
383                         ISD::UMUL_LOHI,  ISD::SDIVREM, ISD::UDIVREM,
384                         ISD::SELECT,     ISD::VSELECT, ISD::SELECT_CC,
385                         ISD::XOR,        ISD::BSWAP,   ISD::CTPOP,
386                         ISD::CTTZ,       ISD::CTLZ,    ISD::VECTOR_SHUFFLE,
387                         ISD::SETCC},
388                        VT, Expand);
389   }
390 
391   static const MVT::SimpleValueType FloatVectorTypes[] = {
392       MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32};
393 
394   for (MVT VT : FloatVectorTypes) {
395     setOperationAction(
396         {ISD::FABS,    ISD::FMINNUM,      ISD::FMAXNUM,   ISD::FADD,
397          ISD::FCEIL,   ISD::FCOS,         ISD::FDIV,      ISD::FEXP2,
398          ISD::FEXP,    ISD::FLOG2,        ISD::FREM,      ISD::FLOG,
399          ISD::FLOG10,  ISD::FPOW,         ISD::FFLOOR,    ISD::FTRUNC,
400          ISD::FMUL,    ISD::FMA,          ISD::FRINT,     ISD::FNEARBYINT,
401          ISD::FSQRT,   ISD::FSIN,         ISD::FSUB,      ISD::FNEG,
402          ISD::VSELECT, ISD::SELECT_CC,    ISD::FCOPYSIGN, ISD::VECTOR_SHUFFLE,
403          ISD::SETCC,   ISD::FCANONICALIZE},
404         VT, Expand);
405   }
406 
407   // This causes using an unrolled select operation rather than expansion with
408   // bit operations. This is in general better, but the alternative using BFI
409   // instructions may be better if the select sources are SGPRs.
410   setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
411   AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
412 
413   setOperationAction(ISD::SELECT, MVT::v3f32, Promote);
414   AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);
415 
416   setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
417   AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
418 
419   setOperationAction(ISD::SELECT, MVT::v5f32, Promote);
420   AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);
421 
422   setOperationAction(ISD::SELECT, MVT::v6f32, Promote);
423   AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32);
424 
425   setOperationAction(ISD::SELECT, MVT::v7f32, Promote);
426   AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);
427 
428   // There are no libcalls of any kind.
429   for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
430     setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
431 
432   setSchedulingPreference(Sched::RegPressure);
433   setJumpIsExpensive(true);
434 
435   // FIXME: This is only partially true. If we have to do vector compares, any
436   // SGPR pair can be a condition register. If we have a uniform condition, we
437   // are better off doing SALU operations, where there is only one SCC. For now,
438   // we don't have a way of knowing during instruction selection if a condition
439   // will be uniform and we always use vector compares. Assume we are using
440   // vector compares until that is fixed.
441   setHasMultipleConditionRegisters(true);
442 
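  // cmpxchg narrower than 32 bits is expanded to operate on a full dword, and
  // unaligned atomic accesses are not supported.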
443   setMinCmpXchgSizeInBits(32);
444   setSupportsUnalignedAtomics(false);
445 
446   PredictableSelectIsExpensive = false;
447 
448   // We want to find all load dependencies for long chains of stores to enable
449   // merging into very wide vectors. The problem is with vectors with > 4
450   // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
451   // vectors are a legal type, even though we have to split the loads
452   // usually. When we can more precisely specify load legality per address
453   // space, we should be able to make FindBetterChain/MergeConsecutiveStores
454   // smarter so that they can figure out what to do in 2 iterations without all
455   // N > 4 stores on the same chain.
456   GatherAllAliasesMaxDepth = 16;
457 
458   // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
459   // about these during lowering.
460   MaxStoresPerMemcpy  = 0xffffffff;
461   MaxStoresPerMemmove = 0xffffffff;
462   MaxStoresPerMemset  = 0xffffffff;
463 
464   // The expansion for 64-bit division is enormous.
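  // When enabled, emit a runtime check and use 32-bit division if both
  // operands are dynamically found to fit in 32 bits.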
465   if (AMDGPUBypassSlowDiv)
466     addBypassSlowDiv(64, 32);
467 
468   setTargetDAGCombine({ISD::BITCAST,    ISD::SHL,
469                        ISD::SRA,        ISD::SRL,
470                        ISD::TRUNCATE,   ISD::MUL,
471                        ISD::SMUL_LOHI,  ISD::UMUL_LOHI,
472                        ISD::MULHU,      ISD::MULHS,
473                        ISD::SELECT,     ISD::SELECT_CC,
474                        ISD::STORE,      ISD::FADD,
475                        ISD::FSUB,       ISD::FNEG,
476                        ISD::FABS,       ISD::AssertZext,
477                        ISD::AssertSext, ISD::INTRINSIC_WO_CHAIN});
478 }
479 
480 bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {
481   if (getTargetMachine().Options.NoSignedZerosFPMath)
482     return true;
483 
484   const auto Flags = Op.getNode()->getFlags();
485   if (Flags.hasNoSignedZeros())
486     return true;
487 
488   return false;
489 }
490 
491 //===----------------------------------------------------------------------===//
492 // Target Information
493 //===----------------------------------------------------------------------===//
494 
495 LLVM_READNONE
496 static bool fnegFoldsIntoOp(unsigned Opc) {
497   switch (Opc) {
498   case ISD::FADD:
499   case ISD::FSUB:
500   case ISD::FMUL:
501   case ISD::FMA:
502   case ISD::FMAD:
503   case ISD::FMINNUM:
504   case ISD::FMAXNUM:
505   case ISD::FMINNUM_IEEE:
506   case ISD::FMAXNUM_IEEE:
507   case ISD::FSIN:
508   case ISD::FTRUNC:
509   case ISD::FRINT:
510   case ISD::FNEARBYINT:
511   case ISD::FCANONICALIZE:
512   case AMDGPUISD::RCP:
513   case AMDGPUISD::RCP_LEGACY:
514   case AMDGPUISD::RCP_IFLAG:
515   case AMDGPUISD::SIN_HW:
516   case AMDGPUISD::FMUL_LEGACY:
517   case AMDGPUISD::FMIN_LEGACY:
518   case AMDGPUISD::FMAX_LEGACY:
519   case AMDGPUISD::FMED3:
520     // TODO: handle llvm.amdgcn.fma.legacy
521     return true;
522   default:
523     return false;
524   }
525 }
526 
/// \returns true if the operation will definitely need to use a 64-bit
528 /// encoding, and thus will use a VOP3 encoding regardless of the source
529 /// modifiers.
530 LLVM_READONLY
531 static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
532   return N->getNumOperands() > 2 || VT == MVT::f64;
533 }
534 
535 // Most FP instructions support source modifiers, but this could be refined
536 // slightly.
537 LLVM_READONLY
538 static bool hasSourceMods(const SDNode *N) {
539   if (isa<MemSDNode>(N))
540     return false;
541 
542   switch (N->getOpcode()) {
543   case ISD::CopyToReg:
544   case ISD::SELECT:
545   case ISD::FDIV:
546   case ISD::FREM:
547   case ISD::INLINEASM:
548   case ISD::INLINEASM_BR:
549   case AMDGPUISD::DIV_SCALE:
550   case ISD::INTRINSIC_W_CHAIN:
551 
552   // TODO: Should really be looking at the users of the bitcast. These are
553   // problematic because bitcasts are used to legalize all stores to integer
554   // types.
555   case ISD::BITCAST:
556     return false;
557   case ISD::INTRINSIC_WO_CHAIN: {
558     switch (cast<ConstantSDNode>(N->getOperand(0))->getZExtValue()) {
559     case Intrinsic::amdgcn_interp_p1:
560     case Intrinsic::amdgcn_interp_p2:
561     case Intrinsic::amdgcn_interp_mov:
562     case Intrinsic::amdgcn_interp_p1_f16:
563     case Intrinsic::amdgcn_interp_p2_f16:
564       return false;
565     default:
566       return true;
567     }
568   }
569   default:
570     return true;
571   }
572 }
573 
574 bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
575                                                  unsigned CostThreshold) {
  // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
  // it is truly free to use a source modifier in all cases. If there are
  // multiple users, and using a source modifier forces each of them into a
  // VOP3 encoding, there will be a code size increase. Try to avoid increasing
  // code size unless we know it will save on the instruction count.
581   unsigned NumMayIncreaseSize = 0;
582   MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
583 
584   // XXX - Should this limit number of uses to check?
585   for (const SDNode *U : N->uses()) {
586     if (!hasSourceMods(U))
587       return false;
588 
589     if (!opMustUseVOP3Encoding(U, VT)) {
590       if (++NumMayIncreaseSize > CostThreshold)
591         return false;
592     }
593   }
594 
595   return true;
596 }
597 
598 EVT AMDGPUTargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
599                                               ISD::NodeType ExtendKind) const {
600   assert(!VT.isVector() && "only scalar expected");
601 
602   // Round to the next multiple of 32-bits.
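  // For example, i1 and i24 are returned in an i32, and i48 in an i64.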
603   unsigned Size = VT.getSizeInBits();
604   if (Size <= 32)
605     return MVT::i32;
606   return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
607 }
608 
609 MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
610   return MVT::i32;
611 }
612 
613 bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
614   return true;
615 }
616 
617 // The backend supports 32 and 64 bit floating point immediates.
618 // FIXME: Why are we reporting vectors of FP immediates as legal?
619 bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
620                                         bool ForCodeSize) const {
621   EVT ScalarVT = VT.getScalarType();
622   return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
623          (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
624 }
625 
626 // We don't want to shrink f64 / f32 constants.
627 bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
628   EVT ScalarVT = VT.getScalarType();
629   return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
630 }
631 
632 bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
633                                                  ISD::LoadExtType ExtTy,
634                                                  EVT NewVT) const {
635   // TODO: This may be worth removing. Check regression tests for diffs.
636   if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT))
637     return false;
638 
639   unsigned NewSize = NewVT.getStoreSizeInBits();
640 
641   // If we are reducing to a 32-bit load or a smaller multi-dword load,
642   // this is always better.
643   if (NewSize >= 32)
644     return true;
645 
646   EVT OldVT = N->getValueType(0);
647   unsigned OldSize = OldVT.getStoreSizeInBits();
648 
649   MemSDNode *MN = cast<MemSDNode>(N);
650   unsigned AS = MN->getAddressSpace();
651   // Do not shrink an aligned scalar load to sub-dword.
652   // Scalar engine cannot do sub-dword loads.
653   if (OldSize >= 32 && NewSize < 32 && MN->getAlignment() >= 4 &&
654       (AS == AMDGPUAS::CONSTANT_ADDRESS ||
655        AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
656        (isa<LoadSDNode>(N) &&
657         AS == AMDGPUAS::GLOBAL_ADDRESS && MN->isInvariant())) &&
658       AMDGPUInstrInfo::isUniformMMO(MN->getMemOperand()))
659     return false;
660 
661   // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
662   // extloads, so doing one requires using a buffer_load. In cases where we
663   // still couldn't use a scalar load, using the wider load shouldn't really
664   // hurt anything.
665 
666   // If the old size already had to be an extload, there's no harm in continuing
667   // to reduce the width.
668   return (OldSize < 32);
669 }
670 
671 bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy,
672                                                    const SelectionDAG &DAG,
673                                                    const MachineMemOperand &MMO) const {
674 
675   assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
676 
677   if (LoadTy.getScalarType() == MVT::i32)
678     return false;
679 
680   unsigned LScalarSize = LoadTy.getScalarSizeInBits();
681   unsigned CastScalarSize = CastTy.getScalarSizeInBits();
682 
683   if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
684     return false;
685 
686   bool Fast = false;
687   return allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
688                                         CastTy, MMO, &Fast) &&
689          Fast;
690 }
691 
692 // SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
693 // profitable with the expansion for 64-bit since it's generally good to
694 // speculate things.
695 // FIXME: These should really have the size as a parameter.
696 bool AMDGPUTargetLowering::isCheapToSpeculateCttz() const {
697   return true;
698 }
699 
700 bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const {
701   return true;
702 }
703 
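// Return true for nodes whose result is known to be uniform, i.e. identical in
// every lane of the wave.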
704 bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode *N) const {
705   switch (N->getOpcode()) {
706   case ISD::EntryToken:
707   case ISD::TokenFactor:
708     return true;
709   case ISD::INTRINSIC_WO_CHAIN: {
710     unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
711     switch (IntrID) {
712     case Intrinsic::amdgcn_readfirstlane:
713     case Intrinsic::amdgcn_readlane:
714       return true;
715     }
716     return false;
717   }
718   case ISD::LOAD:
719     if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
720         AMDGPUAS::CONSTANT_ADDRESS_32BIT)
721       return true;
722     return false;
723   case AMDGPUISD::SETCC: // ballot-style instruction
724     return true;
725   }
726   return false;
727 }
728 
729 SDValue AMDGPUTargetLowering::getNegatedExpression(
730     SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
731     NegatibleCost &Cost, unsigned Depth) const {
732 
733   switch (Op.getOpcode()) {
734   case ISD::FMA:
735   case ISD::FMAD: {
736     // Negating a fma is not free if it has users without source mods.
737     if (!allUsesHaveSourceMods(Op.getNode()))
738       return SDValue();
739     break;
740   }
741   default:
742     break;
743   }
744 
745   return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
746                                               ForCodeSize, Cost, Depth);
747 }
748 
749 //===---------------------------------------------------------------------===//
750 // Target Properties
751 //===---------------------------------------------------------------------===//
752 
753 bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
754   assert(VT.isFloatingPoint());
755 
756   // Packed operations do not have a fabs modifier.
757   return VT == MVT::f32 || VT == MVT::f64 ||
758          (Subtarget->has16BitInsts() && VT == MVT::f16);
759 }
760 
761 bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
762   assert(VT.isFloatingPoint());
763   // Report this based on the end legalized type.
764   VT = VT.getScalarType();
765   return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16;
766 }
767 
bool AMDGPUTargetLowering::storeOfVectorConstantIsCheap(EVT MemVT,
                                                        unsigned NumElem,
                                                        unsigned AS) const {
771   return true;
772 }
773 
774 bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
775   // There are few operations which truly have vector input operands. Any vector
776   // operation is going to involve operations on each component, and a
777   // build_vector will be a copy per element, so it always makes sense to use a
778   // build_vector input in place of the extracted element to avoid a copy into a
779   // super register.
780   //
781   // We should probably only do this if all users are extracts only, but this
782   // should be the common case.
783   return true;
784 }
785 
786 bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
787   // Truncate is just accessing a subregister.
788 
789   unsigned SrcSize = Source.getSizeInBits();
790   unsigned DestSize = Dest.getSizeInBits();
791 
  return DestSize < SrcSize && DestSize % 32 == 0;
793 }
794 
795 bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
796   // Truncate is just accessing a subregister.
797 
798   unsigned SrcSize = Source->getScalarSizeInBits();
799   unsigned DestSize = Dest->getScalarSizeInBits();
800 
  if (DestSize == 16 && Subtarget->has16BitInsts())
802     return SrcSize >= 32;
803 
804   return DestSize < SrcSize && DestSize % 32 == 0;
805 }
806 
807 bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
808   unsigned SrcSize = Src->getScalarSizeInBits();
809   unsigned DestSize = Dest->getScalarSizeInBits();
810 
811   if (SrcSize == 16 && Subtarget->has16BitInsts())
812     return DestSize >= 32;
813 
814   return SrcSize == 32 && DestSize == 64;
815 }
816 
817 bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
  // Any register load of a 64-bit value really requires 2 32-bit moves. For all
  // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
  // this will enable reducing 64-bit operations to 32-bit, which is always
  // good.
822 
823   if (Src == MVT::i16)
    return Dest == MVT::i32 || Dest == MVT::i64;
825 
826   return Src == MVT::i32 && Dest == MVT::i64;
827 }
828 
829 bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
830   return isZExtFree(Val.getValueType(), VT2);
831 }
832 
833 bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
834   // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
835   // limited number of native 64-bit operations. Shrinking an operation to fit
836   // in a single 32-bit register should always be helpful. As currently used,
837   // this is much less general than the name suggests, and is only used in
838   // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
839   // not profitable, and may actually be harmful.
840   return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
841 }
842 
843 //===---------------------------------------------------------------------===//
844 // TargetLowering Callbacks
845 //===---------------------------------------------------------------------===//
846 
847 CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
848                                                   bool IsVarArg) {
849   switch (CC) {
850   case CallingConv::AMDGPU_VS:
851   case CallingConv::AMDGPU_GS:
852   case CallingConv::AMDGPU_PS:
853   case CallingConv::AMDGPU_CS:
854   case CallingConv::AMDGPU_HS:
855   case CallingConv::AMDGPU_ES:
856   case CallingConv::AMDGPU_LS:
857     return CC_AMDGPU;
858   case CallingConv::C:
859   case CallingConv::Fast:
860   case CallingConv::Cold:
861     return CC_AMDGPU_Func;
862   case CallingConv::AMDGPU_Gfx:
863     return CC_SI_Gfx;
864   case CallingConv::AMDGPU_KERNEL:
865   case CallingConv::SPIR_KERNEL:
866   default:
867     report_fatal_error("Unsupported calling convention for call");
868   }
869 }
870 
871 CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
872                                                     bool IsVarArg) {
873   switch (CC) {
874   case CallingConv::AMDGPU_KERNEL:
875   case CallingConv::SPIR_KERNEL:
876     llvm_unreachable("kernels should not be handled here");
877   case CallingConv::AMDGPU_VS:
878   case CallingConv::AMDGPU_GS:
879   case CallingConv::AMDGPU_PS:
880   case CallingConv::AMDGPU_CS:
881   case CallingConv::AMDGPU_HS:
882   case CallingConv::AMDGPU_ES:
883   case CallingConv::AMDGPU_LS:
884     return RetCC_SI_Shader;
885   case CallingConv::AMDGPU_Gfx:
886     return RetCC_SI_Gfx;
887   case CallingConv::C:
888   case CallingConv::Fast:
889   case CallingConv::Cold:
890     return RetCC_AMDGPU_Func;
891   default:
892     report_fatal_error("Unsupported calling convention.");
893   }
894 }
895 
/// The SelectionDAGBuilder will automatically promote function arguments
/// with illegal types.  However, this does not work for the AMDGPU targets
/// since the function arguments are stored in memory as these illegal types.
/// In order to handle this properly we need to get the original type sizes
/// from the LLVM IR Function and fixup the ISD::InputArg values before
/// passing them to AnalyzeFormalArguments().
902 
903 /// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
904 /// input values across multiple registers.  Each item in the Ins array
905 /// represents a single value that will be stored in registers.  Ins[x].VT is
906 /// the value type of the value that will be stored in the register, so
907 /// whatever SDNode we lower the argument to needs to be this type.
908 ///
909 /// In order to correctly lower the arguments we need to know the size of each
910 /// argument.  Since Ins[x].VT gives us the size of the register that will
911 /// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
912 /// for the original function argument so that we can deduce the correct memory
913 /// type to use for Ins[x].  In most cases the correct memory type will be
914 /// Ins[x].ArgVT.  However, this will not always be the case.  If, for example,
915 /// we have a kernel argument of type v8i8, this argument will be split into
916 /// 8 parts and each part will be represented by its own item in the Ins array.
917 /// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
918 /// the argument before it was split.  From this, we deduce that the memory type
919 /// for each individual part is i8.  We pass the memory type as LocVT to the
920 /// calling convention analysis function and the register type (Ins[x].VT) as
921 /// the ValVT.
922 void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
923   CCState &State,
924   const SmallVectorImpl<ISD::InputArg> &Ins) const {
925   const MachineFunction &MF = State.getMachineFunction();
926   const Function &Fn = MF.getFunction();
927   LLVMContext &Ctx = Fn.getParent()->getContext();
928   const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
929   const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset(Fn);
930   CallingConv::ID CC = Fn.getCallingConv();
931 
932   Align MaxAlign = Align(1);
933   uint64_t ExplicitArgOffset = 0;
934   const DataLayout &DL = Fn.getParent()->getDataLayout();
935 
936   unsigned InIndex = 0;
937 
938   for (const Argument &Arg : Fn.args()) {
939     const bool IsByRef = Arg.hasByRefAttr();
940     Type *BaseArgTy = Arg.getType();
941     Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
942     MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
943     if (!Alignment)
944       Alignment = DL.getABITypeAlign(MemArgTy);
945     MaxAlign = max(Alignment, MaxAlign);
946     uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);
947 
948     uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
949     ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;
950 
951     // We're basically throwing away everything passed into us and starting over
952     // to get accurate in-memory offsets. The "PartOffset" is completely useless
953     // to us as computed in Ins.
954     //
955     // We also need to figure out what type legalization is trying to do to get
956     // the correct memory offsets.
957 
958     SmallVector<EVT, 16> ValueVTs;
959     SmallVector<uint64_t, 16> Offsets;
960     ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, &Offsets, ArgOffset);
961 
962     for (unsigned Value = 0, NumValues = ValueVTs.size();
963          Value != NumValues; ++Value) {
964       uint64_t BasePartOffset = Offsets[Value];
965 
966       EVT ArgVT = ValueVTs[Value];
967       EVT MemVT = ArgVT;
968       MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
969       unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);
970 
971       if (NumRegs == 1) {
972         // This argument is not split, so the IR type is the memory type.
973         if (ArgVT.isExtended()) {
974           // We have an extended type, like i24, so we should just use the
975           // register type.
976           MemVT = RegisterVT;
977         } else {
978           MemVT = ArgVT;
979         }
980       } else if (ArgVT.isVector() && RegisterVT.isVector() &&
981                  ArgVT.getScalarType() == RegisterVT.getScalarType()) {
982         assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
983         // We have a vector value which has been split into a vector with
984         // the same scalar type, but fewer elements.  This should handle
985         // all the floating-point vector types.
986         MemVT = RegisterVT;
987       } else if (ArgVT.isVector() &&
988                  ArgVT.getVectorNumElements() == NumRegs) {
989         // This arg has been split so that each element is stored in a separate
990         // register.
991         MemVT = ArgVT.getScalarType();
992       } else if (ArgVT.isExtended()) {
993         // We have an extended type, like i65.
994         MemVT = RegisterVT;
995       } else {
996         unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
997         assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
998         if (RegisterVT.isInteger()) {
999           MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
1000         } else if (RegisterVT.isVector()) {
1001           assert(!RegisterVT.getScalarType().isFloatingPoint());
1002           unsigned NumElements = RegisterVT.getVectorNumElements();
1003           assert(MemoryBits % NumElements == 0);
          // This vector type has been split into another vector type with
          // a different element size.
1006           EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
1007                                            MemoryBits / NumElements);
1008           MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
1009         } else {
1010           llvm_unreachable("cannot deduce memory type.");
1011         }
1012       }
1013 
1014       // Convert one element vectors to scalar.
1015       if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
1016         MemVT = MemVT.getScalarType();
1017 
1018       // Round up vec3/vec5 argument.
1019       if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
1020         assert(MemVT.getVectorNumElements() == 3 ||
1021                MemVT.getVectorNumElements() == 5);
1022         MemVT = MemVT.getPow2VectorType(State.getContext());
1023       } else if (!MemVT.isSimple() && !MemVT.isVector()) {
1024         MemVT = MemVT.getRoundIntegerType(State.getContext());
1025       }
1026 
1027       unsigned PartOffset = 0;
1028       for (unsigned i = 0; i != NumRegs; ++i) {
1029         State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
1030                                                BasePartOffset + PartOffset,
1031                                                MemVT.getSimpleVT(),
1032                                                CCValAssign::Full));
1033         PartOffset += MemVT.getStoreSize();
1034       }
1035     }
1036   }
1037 }
1038 
1039 SDValue AMDGPUTargetLowering::LowerReturn(
1040   SDValue Chain, CallingConv::ID CallConv,
1041   bool isVarArg,
1042   const SmallVectorImpl<ISD::OutputArg> &Outs,
1043   const SmallVectorImpl<SDValue> &OutVals,
1044   const SDLoc &DL, SelectionDAG &DAG) const {
1045   // FIXME: Fails for r600 tests
1046   //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
1047   // "wave terminate should not have return values");
1048   return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
1049 }
1050 
1051 //===---------------------------------------------------------------------===//
1052 // Target specific lowering
1053 //===---------------------------------------------------------------------===//
1054 
1055 /// Selects the correct CCAssignFn for a given CallingConvention value.
1056 CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
1057                                                     bool IsVarArg) {
1058   return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
1059 }
1060 
1061 CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
1062                                                       bool IsVarArg) {
1063   return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
1064 }
1065 
1066 SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain,
1067                                                   SelectionDAG &DAG,
1068                                                   MachineFrameInfo &MFI,
1069                                                   int ClobberedFI) const {
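  // Collect the chains of any existing loads that read from the stack slot we
  // are about to overwrite, so the new store can be made to depend on them.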
1070   SmallVector<SDValue, 8> ArgChains;
1071   int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
1072   int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
1073 
1074   // Include the original chain at the beginning of the list. When this is
1075   // used by target LowerCall hooks, this helps legalize find the
1076   // CALLSEQ_BEGIN node.
1077   ArgChains.push_back(Chain);
1078 
  // Add a chain value for each load of the stack-argument area that overlaps
  // the slot being clobbered.
1080   for (SDNode *U : DAG.getEntryNode().getNode()->uses()) {
1081     if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) {
1082       if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
1083         if (FI->getIndex() < 0) {
1084           int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
1085           int64_t InLastByte = InFirstByte;
1086           InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
1087 
1088           if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
1089               (FirstByte <= InFirstByte && InFirstByte <= LastByte))
1090             ArgChains.push_back(SDValue(L, 1));
1091         }
1092       }
1093     }
1094   }
1095 
1096   // Build a tokenfactor for all the chains.
1097   return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
1098 }
1099 
1100 SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
1101                                                  SmallVectorImpl<SDValue> &InVals,
1102                                                  StringRef Reason) const {
1103   SDValue Callee = CLI.Callee;
1104   SelectionDAG &DAG = CLI.DAG;
1105 
1106   const Function &Fn = DAG.getMachineFunction().getFunction();
1107 
1108   StringRef FuncName("<unknown>");
1109 
1110   if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
1111     FuncName = G->getSymbol();
1112   else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
1113     FuncName = G->getGlobal()->getName();
1114 
1115   DiagnosticInfoUnsupported NoCalls(
1116     Fn, Reason + FuncName, CLI.DL.getDebugLoc());
1117   DAG.getContext()->diagnose(NoCalls);
1118 
1119   if (!CLI.IsTailCall) {
1120     for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
1121       InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
1122   }
1123 
1124   return DAG.getEntryNode();
1125 }
1126 
1127 SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
1128                                         SmallVectorImpl<SDValue> &InVals) const {
1129   return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
1130 }
1131 
1132 SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
1133                                                       SelectionDAG &DAG) const {
1134   const Function &Fn = DAG.getMachineFunction().getFunction();
1135 
1136   DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
1137                                             SDLoc(Op).getDebugLoc());
1138   DAG.getContext()->diagnose(NoDynamicAlloca);
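  // Return a zero value plus the incoming chain so compilation can continue
  // after the diagnostic has been emitted.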
1139   auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
1140   return DAG.getMergeValues(Ops, SDLoc());
1141 }
1142 
1143 SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
1144                                              SelectionDAG &DAG) const {
1145   switch (Op.getOpcode()) {
1146   default:
1147     Op->print(errs(), &DAG);
1148     llvm_unreachable("Custom lowering code for this "
1149                      "instruction is not implemented yet!");
1150     break;
1151   case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
1152   case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
1153   case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
1154   case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
1155   case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
1156   case ISD::FREM: return LowerFREM(Op, DAG);
1157   case ISD::FCEIL: return LowerFCEIL(Op, DAG);
1158   case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
1159   case ISD::FRINT: return LowerFRINT(Op, DAG);
1160   case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
1161   case ISD::FROUND: return LowerFROUND(Op, DAG);
1162   case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
1163   case ISD::FLOG:
1164     return LowerFLOG(Op, DAG, numbers::ln2f);
1165   case ISD::FLOG10:
1166     return LowerFLOG(Op, DAG, numbers::ln2f / numbers::ln10f);
1167   case ISD::FEXP:
1168     return lowerFEXP(Op, DAG);
1169   case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
1170   case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
1171   case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
1172   case ISD::FP_TO_SINT:
1173   case ISD::FP_TO_UINT:
1174     return LowerFP_TO_INT(Op, DAG);
1175   case ISD::CTTZ:
1176   case ISD::CTTZ_ZERO_UNDEF:
1177   case ISD::CTLZ:
1178   case ISD::CTLZ_ZERO_UNDEF:
1179     return LowerCTLZ_CTTZ(Op, DAG);
1180   case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
1181   }
1182   return Op;
1183 }
1184 
1185 void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
1186                                               SmallVectorImpl<SDValue> &Results,
1187                                               SelectionDAG &DAG) const {
1188   switch (N->getOpcode()) {
1189   case ISD::SIGN_EXTEND_INREG:
1190     // Different parts of legalization seem to interpret which type of
1191     // sign_extend_inreg is the one to check for custom lowering. The extended
1192     // from type is what really matters, but some places check for custom
1193     // lowering of the result type. This results in trying to use
1194     // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
1195     // nothing here and let the illegal result integer be handled normally.
1196     return;
1197   default:
1198     return;
1199   }
1200 }
1201 
1202 SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
1203                                                  SDValue Op,
1204                                                  SelectionDAG &DAG) const {
1205 
1206   const DataLayout &DL = DAG.getDataLayout();
1207   GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
1208   const GlobalValue *GV = G->getGlobal();
1209 
1210   if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1211       G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
1212     if (!MFI->isModuleEntryFunction() &&
1213         !GV->getName().equals("llvm.amdgcn.module.lds")) {
1214       SDLoc DL(Op);
1215       const Function &Fn = DAG.getMachineFunction().getFunction();
1216       DiagnosticInfoUnsupported BadLDSDecl(
1217         Fn, "local memory global used by non-kernel function",
1218         DL.getDebugLoc(), DS_Warning);
1219       DAG.getContext()->diagnose(BadLDSDecl);
1220 
1221       // We currently don't have a way to correctly allocate LDS objects that
1222       // aren't directly associated with a kernel. We do force inlining of
1223       // functions that use local objects. However, if these dead functions are
1224       // not eliminated, we don't want a compile time error. Just emit a warning
1225       // and a trap, since there should be no callable path here.
1226       SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
1227       SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
1228                                         Trap, DAG.getRoot());
1229       DAG.setRoot(OutputChain);
1230       return DAG.getUNDEF(Op.getValueType());
1231     }
1232 
1233     // XXX: What does the value of G->getOffset() mean?
    assert(G->getOffset() == 0 &&
           "Do not know what to do with a non-zero offset");
1236 
1237     // TODO: We could emit code to handle the initialization somewhere.
1238     // We ignore the initializer for now and legalize it to allow selection.
1239     // The initializer will anyway get errored out during assembly emission.
1240     unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
1241     return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
1242   }
1243   return SDValue();
1244 }
1245 
1246 SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
1247                                                   SelectionDAG &DAG) const {
1248   SmallVector<SDValue, 8> Args;
1249 
1250   EVT VT = Op.getValueType();
1251   if (VT == MVT::v4i16 || VT == MVT::v4f16) {
1252     SDLoc SL(Op);
1253     SDValue Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(0));
1254     SDValue Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(1));
1255 
1256     SDValue BV = DAG.getBuildVector(MVT::v2i32, SL, { Lo, Hi });
1257     return DAG.getNode(ISD::BITCAST, SL, VT, BV);
1258   }
1259 
1260   for (const SDUse &U : Op->ops())
1261     DAG.ExtractVectorElements(U.get(), Args);
1262 
1263   return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
1264 }
1265 
1266 SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
1267                                                      SelectionDAG &DAG) const {
1268 
1269   SmallVector<SDValue, 8> Args;
1270   unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
1271   EVT VT = Op.getValueType();
1272   EVT SrcVT = Op.getOperand(0).getValueType();
1273 
  // For these types, we have some TableGen patterns, except when the index is 1.
1275   if (((SrcVT == MVT::v4f16 && VT == MVT::v2f16) ||
1276        (SrcVT == MVT::v4i16 && VT == MVT::v2i16)) &&
1277       Start != 1)
1278     return Op;
1279 
1280   if (((SrcVT == MVT::v8f16 && VT == MVT::v4f16) ||
1281        (SrcVT == MVT::v8i16 && VT == MVT::v4i16)) &&
1282       (Start == 0 || Start == 4))
1283     return Op;
1284 
1285   DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
1286                             VT.getVectorNumElements());
1287 
1288   return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
1289 }
1290 
1291 /// Generate Min/Max node
1292 SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
1293                                                    SDValue LHS, SDValue RHS,
1294                                                    SDValue True, SDValue False,
1295                                                    SDValue CC,
1296                                                    DAGCombinerInfo &DCI) const {
1297   if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
1298     return SDValue();
1299 
1300   SelectionDAG &DAG = DCI.DAG;
1301   ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1302   switch (CCOpcode) {
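  // Equality and ordered/unordered tests have no legacy min/max equivalent, so
  // do not combine for these condition codes.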
1303   case ISD::SETOEQ:
1304   case ISD::SETONE:
1305   case ISD::SETUNE:
1306   case ISD::SETNE:
1307   case ISD::SETUEQ:
1308   case ISD::SETEQ:
1309   case ISD::SETFALSE:
1310   case ISD::SETFALSE2:
1311   case ISD::SETTRUE:
1312   case ISD::SETTRUE2:
1313   case ISD::SETUO:
1314   case ISD::SETO:
1315     break;
1316   case ISD::SETULE:
1317   case ISD::SETULT: {
1318     if (LHS == True)
1319       return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1320     return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1321   }
1322   case ISD::SETOLE:
1323   case ISD::SETOLT:
1324   case ISD::SETLE:
1325   case ISD::SETLT: {
1326     // Ordered. Assume ordered for undefined.
1327 
1328     // Only do this after legalization to avoid interfering with other combines
1329     // which might occur.
1330     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1331         !DCI.isCalledByLegalizer())
1332       return SDValue();
1333 
1334     // We need to permute the operands to get the correct NaN behavior. The
1335     // selected operand is the second one based on the failing compare with NaN,
1336     // so permute it based on the compare type the hardware uses.
1337     if (LHS == True)
1338       return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1339     return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1340   }
1341   case ISD::SETUGE:
1342   case ISD::SETUGT: {
1343     if (LHS == True)
1344       return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1345     return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1346   }
1347   case ISD::SETGT:
1348   case ISD::SETGE:
1349   case ISD::SETOGE:
1350   case ISD::SETOGT: {
1351     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1352         !DCI.isCalledByLegalizer())
1353       return SDValue();
1354 
1355     if (LHS == True)
1356       return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1357     return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1358   }
1359   case ISD::SETCC_INVALID:
1360     llvm_unreachable("Invalid setcc condcode!");
1361   }
1362   return SDValue();
1363 }
1364 
1365 std::pair<SDValue, SDValue>
1366 AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
1367   SDLoc SL(Op);
1368 
1369   SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1370 
1371   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1372   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1373 
1374   SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1375   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1376 
1377   return std::make_pair(Lo, Hi);
1378 }
1379 
1380 SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
1381   SDLoc SL(Op);
1382 
1383   SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1384   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1385   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1386 }
1387 
1388 SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
1389   SDLoc SL(Op);
1390 
1391   SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1392   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1393   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1394 }
1395 
1396 // Split a vector type into two parts. The first part is a power of two vector.
1397 // The second part is whatever is left over, and is a scalar if it would
1398 // otherwise be a 1-vector.
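// For example, v3i32 splits into (v2i32, i32) and v7i32 into (v4i32, v3i32).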
1399 std::pair<EVT, EVT>
1400 AMDGPUTargetLowering::getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const {
1401   EVT LoVT, HiVT;
1402   EVT EltVT = VT.getVectorElementType();
1403   unsigned NumElts = VT.getVectorNumElements();
1404   unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
1405   LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
1406   HiVT = NumElts - LoNumElts == 1
1407              ? EltVT
1408              : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
1409   return std::make_pair(LoVT, HiVT);
1410 }
1411 
1412 // Split a vector value into two parts of types LoVT and HiVT. HiVT could be
1413 // scalar.
1414 std::pair<SDValue, SDValue>
1415 AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL,
1416                                   const EVT &LoVT, const EVT &HiVT,
1417                                   SelectionDAG &DAG) const {
1418   assert(LoVT.getVectorNumElements() +
1419                  (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
1420              N.getValueType().getVectorNumElements() &&
1421          "More vector elements requested than available!");
1422   SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
1423                            DAG.getVectorIdxConstant(0, DL));
1424   SDValue Hi = DAG.getNode(
1425       HiVT.isVector() ? ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT, DL,
1426       HiVT, N, DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), DL));
1427   return std::make_pair(Lo, Hi);
1428 }
1429 
1430 SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
1431                                               SelectionDAG &DAG) const {
1432   LoadSDNode *Load = cast<LoadSDNode>(Op);
1433   EVT VT = Op.getValueType();
1434   SDLoc SL(Op);
1435 
1436 
  // If this is a 2-element vector, we really want to scalarize and not create
  // weird 1-element vectors.
1439   if (VT.getVectorNumElements() == 2) {
1440     SDValue Ops[2];
1441     std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
1442     return DAG.getMergeValues(Ops, SL);
1443   }
1444 
1445   SDValue BasePtr = Load->getBasePtr();
1446   EVT MemVT = Load->getMemoryVT();
1447 
1448   const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1449 
1450   EVT LoVT, HiVT;
1451   EVT LoMemVT, HiMemVT;
1452   SDValue Lo, Hi;
1453 
1454   std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1455   std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1456   std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);
1457 
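  // The high half is located LoMemVT.getStoreSize() bytes past the base
  // pointer, so its known alignment is limited to MinAlign of the base
  // alignment and that offset.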
1458   unsigned Size = LoMemVT.getStoreSize();
1459   unsigned BaseAlign = Load->getAlignment();
1460   unsigned HiAlign = MinAlign(BaseAlign, Size);
1461 
1462   SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
1463                                   Load->getChain(), BasePtr, SrcValue, LoMemVT,
1464                                   BaseAlign, Load->getMemOperand()->getFlags());
1465   SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::Fixed(Size));
1466   SDValue HiLoad =
1467       DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
1468                      HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
1469                      HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
1470 
1471   SDValue Join;
1472   if (LoVT == HiVT) {
    // The vector element count is a power of two, so it was split evenly.
1474     Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
1475   } else {
1476     Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getUNDEF(VT), LoLoad,
1477                        DAG.getVectorIdxConstant(0, SL));
1478     Join = DAG.getNode(
1479         HiVT.isVector() ? ISD::INSERT_SUBVECTOR : ISD::INSERT_VECTOR_ELT, SL,
1480         VT, Join, HiLoad,
1481         DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), SL));
1482   }
1483 
1484   SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
1485                                      LoLoad.getValue(1), HiLoad.getValue(1))};
1486 
1487   return DAG.getMergeValues(Ops, SL);
1488 }
1489 
1490 SDValue AMDGPUTargetLowering::WidenOrSplitVectorLoad(SDValue Op,
1491                                                      SelectionDAG &DAG) const {
1492   LoadSDNode *Load = cast<LoadSDNode>(Op);
1493   EVT VT = Op.getValueType();
1494   SDValue BasePtr = Load->getBasePtr();
1495   EVT MemVT = Load->getMemoryVT();
1496   SDLoc SL(Op);
1497   const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1498   unsigned BaseAlign = Load->getAlignment();
1499   unsigned NumElements = MemVT.getVectorNumElements();
1500 
1501   // Widen from vec3 to vec4 when the load is at least 8-byte aligned
1502   // or 16-byte fully dereferenceable. Otherwise, split the vector load.
1503   if (NumElements != 3 ||
1504       (BaseAlign < 8 &&
1505        !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout())))
1506     return SplitVectorLoad(Op, DAG);
1507 
1508   assert(NumElements == 3);
1509 
1510   EVT WideVT =
1511       EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
1512   EVT WideMemVT =
1513       EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(), 4);
1514   SDValue WideLoad = DAG.getExtLoad(
1515       Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
1516       WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
1517   return DAG.getMergeValues(
1518       {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
1519                    DAG.getVectorIdxConstant(0, SL)),
1520        WideLoad.getValue(1)},
1521       SL);
1522 }
1523 
1524 SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
1525                                                SelectionDAG &DAG) const {
1526   StoreSDNode *Store = cast<StoreSDNode>(Op);
1527   SDValue Val = Store->getValue();
1528   EVT VT = Val.getValueType();
1529 
  // If this is a 2-element vector, we really want to scalarize and not create
  // weird 1-element vectors.
1532   if (VT.getVectorNumElements() == 2)
1533     return scalarizeVectorStore(Store, DAG);
1534 
1535   EVT MemVT = Store->getMemoryVT();
1536   SDValue Chain = Store->getChain();
1537   SDValue BasePtr = Store->getBasePtr();
1538   SDLoc SL(Op);
1539 
1540   EVT LoVT, HiVT;
1541   EVT LoMemVT, HiMemVT;
1542   SDValue Lo, Hi;
1543 
1544   std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1545   std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1546   std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);
1547 
1548   SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
1549 
1550   const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1551   unsigned BaseAlign = Store->getAlignment();
1552   unsigned Size = LoMemVT.getStoreSize();
1553   unsigned HiAlign = MinAlign(BaseAlign, Size);
1554 
1555   SDValue LoStore =
1556       DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
1557                         Store->getMemOperand()->getFlags());
1558   SDValue HiStore =
1559       DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
1560                         HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
1561 
1562   return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
1563 }
1564 
// This is a shortcut for integer division because we have fast i32<->f32
// conversions, and fast f32 reciprocal instructions. The 24-bit significand
// of an f32 is enough to exactly represent a 24-bit signed integer.
1568 SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
1569                                             bool Sign) const {
1570   SDLoc DL(Op);
1571   EVT VT = Op.getValueType();
1572   SDValue LHS = Op.getOperand(0);
1573   SDValue RHS = Op.getOperand(1);
1574   MVT IntVT = MVT::i32;
1575   MVT FltVT = MVT::f32;
1576 
1577   unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
1578   if (LHSSignBits < 9)
1579     return SDValue();
1580 
1581   unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
1582   if (RHSSignBits < 9)
1583     return SDValue();
1584 
1585   unsigned BitSize = VT.getSizeInBits();
1586   unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1587   unsigned DivBits = BitSize - SignBits;
1588   if (Sign)
1589     ++DivBits;
1590 
1591   ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
1592   ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
1593 
1594   SDValue jq = DAG.getConstant(1, DL, IntVT);
1595 
1596   if (Sign) {
1597     // char|short jq = ia ^ ib;
1598     jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
1599 
1600     // jq = jq >> (bitsize - 2)
1601     jq = DAG.getNode(ISD::SRA, DL, VT, jq,
1602                      DAG.getConstant(BitSize - 2, DL, VT));
1603 
1604     // jq = jq | 0x1
1605     jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
1606   }
1607 
1608   // int ia = (int)LHS;
1609   SDValue ia = LHS;
1610 
  // int ib = (int)RHS;
1612   SDValue ib = RHS;
1613 
1614   // float fa = (float)ia;
1615   SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
1616 
1617   // float fb = (float)ib;
1618   SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
1619 
1620   SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
1621                            fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
1622 
1623   // fq = trunc(fq);
1624   fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
1625 
1626   // float fqneg = -fq;
1627   SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
1628 
1629   MachineFunction &MF = DAG.getMachineFunction();
1630   const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
1631 
1632   // float fr = mad(fqneg, fb, fa);
1633   unsigned OpCode = !Subtarget->hasMadMacF32Insts() ?
1634                     (unsigned)ISD::FMA :
1635                     !MFI->getMode().allFP32Denormals() ?
1636                     (unsigned)ISD::FMAD :
1637                     (unsigned)AMDGPUISD::FMAD_FTZ;
1638   SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
1639 
1640   // int iq = (int)fq;
1641   SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
1642 
1643   // fr = fabs(fr);
1644   fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
1645 
1646   // fb = fabs(fb);
1647   fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
1648 
1649   EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
1650 
1651   // int cv = fr >= fb;
1652   SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
1653 
1654   // jq = (cv ? jq : 0);
1655   jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
1656 
1657   // dst = iq + jq;
1658   SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
1659 
  // Rem needs compensation; it's easier to recompute it.
1661   SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
1662   Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
1663 
  // Truncate the results to the number of bits this divide really produces.
1665   if (Sign) {
1666     SDValue InRegSize
1667       = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
1668     Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
1669     Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
1670   } else {
1671     SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
1672     Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
1673     Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
1674   }
1675 
1676   return DAG.getMergeValues({ Div, Rem }, DL);
1677 }
1678 
1679 void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
1680                                       SelectionDAG &DAG,
1681                                       SmallVectorImpl<SDValue> &Results) const {
1682   SDLoc DL(Op);
1683   EVT VT = Op.getValueType();
1684 
1685   assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
1686 
1687   EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
1688 
1689   SDValue One = DAG.getConstant(1, DL, HalfVT);
1690   SDValue Zero = DAG.getConstant(0, DL, HalfVT);
1691 
  // Hi/Lo split.
1693   SDValue LHS = Op.getOperand(0);
1694   SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
1695   SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, One);
1696 
1697   SDValue RHS = Op.getOperand(1);
1698   SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
1699   SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, One);
1700 
1701   if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
1702       DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
1703 
1704     SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
1705                               LHS_Lo, RHS_Lo);
1706 
1707     SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
1708     SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
1709 
1710     Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
1711     Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
1712     return;
1713   }
1714 
1715   if (isTypeLegal(MVT::i64)) {
1716     // The algorithm here is based on ideas from "Software Integer Division",
1717     // Tom Rodeheffer, August 2008.
1718 
1719     MachineFunction &MF = DAG.getMachineFunction();
1720     const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1721 
1722     // Compute denominator reciprocal.
1723     unsigned FMAD = !Subtarget->hasMadMacF32Insts() ?
1724                     (unsigned)ISD::FMA :
1725                     !MFI->getMode().allFP32Denormals() ?
1726                     (unsigned)ISD::FMAD :
1727                     (unsigned)AMDGPUISD::FMAD_FTZ;
1728 
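    // The f32 constants below are bit patterns for 2^32 (0x4f800000), just
    // under 2^64 (0x5f7ffffc), 2^-32 (0x2f800000) and -2^32 (0xcf800000).
    // Together they build Rcp64, a fixed-point approximation of 2^64 / RHS
    // that the UNR rounds below refine.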
1729     SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
1730     SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
1731     SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
1732       DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
1733       Cvt_Lo);
1734     SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
1735     SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
1736       DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
1737     SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
1738       DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
1739     SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
1740     SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
1741       DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
1742       Mul1);
1743     SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
1744     SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
1745     SDValue Rcp64 = DAG.getBitcast(VT,
1746                         DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
1747 
1748     SDValue Zero64 = DAG.getConstant(0, DL, VT);
1749     SDValue One64  = DAG.getConstant(1, DL, VT);
1750     SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
1751     SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
1752 
1753     // First round of UNR (Unsigned integer Newton-Raphson).
1754     SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
1755     SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
1756     SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
1757     SDValue Mulhi1_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1,
1758                                     Zero);
1759     SDValue Mulhi1_Hi =
1760         DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1, One);
1761     SDValue Add1_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Lo,
1762                                   Mulhi1_Lo, Zero1);
1763     SDValue Add1_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Hi,
1764                                   Mulhi1_Hi, Add1_Lo.getValue(1));
1765     SDValue Add1 = DAG.getBitcast(VT,
1766                         DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
1767 
1768     // Second round of UNR.
1769     SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
1770     SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
1771     SDValue Mulhi2_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2,
1772                                     Zero);
1773     SDValue Mulhi2_Hi =
1774         DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2, One);
1775     SDValue Add2_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_Lo,
1776                                   Mulhi2_Lo, Zero1);
1777     SDValue Add2_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_Hi,
1778                                   Mulhi2_Hi, Add2_Lo.getValue(1));
1779     SDValue Add2 = DAG.getBitcast(VT,
1780                         DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
1781 
1782     SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
1783 
1784     SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
1785 
1786     SDValue Mul3_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, Zero);
1787     SDValue Mul3_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, One);
1788     SDValue Sub1_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Lo,
1789                                   Mul3_Lo, Zero1);
1790     SDValue Sub1_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Hi,
1791                                   Mul3_Hi, Sub1_Lo.getValue(1));
1792     SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
1793     SDValue Sub1 = DAG.getBitcast(VT,
1794                         DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
1795 
1796     SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
1797     SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
1798                                  ISD::SETUGE);
1799     SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
1800                                  ISD::SETUGE);
1801     SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);
1802 
    // TODO: Here and below, portions of the code could be enclosed in
    // if/endif. Currently control flow is unconditional and we have 4 selects
    // after the potential endif to substitute PHIs.
1806 
1807     // if C3 != 0 ...
1808     SDValue Sub2_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Lo,
1809                                   RHS_Lo, Zero1);
1810     SDValue Sub2_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Mi,
1811                                   RHS_Hi, Sub1_Lo.getValue(1));
1812     SDValue Sub2_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
1813                                   Zero, Sub2_Lo.getValue(1));
1814     SDValue Sub2 = DAG.getBitcast(VT,
1815                         DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
1816 
1817     SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);
1818 
1819     SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
1820                                  ISD::SETUGE);
1821     SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
1822                                  ISD::SETUGE);
1823     SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);
1824 
1825     // if (C6 != 0)
1826     SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
1827 
1828     SDValue Sub3_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Lo,
1829                                   RHS_Lo, Zero1);
1830     SDValue Sub3_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
1831                                   RHS_Hi, Sub2_Lo.getValue(1));
1832     SDValue Sub3_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub3_Mi,
1833                                   Zero, Sub3_Lo.getValue(1));
1834     SDValue Sub3 = DAG.getBitcast(VT,
1835                         DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
1836 
1837     // endif C6
1838     // endif C3
1839 
1840     SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
1841     SDValue Div  = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);
1842 
1843     SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
1844     SDValue Rem  = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);
1845 
1846     Results.push_back(Div);
1847     Results.push_back(Rem);
1848 
1849     return;
1850   }
1851 
  // R600 expansion.
  // Get speculative values.
1854   SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
1855   SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
1856 
1857   SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
1858   SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
1859   REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
1860 
1861   SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
1862   SDValue DIV_Lo = Zero;
1863 
1864   const unsigned halfBitWidth = HalfVT.getSizeInBits();
1865 
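  // Restoring long division over the remaining low half, producing one
  // quotient bit per iteration.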
1866   for (unsigned i = 0; i < halfBitWidth; ++i) {
1867     const unsigned bitPos = halfBitWidth - i - 1;
1868     SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
1869     // Get value of high bit
1870     SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
1871     HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
1872     HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
1873 
1874     // Shift
1875     REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
1876     // Add LHS high bit
1877     REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
1878 
1879     SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
1880     SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);
1881 
1882     DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
1883 
1884     // Update REM
1885     SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
1886     REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
1887   }
1888 
1889   SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
1890   DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
1891   Results.push_back(DIV);
1892   Results.push_back(REM);
1893 }
1894 
1895 SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
1896                                            SelectionDAG &DAG) const {
1897   SDLoc DL(Op);
1898   EVT VT = Op.getValueType();
1899 
1900   if (VT == MVT::i64) {
1901     SmallVector<SDValue, 2> Results;
1902     LowerUDIVREM64(Op, DAG, Results);
1903     return DAG.getMergeValues(Results, DL);
1904   }
1905 
1906   if (VT == MVT::i32) {
1907     if (SDValue Res = LowerDIVREM24(Op, DAG, false))
1908       return Res;
1909   }
1910 
1911   SDValue X = Op.getOperand(0);
1912   SDValue Y = Op.getOperand(1);
1913 
1914   // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
1915   // algorithm used here.
1916 
1917   // Initial estimate of inv(y).
1918   SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y);
1919 
1920   // One round of UNR.
1921   SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y);
1922   SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z);
1923   Z = DAG.getNode(ISD::ADD, DL, VT, Z,
1924                   DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ));
1925 
1926   // Quotient/remainder estimate.
1927   SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z);
1928   SDValue R =
1929       DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y));
1930 
1931   // First quotient/remainder refinement.
1932   EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
1933   SDValue One = DAG.getConstant(1, DL, VT);
1934   SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
1935   Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
1936                   DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
1937   R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
1938                   DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
1939 
1940   // Second quotient/remainder refinement.
1941   Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
1942   Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
1943                   DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
1944   R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
1945                   DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
1946 
1947   return DAG.getMergeValues({Q, R}, DL);
1948 }
1949 
1950 SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
1951                                            SelectionDAG &DAG) const {
1952   SDLoc DL(Op);
1953   EVT VT = Op.getValueType();
1954 
1955   SDValue LHS = Op.getOperand(0);
1956   SDValue RHS = Op.getOperand(1);
1957 
1958   SDValue Zero = DAG.getConstant(0, DL, VT);
1959   SDValue NegOne = DAG.getConstant(-1, DL, VT);
1960 
1961   if (VT == MVT::i32) {
1962     if (SDValue Res = LowerDIVREM24(Op, DAG, true))
1963       return Res;
1964   }
1965 
1966   if (VT == MVT::i64 &&
1967       DAG.ComputeNumSignBits(LHS) > 32 &&
1968       DAG.ComputeNumSignBits(RHS) > 32) {
1969     EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
1970 
    // Hi/Lo split.
1972     SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
1973     SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
1974     SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
1975                                  LHS_Lo, RHS_Lo);
1976     SDValue Res[2] = {
1977       DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
1978       DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
1979     };
1980     return DAG.getMergeValues(Res, DL);
1981   }
1982 
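  // Compute the unsigned division of the absolute values and fix the signs
  // up afterwards. With s = x >> 31 (all ones for negative x), |x| is
  // (x + s) ^ s; the quotient is negated when the operand signs differ and
  // the remainder takes the sign of the dividend.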
1983   SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
1984   SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
1985   SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
1986   SDValue RSign = LHSign; // Remainder sign is the same as LHS
1987 
1988   LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
1989   RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
1990 
1991   LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
1992   RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
1993 
1994   SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
1995   SDValue Rem = Div.getValue(1);
1996 
1997   Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
1998   Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
1999 
2000   Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
2001   Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
2002 
2003   SDValue Res[2] = {
2004     Div,
2005     Rem
2006   };
2007   return DAG.getMergeValues(Res, DL);
2008 }
2009 
2010 // (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x)
2011 SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
2012   SDLoc SL(Op);
2013   EVT VT = Op.getValueType();
2014   auto Flags = Op->getFlags();
2015   SDValue X = Op.getOperand(0);
2016   SDValue Y = Op.getOperand(1);
2017 
2018   SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y, Flags);
2019   SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, VT, Div, Flags);
2020   SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Trunc, Flags);
2021   // TODO: For f32 use FMAD instead if !hasFastFMA32?
2022   return DAG.getNode(ISD::FMA, SL, VT, Neg, Y, X, Flags);
2023 }
2024 
2025 SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
2026   SDLoc SL(Op);
2027   SDValue Src = Op.getOperand(0);
2028 
2029   // result = trunc(src)
2030   // if (src > 0.0 && src != result)
2031   //   result += 1.0
2032 
2033   SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2034 
2035   const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2036   const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2037 
2038   EVT SetCCVT =
2039       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2040 
  SDValue Gt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
  SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
  SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Gt0, NeTrunc);
2044 
2045   SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2046   // TODO: Should this propagate fast-math-flags?
2047   return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2048 }
2049 
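// Extract the unbiased exponent from the high 32 bits of an f64: bits [62:52]
// of the double hold the biased exponent, from which the bias of 1023 is
// subtracted.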
2050 static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
2051                                   SelectionDAG &DAG) {
2052   const unsigned FractBits = 52;
2053   const unsigned ExpBits = 11;
2054 
2055   SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2056                                 Hi,
2057                                 DAG.getConstant(FractBits - 32, SL, MVT::i32),
2058                                 DAG.getConstant(ExpBits, SL, MVT::i32));
2059   SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2060                             DAG.getConstant(1023, SL, MVT::i32));
2061 
2062   return Exp;
2063 }
2064 
2065 SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
2066   SDLoc SL(Op);
2067   SDValue Src = Op.getOperand(0);
2068 
2069   assert(Op.getValueType() == MVT::f64);
2070 
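  // Truncate by clearing the fractional mantissa bits indicated by the
  // exponent: if the exponent is negative, |src| < 1.0 and the result is just
  // the sign bit (+/-0.0); if it is greater than 51, src is already an
  // integer and is returned unchanged.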
2071   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2072 
2073   // Extract the upper half, since this is where we will find the sign and
2074   // exponent.
2075   SDValue Hi = getHiHalf64(Src, DAG);
2076 
2077   SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2078 
2079   const unsigned FractBits = 52;
2080 
2081   // Extract the sign bit.
2082   const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
2083   SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
2084 
2085   // Extend back to 64-bits.
2086   SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
2087   SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
2088 
2089   SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
2090   const SDValue FractMask
2091     = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
2092 
2093   SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
2094   SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
2095   SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
2096 
2097   EVT SetCCVT =
2098       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2099 
2100   const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2101 
2102   SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2103   SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2104 
2105   SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2106   SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2107 
2108   return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2109 }
2110 
2111 SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
2112   SDLoc SL(Op);
2113   SDValue Src = Op.getOperand(0);
2114 
2115   assert(Op.getValueType() == MVT::f64);
2116 
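  // Round to the nearest integer by adding and subtracting 2^52 with the sign
  // of the source, which forces rounding in the current (nearest-even) mode.
  // Sources with |src| > 0x1.fffffffffffffp+51 are already integers and are
  // returned unchanged.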
2117   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2118   SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2119   SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2120 
2121   // TODO: Should this propagate fast-math-flags?
2122 
2123   SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2124   SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
2125 
2126   SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2127 
2128   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2129   SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2130 
2131   EVT SetCCVT =
2132       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2133   SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
2134 
2135   return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2136 }
2137 
2138 SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const {
2139   // FNEARBYINT and FRINT are the same, except in their handling of FP
2140   // exceptions. Those aren't really meaningful for us, and OpenCL only has
2141   // rint, so just treat them as equivalent.
2142   return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0));
2143 }
2144 
2145 // XXX - May require not supporting f32 denormals?
2146 
2147 // Don't handle v2f16. The extra instructions to scalarize and repack around the
2148 // compare and vselect end up producing worse code than scalarizing the whole
2149 // operation.
2150 SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2151   SDLoc SL(Op);
2152   SDValue X = Op.getOperand(0);
2153   EVT VT = Op.getValueType();
2154 
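  // round(x) = trunc(x) + ((|x - trunc(x)| >= 0.5) ? copysign(1.0, x) : 0.0)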
2155   SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
2156 
2157   // TODO: Should this propagate fast-math-flags?
2158 
2159   SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
2160 
2161   SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
2162 
2163   const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2164   const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2165   const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2166 
2167   SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, VT, One, X);
2168 
2169   EVT SetCCVT =
2170       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2171 
2172   SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2173 
2174   SDValue Sel = DAG.getNode(ISD::SELECT, SL, VT, Cmp, SignOne, Zero);
2175 
2176   return DAG.getNode(ISD::FADD, SL, VT, T, Sel);
2177 }
2178 
2179 SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
2180   SDLoc SL(Op);
2181   SDValue Src = Op.getOperand(0);
2182 
2183   // result = trunc(src);
2184   // if (src < 0.0 && src != result)
2185   //   result += -1.0.
2186 
2187   SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2188 
2189   const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2190   const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2191 
2192   EVT SetCCVT =
2193       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2194 
2195   SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2196   SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2197   SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2198 
2199   SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2200   // TODO: Should this propagate fast-math-flags?
2201   return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2202 }
2203 
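// Lower a log of arbitrary base as log_b(x) = log2(x) * (1 / log2(b)); the
// caller passes 1 / log2(b) as Log2BaseInverted.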
2204 SDValue AMDGPUTargetLowering::LowerFLOG(SDValue Op, SelectionDAG &DAG,
2205                                         double Log2BaseInverted) const {
2206   EVT VT = Op.getValueType();
2207 
2208   SDLoc SL(Op);
2209   SDValue Operand = Op.getOperand(0);
2210   SDValue Log2Operand = DAG.getNode(ISD::FLOG2, SL, VT, Operand);
2211   SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2212 
2213   return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand);
2214 }
2215 
2216 // exp2(M_LOG2E_F * f);
2217 SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
2218   EVT VT = Op.getValueType();
2219   SDLoc SL(Op);
2220   SDValue Src = Op.getOperand(0);
2221 
2222   const SDValue K = DAG.getConstantFP(numbers::log2e, SL, VT);
2223   SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Src, K, Op->getFlags());
2224   return DAG.getNode(ISD::FEXP2, SL, VT, Mul, Op->getFlags());
2225 }
2226 
2227 static bool isCtlzOpc(unsigned Opc) {
2228   return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
2229 }
2230 
2231 static bool isCttzOpc(unsigned Opc) {
2232   return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
2233 }
2234 
2235 SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
2236   SDLoc SL(Op);
2237   SDValue Src = Op.getOperand(0);
2238 
2239   assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
2240   bool Ctlz = isCtlzOpc(Op.getOpcode());
2241   unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
2242 
2243   bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
2244                    Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
2245 
2246   if (Src.getValueType() == MVT::i32) {
2247     // (ctlz hi:lo) -> (umin (ffbh src), 32)
2248     // (cttz hi:lo) -> (umin (ffbl src), 32)
2249     // (ctlz_zero_undef src) -> (ffbh src)
2250     // (cttz_zero_undef src) -> (ffbl src)
2251     SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
2252     if (!ZeroUndef) {
2253       const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
2254       NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const32);
2255     }
2256     return NewOpr;
2257   }
2258 
2259   SDValue Lo, Hi;
2260   std::tie(Lo, Hi) = split64BitValue(Src, DAG);
2261 
2262   SDValue OprLo = DAG.getNode(NewOpc, SL, MVT::i32, Lo);
2263   SDValue OprHi = DAG.getNode(NewOpc, SL, MVT::i32, Hi);
2264 
2265   // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
2266   // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
2267   // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
2268   // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
2269 
2270   unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
2271   const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
2272   if (Ctlz)
2273     OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32);
2274   else
2275     OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32);
2276 
2277   SDValue NewOpr;
2278   NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, OprLo, OprHi);
2279   if (!ZeroUndef) {
2280     const SDValue Const64 = DAG.getConstant(64, SL, MVT::i32);
2281     NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const64);
2282   }
2283 
2284   return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
2285 }
2286 
2287 SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
2288                                                bool Signed) const {
  // The regular method of converting a 64-bit integer to float roughly
  // consists of 2 steps: normalization and rounding. In fact, after
  // normalization, the conversion from a 64-bit integer to a float is
  // essentially the same as the one from a 32-bit integer. The only difference
  // is that it has more trailing bits to be rounded. To leverage the native
  // 32-bit conversion, a 64-bit integer can be preprocessed to fit into a
  // 32-bit integer and then converted into the correct float number. The basic
  // steps for the unsigned conversion are illustrated in the following pseudo
  // code:
2297   //
2298   // f32 uitofp(i64 u) {
2299   //   i32 hi, lo = split(u);
2300   //   // Only count the leading zeros in hi as we have native support of the
2301   //   // conversion from i32 to f32. If hi is all 0s, the conversion is
2302   //   // reduced to a 32-bit one automatically.
2303   //   i32 shamt = clz(hi); // Return 32 if hi is all 0s.
2304   //   u <<= shamt;
2305   //   hi, lo = split(u);
2306   //   hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
2307   //   // convert it as a 32-bit integer and scale the result back.
2308   //   return uitofp(hi) * 2^(32 - shamt);
2309   // }
2310   //
  // The signed version follows the same principle but uses 'ffbh_i32' to count
  // its sign bits instead. If 'ffbh_i32' is not available, the absolute value
  // is converted instead, followed by negation based on its sign bit.
2314 
2315   SDLoc SL(Op);
2316   SDValue Src = Op.getOperand(0);
2317 
2318   SDValue Lo, Hi;
2319   std::tie(Lo, Hi) = split64BitValue(Src, DAG);
2320   SDValue Sign;
2321   SDValue ShAmt;
2322   if (Signed && Subtarget->isGCN()) {
2323     // We also need to consider the sign bit in Lo if Hi has just sign bits,
2324     // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
2325     // account. That is, the maximal shift is
2326     // - 32 if Lo and Hi have opposite signs;
2327     // - 33 if Lo and Hi have the same sign.
2328     //
2329     // Or, MaxShAmt = 33 + OppositeSign, where
2330     //
2331     // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
2332     // - -1 if Lo and Hi have opposite signs; and
2333     // -  0 otherwise.
2334     //
2335     // All in all, ShAmt is calculated as
2336     //
2337     //  umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
2338     //
2339     // or
2340     //
2341     //  umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
2342     //
2343     // to reduce the critical path.
2344     SDValue OppositeSign = DAG.getNode(
2345         ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
2346         DAG.getConstant(31, SL, MVT::i32));
2347     SDValue MaxShAmt =
2348         DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
2349                     OppositeSign);
2350     // Count the leading sign bits.
2351     ShAmt = DAG.getNode(AMDGPUISD::FFBH_I32, SL, MVT::i32, Hi);
    // Unlike the unsigned conversion, the shift should be one bit less to
    // preserve the sign bit.
2354     ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
2355                         DAG.getConstant(1, SL, MVT::i32));
2356     ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt);
2357   } else {
2358     if (Signed) {
2359       // Without 'ffbh_i32', only leading zeros could be counted. Take the
2360       // absolute value first.
2361       Sign = DAG.getNode(ISD::SRA, SL, MVT::i64, Src,
2362                          DAG.getConstant(63, SL, MVT::i64));
2363       SDValue Abs =
2364           DAG.getNode(ISD::XOR, SL, MVT::i64,
2365                       DAG.getNode(ISD::ADD, SL, MVT::i64, Src, Sign), Sign);
2366       std::tie(Lo, Hi) = split64BitValue(Abs, DAG);
2367     }
2368     // Count the leading zeros.
2369     ShAmt = DAG.getNode(ISD::CTLZ, SL, MVT::i32, Hi);
2370     // The shift amount for signed integers is [0, 32].
2371   }
2372   // Normalize the given 64-bit integer.
2373   SDValue Norm = DAG.getNode(ISD::SHL, SL, MVT::i64, Src, ShAmt);
2374   // Split it again.
2375   std::tie(Lo, Hi) = split64BitValue(Norm, DAG);
2376   // Calculate the adjust bit for rounding.
2377   // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
2378   SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32,
2379                                DAG.getConstant(1, SL, MVT::i32), Lo);
2380   // Get the 32-bit normalized integer.
2381   Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
2382   // Convert the normalized 32-bit integer into f32.
2383   unsigned Opc =
2384       (Signed && Subtarget->isGCN()) ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
2385   SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm);
2386 
2387   // Finally, need to scale back the converted floating number as the original
2388   // 64-bit integer is converted as a 32-bit one.
2389   ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
2390                       ShAmt);
2391   // On GCN, use LDEXP directly.
2392   if (Subtarget->isGCN())
2393     return DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f32, FVal, ShAmt);
2394 
2395   // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent
2396   // part directly to emulate the multiplication of 2^ShAmt. That 8-bit
2397   // exponent is enough to avoid overflowing into the sign bit.
2398   SDValue Exp = DAG.getNode(ISD::SHL, SL, MVT::i32, ShAmt,
2399                             DAG.getConstant(23, SL, MVT::i32));
2400   SDValue IVal =
2401       DAG.getNode(ISD::ADD, SL, MVT::i32,
2402                   DAG.getNode(ISD::BITCAST, SL, MVT::i32, FVal), Exp);
2403   if (Signed) {
2404     // Set the sign bit.
2405     Sign = DAG.getNode(ISD::SHL, SL, MVT::i32,
2406                        DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Sign),
2407                        DAG.getConstant(31, SL, MVT::i32));
2408     IVal = DAG.getNode(ISD::OR, SL, MVT::i32, IVal, Sign);
2409   }
2410   return DAG.getNode(ISD::BITCAST, SL, MVT::f32, IVal);
2411 }
2412 
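// Convert a 64-bit integer to f64 by converting the two 32-bit halves
// separately: the high half is converted, scaled by 2^32 with ldexp, and
// added to the unsigned conversion of the low half.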
2413 SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
2414                                                bool Signed) const {
2415   SDLoc SL(Op);
2416   SDValue Src = Op.getOperand(0);
2417 
2418   SDValue Lo, Hi;
2419   std::tie(Lo, Hi) = split64BitValue(Src, DAG);
2420 
2421   SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
2422                               SL, MVT::f64, Hi);
2423 
2424   SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
2425 
2426   SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi,
2427                               DAG.getConstant(32, SL, MVT::i32));
2428   // TODO: Should this propagate fast-math-flags?
2429   return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
2430 }
2431 
2432 SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
2433                                                SelectionDAG &DAG) const {
2434   // TODO: Factor out code common with LowerSINT_TO_FP.
2435   EVT DestVT = Op.getValueType();
2436   SDValue Src = Op.getOperand(0);
2437   EVT SrcVT = Src.getValueType();
2438 
2439   if (SrcVT == MVT::i16) {
2440     if (DestVT == MVT::f16)
2441       return Op;
2442     SDLoc DL(Op);
2443 
2444     // Promote src to i32
2445     SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
2446     return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
2447   }
2448 
2449   assert(SrcVT == MVT::i64 && "operation should be legal");
2450 
2451   if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
2452     SDLoc DL(Op);
2453 
2454     SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
2455     SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
2456     SDValue FPRound =
2457         DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
2458 
2459     return FPRound;
2460   }
2461 
2462   if (DestVT == MVT::f32)
2463     return LowerINT_TO_FP32(Op, DAG, false);
2464 
2465   assert(DestVT == MVT::f64);
2466   return LowerINT_TO_FP64(Op, DAG, false);
2467 }
2468 
2469 SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
2470                                               SelectionDAG &DAG) const {
2471   EVT DestVT = Op.getValueType();
2472 
2473   SDValue Src = Op.getOperand(0);
2474   EVT SrcVT = Src.getValueType();
2475 
2476   if (SrcVT == MVT::i16) {
2477     if (DestVT == MVT::f16)
2478       return Op;
2479 
2480     SDLoc DL(Op);
2481     // Promote src to i32
2482     SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
2483     return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
2484   }
2485 
2486   assert(SrcVT == MVT::i64 && "operation should be legal");
2487 
2488   // TODO: Factor out code common with LowerUINT_TO_FP.
2489 
2490   if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
2491     SDLoc DL(Op);
2492     SDValue Src = Op.getOperand(0);
2493 
2494     SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
2495     SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
2496     SDValue FPRound =
2497         DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
2498 
2499     return FPRound;
2500   }
2501 
2502   if (DestVT == MVT::f32)
2503     return LowerINT_TO_FP32(Op, DAG, true);
2504 
2505   assert(DestVT == MVT::f64);
2506   return LowerINT_TO_FP64(Op, DAG, true);
2507 }
2508 
2509 SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
2510                                                bool Signed) const {
2511   SDLoc SL(Op);
2512 
2513   SDValue Src = Op.getOperand(0);
2514   EVT SrcVT = Src.getValueType();
2515 
2516   assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
2517 
2518   // The basic idea of converting a floating point number into a pair of 32-bit
2519   // integers is illustrated as follows:
2520   //
2521   //     tf := trunc(val);
2522   //    hif := floor(tf * 2^-32);
2523   //    lof := tf - hif * 2^32; // lof is always positive due to floor.
2524   //     hi := fptoi(hif);
2525   //     lo := fptoi(lof);
2526   //
2527   SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src);
2528   SDValue Sign;
2529   if (Signed && SrcVT == MVT::f32) {
    // However, a 32-bit floating point number has only a 23-bit mantissa,
    // which is not enough to hold all the significant bits of `lof` if val is
    // negative. To avoid the loss of precision, we need to take the absolute
    // value after truncating and flip the result back based on the original
    // signedness.
2535     Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
2536                        DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
2537                        DAG.getConstant(31, SL, MVT::i32));
2538     Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc);
2539   }
2540 
2541   SDValue K0, K1;
2542   if (SrcVT == MVT::f64) {
2543     K0 = DAG.getConstantFP(BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)),
2544                            SL, SrcVT);
2545     K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)),
2546                            SL, SrcVT);
2547   } else {
2548     K0 = DAG.getConstantFP(BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)), SL,
2549                            SrcVT);
2550     K1 = DAG.getConstantFP(BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)), SL,
2551                            SrcVT);
2552   }
2553   // TODO: Should this propagate fast-math-flags?
2554   SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0);
2555 
2556   SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul);
2557 
2558   SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc);
2559 
2560   SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
2561                                                          : ISD::FP_TO_UINT,
2562                            SL, MVT::i32, FloorMul);
2563   SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
2564 
2565   SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
2566                                DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));
2567 
2568   if (Signed && SrcVT == MVT::f32) {
2569     assert(Sign);
2570     // Flip the result based on the signedness, which is either all 0s or 1s.
2571     Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
2572                        DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign}));
2573     // r := xor(r, sign) - sign;
2574     Result =
2575         DAG.getNode(ISD::SUB, SL, MVT::i64,
2576                     DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign);
2577   }
2578 
2579   return Result;
2580 }
2581 
2582 SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
2583   SDLoc DL(Op);
2584   SDValue N0 = Op.getOperand(0);
2585 
2586   // Convert to target node to get known bits
2587   if (N0.getValueType() == MVT::f32)
2588     return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
2589 
2590   if (getTargetMachine().Options.UnsafeFPMath) {
2591     // There is a generic expand for FP_TO_FP16 with unsafe fast math.
2592     return SDValue();
2593   }
2594 
2595   assert(N0.getSimpleValueType() == MVT::f64);
2596 
2597   // f64 -> f16 conversion using round-to-nearest-even rounding mode.
2598   const unsigned ExpMask = 0x7ff;
2599   const unsigned ExpBiasf64 = 1023;
2600   const unsigned ExpBiasf16 = 15;
2601   SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
2602   SDValue One = DAG.getConstant(1, DL, MVT::i32);
2603   SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
2604   SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
2605                            DAG.getConstant(32, DL, MVT::i64));
2606   UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
2607   U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
2608   SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2609                           DAG.getConstant(20, DL, MVT::i64));
2610   E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
2611                   DAG.getConstant(ExpMask, DL, MVT::i32));
2612   // Subtract the fp64 exponent bias (1023) to get the real exponent and
2613   // add the f16 bias (15) to get the biased exponent for the f16 format.
2614   E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
2615                   DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
2616 
2617   SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2618                           DAG.getConstant(8, DL, MVT::i32));
2619   M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
2620                   DAG.getConstant(0xffe, DL, MVT::i32));
2621 
2622   SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
2623                                   DAG.getConstant(0x1ff, DL, MVT::i32));
2624   MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
2625 
2626   SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
2627   M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
2628 
2629   // (M != 0 ? 0x0200 : 0) | 0x7c00;
2630   SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
2631       DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
2632                       Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
2633 
2634   // N = M | (E << 12);
2635   SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2636       DAG.getNode(ISD::SHL, DL, MVT::i32, E,
2637                   DAG.getConstant(12, DL, MVT::i32)));
2638 
2639   // B = clamp(1-E, 0, 13);
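  // B is the number of significand bits shifted out when the result is an f16
  // denormal (biased exponent E < 1); it is capped at 13, past which all of
  // the 12-bit significand (plus the implicit bit) is shifted out anyway.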
2640   SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
2641                                   One, E);
2642   SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
2643   B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
2644                   DAG.getConstant(13, DL, MVT::i32));
2645 
2646   SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2647                                    DAG.getConstant(0x1000, DL, MVT::i32));
2648 
2649   SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
2650   SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
2651   SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
2652   D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
2653 
2654   SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
2655   SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
2656                               DAG.getConstant(0x7, DL, MVT::i32));
2657   V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
2658                   DAG.getConstant(2, DL, MVT::i32));
2659   SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
2660                                One, Zero, ISD::SETEQ);
2661   SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
2662                                One, Zero, ISD::SETGT);
2663   V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
2664   V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
2665 
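  // E > 30 means the magnitude overflows f16, so the result becomes infinity
  // (0x7c00). E == 1039 corresponds to an f64 biased exponent of 0x7ff
  // (2047 - 1023 + 15), i.e. the input was Inf or NaN, so use I instead.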
2666   V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
2667                       DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
2668   V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
2669                       I, V, ISD::SETEQ);
2670 
2671   // Extract the sign bit.
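  // The f64 sign is bit 63, i.e. bit 31 of the high word UH; shifting right by
  // 16 and masking with 0x8000 moves it to the f16 sign position (bit 15).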
2672   SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2673                             DAG.getConstant(16, DL, MVT::i32));
2674   Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
2675                      DAG.getConstant(0x8000, DL, MVT::i32));
2676 
2677   V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
2678   return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
2679 }
2680 
2681 SDValue AMDGPUTargetLowering::LowerFP_TO_INT(SDValue Op,
2682                                              SelectionDAG &DAG) const {
2683   SDValue Src = Op.getOperand(0);
2684   unsigned OpOpcode = Op.getOpcode();
2685   EVT SrcVT = Src.getValueType();
2686   EVT DestVT = Op.getValueType();
2687 
2688   // Will be selected natively
2689   if (SrcVT == MVT::f16 && DestVT == MVT::i16)
2690     return Op;
2691 
2692   // Promote i16 to i32
2693   if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
2694     SDLoc DL(Op);
2695 
2696     SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
2697     return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
2698   }
2699 
2700   if (SrcVT == MVT::f16 ||
2701       (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
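    // Convert through i32 first, then widen to the 64-bit result using the
    // extension that matches the signedness of the conversion.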
2702     SDLoc DL(Op);
2703 
2704     SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
2705     unsigned Ext =
2706         OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
2707     return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
2708   }
2709 
2710   if (DestVT == MVT::i64 && (SrcVT == MVT::f32 || SrcVT == MVT::f64))
2711     return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT);
2712 
2713   return SDValue();
2714 }
2715 
2716 SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
2717                                                      SelectionDAG &DAG) const {
2718   EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
2719   MVT VT = Op.getSimpleValueType();
2720   MVT ScalarVT = VT.getScalarType();
2721 
2722   assert(VT.isVector());
2723 
2724   SDValue Src = Op.getOperand(0);
2725   SDLoc DL(Op);
2726 
2727   // TODO: Don't scalarize on Evergreen?
2728   unsigned NElts = VT.getVectorNumElements();
2729   SmallVector<SDValue, 8> Args;
2730   DAG.ExtractVectorElements(Src, Args, 0, NElts);
2731 
2732   SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
2733   for (unsigned I = 0; I < NElts; ++I)
2734     Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
2735 
2736   return DAG.getBuildVector(VT, DL, Args);
2737 }
2738 
2739 //===----------------------------------------------------------------------===//
2740 // Custom DAG optimizations
2741 //===----------------------------------------------------------------------===//
2742 
2743 static bool isU24(SDValue Op, SelectionDAG &DAG) {
2744   return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
2745 }
2746 
2747 static bool isI24(SDValue Op, SelectionDAG &DAG) {
2748   EVT VT = Op.getValueType();
2749   return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
2750                                      // as unsigned 24-bit values.
2751          AMDGPUTargetLowering::numBitsSigned(Op, DAG) <= 24;
2752 }
2753 
2754 static SDValue simplifyMul24(SDNode *Node24,
2755                              TargetLowering::DAGCombinerInfo &DCI) {
2756   SelectionDAG &DAG = DCI.DAG;
2757   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
2758   bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
2759 
2760   SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
2761   SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
2762   unsigned NewOpcode = Node24->getOpcode();
2763   if (IsIntrin) {
2764     unsigned IID = cast<ConstantSDNode>(Node24->getOperand(0))->getZExtValue();
2765     switch (IID) {
2766     case Intrinsic::amdgcn_mul_i24:
2767       NewOpcode = AMDGPUISD::MUL_I24;
2768       break;
2769     case Intrinsic::amdgcn_mul_u24:
2770       NewOpcode = AMDGPUISD::MUL_U24;
2771       break;
2772     case Intrinsic::amdgcn_mulhi_i24:
2773       NewOpcode = AMDGPUISD::MULHI_I24;
2774       break;
2775     case Intrinsic::amdgcn_mulhi_u24:
2776       NewOpcode = AMDGPUISD::MULHI_U24;
2777       break;
2778     default:
2779       llvm_unreachable("Expected 24-bit mul intrinsic");
2780     }
2781   }
2782 
2783   APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
2784 
2785   // First try to simplify using SimplifyMultipleUseDemandedBits which allows
2786   // the operands to have other uses, but will only perform simplifications that
2787   // involve bypassing some nodes for this user.
2788   SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
2789   SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
2790   if (DemandedLHS || DemandedRHS)
2791     return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
2792                        DemandedLHS ? DemandedLHS : LHS,
2793                        DemandedRHS ? DemandedRHS : RHS);
2794 
2795   // Now try SimplifyDemandedBits which can simplify the nodes used by our
2796   // operands if this node is the only user.
2797   if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
2798     return SDValue(Node24, 0);
2799   if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
2800     return SDValue(Node24, 0);
2801 
2802   return SDValue();
2803 }
2804 
2805 template <typename IntTy>
2806 static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
2807                                uint32_t Width, const SDLoc &DL) {
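  // Emulate the BFE by shifting the field up to the top of the 32-bit value
  // and back down, so the right shift (arithmetic or logical, depending on
  // IntTy) performs the sign or zero extension of the extracted bits.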
2808   if (Width + Offset < 32) {
2809     uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
2810     IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
2811     return DAG.getConstant(Result, DL, MVT::i32);
2812   }
2813 
2814   return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
2815 }
2816 
2817 static bool hasVolatileUser(SDNode *Val) {
2818   for (SDNode *U : Val->uses()) {
2819     if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
2820       if (M->isVolatile())
2821         return true;
2822     }
2823   }
2824 
2825   return false;
2826 }
2827 
2828 bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
2829   // i32 vectors are the canonical memory type.
2830   if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
2831     return false;
2832 
2833   if (!VT.isByteSized())
2834     return false;
2835 
2836   unsigned Size = VT.getStoreSize();
2837 
2838   if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
2839     return false;
2840 
2841   if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
2842     return false;
2843 
2844   return true;
2845 }
2846 
2847 // Replace a load of an illegal type with a load of a friendlier type, then
2848 // bitcast the result back to the original type.
2849 SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
2850                                                  DAGCombinerInfo &DCI) const {
2851   if (!DCI.isBeforeLegalize())
2852     return SDValue();
2853 
2854   LoadSDNode *LN = cast<LoadSDNode>(N);
2855   if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
2856     return SDValue();
2857 
2858   SDLoc SL(N);
2859   SelectionDAG &DAG = DCI.DAG;
2860   EVT VT = LN->getMemoryVT();
2861 
2862   unsigned Size = VT.getStoreSize();
2863   Align Alignment = LN->getAlign();
2864   if (Alignment < Size && isTypeLegal(VT)) {
2865     bool IsFast;
2866     unsigned AS = LN->getAddressSpace();
2867 
2868     // Expand unaligned loads earlier than legalization. Due to visitation order
2869     // problems during legalization, the emitted instructions to pack and unpack
2870     // the bytes again are not eliminated in the case of an unaligned copy.
2871     if (!allowsMisalignedMemoryAccesses(
2872             VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
2873       if (VT.isVector())
2874         return SplitVectorLoad(SDValue(LN, 0), DAG);
2875 
2876       SDValue Ops[2];
2877       std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
2878 
2879       return DAG.getMergeValues(Ops, SDLoc(N));
2880     }
2881 
2882     if (!IsFast)
2883       return SDValue();
2884   }
2885 
2886   if (!shouldCombineMemoryType(VT))
2887     return SDValue();
2888 
2889   EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
2890 
2891   SDValue NewLoad
2892     = DAG.getLoad(NewVT, SL, LN->getChain(),
2893                   LN->getBasePtr(), LN->getMemOperand());
2894 
2895   SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
2896   DCI.CombineTo(N, BC, NewLoad.getValue(1));
2897   return SDValue(N, 0);
2898 }
2899 
2900 // Replace store of an illegal type with a store of a bitcast to a friendlier
2901 // type.
2902 SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
2903                                                   DAGCombinerInfo &DCI) const {
2904   if (!DCI.isBeforeLegalize())
2905     return SDValue();
2906 
2907   StoreSDNode *SN = cast<StoreSDNode>(N);
2908   if (!SN->isSimple() || !ISD::isNormalStore(SN))
2909     return SDValue();
2910 
2911   EVT VT = SN->getMemoryVT();
2912   unsigned Size = VT.getStoreSize();
2913 
2914   SDLoc SL(N);
2915   SelectionDAG &DAG = DCI.DAG;
2916   Align Alignment = SN->getAlign();
2917   if (Alignment < Size && isTypeLegal(VT)) {
2918     bool IsFast;
2919     unsigned AS = SN->getAddressSpace();
2920 
2921     // Expand unaligned stores earlier than legalization. Due to visitation
2922     // order problems during legalization, the emitted instructions to pack and
2923     // unpack the bytes again are not eliminated in the case of an unaligned
2924     // copy.
2925     if (!allowsMisalignedMemoryAccesses(
2926             VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) {
2927       if (VT.isVector())
2928         return SplitVectorStore(SDValue(SN, 0), DAG);
2929 
2930       return expandUnalignedStore(SN, DAG);
2931     }
2932 
2933     if (!IsFast)
2934       return SDValue();
2935   }
2936 
2937   if (!shouldCombineMemoryType(VT))
2938     return SDValue();
2939 
2940   EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
2941   SDValue Val = SN->getValue();
2942 
2943   //DCI.AddToWorklist(Val.getNode());
2944 
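  // If the stored value has other uses, rewrite them in terms of a bitcast
  // back from NewVT so everything flows through the new value.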
2945   bool OtherUses = !Val.hasOneUse();
2946   SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
2947   if (OtherUses) {
2948     SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
2949     DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
2950   }
2951 
2952   return DAG.getStore(SN->getChain(), SL, CastVal,
2953                       SN->getBasePtr(), SN->getMemOperand());
2954 }
2955 
2956 // FIXME: This should go in generic DAG combiner with an isTruncateFree check,
2957 // but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
2958 // issues.
2959 SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
2960                                                         DAGCombinerInfo &DCI) const {
2961   SelectionDAG &DAG = DCI.DAG;
2962   SDValue N0 = N->getOperand(0);
2963 
2964   // (vt2 (assertzext (truncate vt0:x), vt1)) ->
2965   //     (vt2 (truncate (assertzext vt0:x, vt1)))
2966   if (N0.getOpcode() == ISD::TRUNCATE) {
2967     SDValue N1 = N->getOperand(1);
2968     EVT ExtVT = cast<VTSDNode>(N1)->getVT();
2969     SDLoc SL(N);
2970 
2971     SDValue Src = N0.getOperand(0);
2972     EVT SrcVT = Src.getValueType();
2973     if (SrcVT.bitsGE(ExtVT)) {
2974       SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
2975       return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
2976     }
2977   }
2978 
2979   return SDValue();
2980 }
2981 
2982 SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
2983   SDNode *N, DAGCombinerInfo &DCI) const {
2984   unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
2985   switch (IID) {
2986   case Intrinsic::amdgcn_mul_i24:
2987   case Intrinsic::amdgcn_mul_u24:
2988   case Intrinsic::amdgcn_mulhi_i24:
2989   case Intrinsic::amdgcn_mulhi_u24:
2990     return simplifyMul24(N, DCI);
2991   case Intrinsic::amdgcn_fract:
2992   case Intrinsic::amdgcn_rsq:
2993   case Intrinsic::amdgcn_rcp_legacy:
2994   case Intrinsic::amdgcn_rsq_legacy:
2995   case Intrinsic::amdgcn_rsq_clamp:
2996   case Intrinsic::amdgcn_ldexp: {
2997     // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
2998     SDValue Src = N->getOperand(1);
2999     return Src.isUndef() ? Src : SDValue();
3000   }
3001   default:
3002     return SDValue();
3003   }
3004 }
3005 
3006 /// Split the 64-bit value \p LHS into two 32-bit components, and perform the
3007 /// binary operation \p Opc to it with the corresponding constant operands.
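/// For example, an i64 AND against a constant becomes two i32 ANDs against
/// lo_32 and hi_32 of the constant, rebuilt as a v2i32 build_vector and
/// bitcast back to i64.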
3008 SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
3009   DAGCombinerInfo &DCI, const SDLoc &SL,
3010   unsigned Opc, SDValue LHS,
3011   uint32_t ValLo, uint32_t ValHi) const {
3012   SelectionDAG &DAG = DCI.DAG;
3013   SDValue Lo, Hi;
3014   std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
3015 
3016   SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
3017   SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
3018 
3019   SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
3020   SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
3021 
3022   // Re-visit the split halves. It's possible one of the new operations was
3023   // folded away, and that could simplify the final vector.
3024   DCI.AddToWorklist(Lo.getNode());
3025   DCI.AddToWorklist(Hi.getNode());
3026 
3027   SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
3028   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3029 }
3030 
3031 SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
3032                                                 DAGCombinerInfo &DCI) const {
3033   EVT VT = N->getValueType(0);
3034 
3035   ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3036   if (!RHS)
3037     return SDValue();
3038 
3039   SDValue LHS = N->getOperand(0);
3040   unsigned RHSVal = RHS->getZExtValue();
3041   if (!RHSVal)
3042     return LHS;
3043 
3044   SDLoc SL(N);
3045   SelectionDAG &DAG = DCI.DAG;
3046 
3047   switch (LHS->getOpcode()) {
3048   default:
3049     break;
3050   case ISD::ZERO_EXTEND:
3051   case ISD::SIGN_EXTEND:
3052   case ISD::ANY_EXTEND: {
3053     SDValue X = LHS->getOperand(0);
3054 
3055     if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
3056         isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
3057       // Prefer build_vector as the canonical form if packed types are legal.
3058       // (shl ([asz]ext i16:x), 16) -> (build_vector 0, x)
3059       SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
3060        { DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) });
3061       return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
3062     }
3063 
3064     // shl (ext x) => zext (shl x), if shift does not overflow int
3065     if (VT != MVT::i64)
3066       break;
3067     KnownBits Known = DAG.computeKnownBits(X);
3068     unsigned LZ = Known.countMinLeadingZeros();
3069     if (LZ < RHSVal)
3070       break;
3071     EVT XVT = X.getValueType();
3072     SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
3073     return DAG.getZExtOrTrunc(Shl, SL, VT);
3074   }
3075   }
3076 
3077   if (VT != MVT::i64)
3078     return SDValue();
3079 
3080   // i64 (shl x, C) -> (build_pair 0, (shl x, C - 32))
3081 
3082   // On some subtargets, 64-bit shift is a quarter rate instruction. In the
3083   // common case, splitting this into a move and a 32-bit shift is faster and
3084   // the same code size.
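  // For example, (shl i64:x, 40) becomes (build_vector 0, (shl (trunc x), 8))
  // bitcast back to i64: the low word is zero and the high word is a 32-bit
  // shift by C - 32.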
3085   if (RHSVal < 32)
3086     return SDValue();
3087 
3088   SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
3089 
3090   SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
3091   SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
3092 
3093   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
3094 
3095   SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
3096   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
3097 }
3098 
3099 SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
3100                                                 DAGCombinerInfo &DCI) const {
3101   if (N->getValueType(0) != MVT::i64)
3102     return SDValue();
3103 
3104   const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3105   if (!RHS)
3106     return SDValue();
3107 
3108   SelectionDAG &DAG = DCI.DAG;
3109   SDLoc SL(N);
3110   unsigned RHSVal = RHS->getZExtValue();
3111 
3112   // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
3113   if (RHSVal == 32) {
3114     SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
3115     SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
3116                                    DAG.getConstant(31, SL, MVT::i32));
3117 
3118     SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
3119     return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
3120   }
3121 
3122   // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
3123   if (RHSVal == 63) {
3124     SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
3125     SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
3126                                    DAG.getConstant(31, SL, MVT::i32));
3127     SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
3128     return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
3129   }
3130 
3131   return SDValue();
3132 }
3133 
3134 SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
3135                                                 DAGCombinerInfo &DCI) const {
3136   auto *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
3137   if (!RHS)
3138     return SDValue();
3139 
3140   EVT VT = N->getValueType(0);
3141   SDValue LHS = N->getOperand(0);
3142   unsigned ShiftAmt = RHS->getZExtValue();
3143   SelectionDAG &DAG = DCI.DAG;
3144   SDLoc SL(N);
3145 
3146   // fold (srl (and x, c1 << c2), c2) -> (and (srl x, c2), c1)
3147   // This improves the ability to match BFE patterns in isel.
3148   if (LHS.getOpcode() == ISD::AND) {
3149     if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
3150       unsigned MaskIdx, MaskLen;
3151       if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
3152           MaskIdx == ShiftAmt) {
3153         return DAG.getNode(
3154             ISD::AND, SL, VT,
3155             DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0), N->getOperand(1)),
3156             DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1), N->getOperand(1)));
3157       }
3158     }
3159   }
3160 
3161   if (VT != MVT::i64)
3162     return SDValue();
3163 
3164   if (ShiftAmt < 32)
3165     return SDValue();
3166 
3167   // srl i64:x, C for C >= 32
3168   // =>
3169   //   build_pair (srl hi_32(x), C - 32), 0
3170   SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
3171 
3172   SDValue Hi = getHiHalf64(LHS, DAG);
3173 
3174   SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
3175   SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
3176 
3177   SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
3178 
3179   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
3180 }
3181 
3182 SDValue AMDGPUTargetLowering::performTruncateCombine(
3183   SDNode *N, DAGCombinerInfo &DCI) const {
3184   SDLoc SL(N);
3185   SelectionDAG &DAG = DCI.DAG;
3186   EVT VT = N->getValueType(0);
3187   SDValue Src = N->getOperand(0);
3188 
3189   // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
3190   if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
3191     SDValue Vec = Src.getOperand(0);
3192     if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
3193       SDValue Elt0 = Vec.getOperand(0);
3194       EVT EltVT = Elt0.getValueType();
3195       if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
3196         if (EltVT.isFloatingPoint()) {
3197           Elt0 = DAG.getNode(ISD::BITCAST, SL,
3198                              EltVT.changeTypeToInteger(), Elt0);
3199         }
3200 
3201         return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
3202       }
3203     }
3204   }
3205 
3206   // Equivalent of above for accessing the high element of a vector as an
3207   // integer operation.
3208   // trunc (srl (bitcast (build_vector x, y)), 16) -> trunc (bitcast y)
3209   if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
3210     if (auto K = isConstOrConstSplat(Src.getOperand(1))) {
3211       if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) {
3212         SDValue BV = stripBitcast(Src.getOperand(0));
3213         if (BV.getOpcode() == ISD::BUILD_VECTOR &&
3214             BV.getValueType().getVectorNumElements() == 2) {
3215           SDValue SrcElt = BV.getOperand(1);
3216           EVT SrcEltVT = SrcElt.getValueType();
3217           if (SrcEltVT.isFloatingPoint()) {
3218             SrcElt = DAG.getNode(ISD::BITCAST, SL,
3219                                  SrcEltVT.changeTypeToInteger(), SrcElt);
3220           }
3221 
3222           return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
3223         }
3224       }
3225     }
3226   }
3227 
3228   // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
3229   //
3230   // i16 (trunc (srl i64:x, K)), K <= 16 ->
3231   //     i16 (trunc (srl (i32 (trunc x), K)))
3232   if (VT.getScalarSizeInBits() < 32) {
3233     EVT SrcVT = Src.getValueType();
3234     if (SrcVT.getScalarSizeInBits() > 32 &&
3235         (Src.getOpcode() == ISD::SRL ||
3236          Src.getOpcode() == ISD::SRA ||
3237          Src.getOpcode() == ISD::SHL)) {
3238       SDValue Amt = Src.getOperand(1);
3239       KnownBits Known = DAG.computeKnownBits(Amt);
3240       unsigned Size = VT.getScalarSizeInBits();
3241       if ((Known.isConstant() && Known.getConstant().ule(Size)) ||
3242           (Known.countMaxActiveBits() <= Log2_32(Size))) {
3243         EVT MidVT = VT.isVector() ?
3244           EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3245                            VT.getVectorNumElements()) : MVT::i32;
3246 
3247         EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
3248         SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
3249                                     Src.getOperand(0));
3250         DCI.AddToWorklist(Trunc.getNode());
3251 
3252         if (Amt.getValueType() != NewShiftVT) {
3253           Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
3254           DCI.AddToWorklist(Amt.getNode());
3255         }
3256 
3257         SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
3258                                           Trunc, Amt);
3259         return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
3260       }
3261     }
3262   }
3263 
3264   return SDValue();
3265 }
3266 
3267 // We need to specifically handle i64 mul here to avoid unnecessary conversion
3268 // instructions. If we only match on the legalized i64 mul expansion,
3269 // SimplifyDemandedBits will be unable to remove them because there will be
3270 // multiple uses due to the separate mul + mulh[su].
3271 static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
3272                         SDValue N0, SDValue N1, unsigned Size, bool Signed) {
3273   if (Size <= 32) {
3274     unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
3275     return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
3276   }
3277 
3278   unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
3279   unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
3280 
3281   SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
3282   SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
3283 
3284   return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi);
3285 }
3286 
3287 SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
3288                                                 DAGCombinerInfo &DCI) const {
3289   EVT VT = N->getValueType(0);
3290 
3291   // Don't generate 24-bit multiplies on values that are in SGPRs, since
3292   // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
3293   // unnecessarily). isDivergent() is used as an approximation of whether the
3294   // value is in an SGPR.
3295   if (!N->isDivergent())
3296     return SDValue();
3297 
3298   unsigned Size = VT.getSizeInBits();
3299   if (VT.isVector() || Size > 64)
3300     return SDValue();
3301 
3302   // There are native i16 integer mul/mad instructions; skip the 24-bit path.
3303   if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
3304     return SDValue();
3305 
3306   SelectionDAG &DAG = DCI.DAG;
3307   SDLoc DL(N);
3308 
3309   SDValue N0 = N->getOperand(0);
3310   SDValue N1 = N->getOperand(1);
3311 
3312   // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
3313   // in the source into any_extends if the result of the mul is truncated. Since
3314   // we can assume the high bits are whatever we want, use the underlying value
3315   // to keep the unknown high bits from interfering.
3316   if (N0.getOpcode() == ISD::ANY_EXTEND)
3317     N0 = N0.getOperand(0);
3318 
3319   if (N1.getOpcode() == ISD::ANY_EXTEND)
3320     N1 = N1.getOperand(0);
3321 
3322   SDValue Mul;
3323 
3324   if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
3325     N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3326     N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3327     Mul = getMul24(DAG, DL, N0, N1, Size, false);
3328   } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
3329     N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3330     N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3331     Mul = getMul24(DAG, DL, N0, N1, Size, true);
3332   } else {
3333     return SDValue();
3334   }
3335 
3336   // We need to use sext even for MUL_U24, because MUL_U24 is used
3337   // for signed multiply of 8 and 16-bit types.
3338   return DAG.getSExtOrTrunc(Mul, DL, VT);
3339 }
3340 
3341 SDValue
3342 AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N,
3343                                             DAGCombinerInfo &DCI) const {
3344   if (N->getValueType(0) != MVT::i32)
3345     return SDValue();
3346 
3347   SelectionDAG &DAG = DCI.DAG;
3348   SDLoc DL(N);
3349 
3350   SDValue N0 = N->getOperand(0);
3351   SDValue N1 = N->getOperand(1);
3352 
3353   // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
3354   // in the source into any_extends if the result of the mul is truncated. Since
3355   // we can assume the high bits are whatever we want, use the underlying value
3356   // to keep the unknown high bits from interfering.
3357   if (N0.getOpcode() == ISD::ANY_EXTEND)
3358     N0 = N0.getOperand(0);
3359   if (N1.getOpcode() == ISD::ANY_EXTEND)
3360     N1 = N1.getOperand(0);
3361 
3362   // Try to use two fast 24-bit multiplies (one for each half of the result)
3363   // instead of one slow extending multiply.
3364   unsigned LoOpcode, HiOpcode;
3365   if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
3366     N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3367     N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3368     LoOpcode = AMDGPUISD::MUL_U24;
3369     HiOpcode = AMDGPUISD::MULHI_U24;
3370   } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
3371     N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3372     N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3373     LoOpcode = AMDGPUISD::MUL_I24;
3374     HiOpcode = AMDGPUISD::MULHI_I24;
3375   } else {
3376     return SDValue();
3377   }
3378 
3379   SDValue Lo = DAG.getNode(LoOpcode, DL, MVT::i32, N0, N1);
3380   SDValue Hi = DAG.getNode(HiOpcode, DL, MVT::i32, N0, N1);
3381   DCI.CombineTo(N, Lo, Hi);
3382   return SDValue(N, 0);
3383 }
3384 
3385 SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
3386                                                   DAGCombinerInfo &DCI) const {
3387   EVT VT = N->getValueType(0);
3388 
3389   if (!Subtarget->hasMulI24() || VT.isVector())
3390     return SDValue();
3391 
3392   // Don't generate 24-bit multiplies on values that are in SGPRs, since
3393   // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
3394   // unnecessarily). isDivergent() is used as an approximation of whether the
3395   // value is in an SGPR.
3396   // This doesn't apply if no s_mul_hi is available (since we'll end up with a
3397   // VALU op anyway).
3398   if (Subtarget->hasSMulHi() && !N->isDivergent())
3399     return SDValue();
3400 
3401   SelectionDAG &DAG = DCI.DAG;
3402   SDLoc DL(N);
3403 
3404   SDValue N0 = N->getOperand(0);
3405   SDValue N1 = N->getOperand(1);
3406 
3407   if (!isI24(N0, DAG) || !isI24(N1, DAG))
3408     return SDValue();
3409 
3410   N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
3411   N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
3412 
3413   SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
3414   DCI.AddToWorklist(Mulhi.getNode());
3415   return DAG.getSExtOrTrunc(Mulhi, DL, VT);
3416 }
3417 
3418 SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
3419                                                   DAGCombinerInfo &DCI) const {
3420   EVT VT = N->getValueType(0);
3421 
3422   if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
3423     return SDValue();
3424 
3425   // Don't generate 24-bit multiplies on values that are in SGPRs, since
3426   // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
3427   // unnecessarily). isDivergent() is used as an approximation of whether the
3428   // value is in an SGPR.
3429   // This doesn't apply if no s_mul_hi is available (since we'll end up with a
3430   // VALU op anyway).
3431   if (Subtarget->hasSMulHi() && !N->isDivergent())
3432     return SDValue();
3433 
3434   SelectionDAG &DAG = DCI.DAG;
3435   SDLoc DL(N);
3436 
3437   SDValue N0 = N->getOperand(0);
3438   SDValue N1 = N->getOperand(1);
3439 
3440   if (!isU24(N0, DAG) || !isU24(N1, DAG))
3441     return SDValue();
3442 
3443   N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
3444   N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
3445 
3446   SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
3447   DCI.AddToWorklist(Mulhi.getNode());
3448   return DAG.getZExtOrTrunc(Mulhi, DL, VT);
3449 }
3450 
3451 static bool isNegativeOne(SDValue Val) {
3452   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val))
3453     return C->isAllOnes();
3454   return false;
3455 }
3456 
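// Emit the ffbh/ffbl node given by Opc, widening sub-32-bit inputs to i32
// first and truncating the result back to the original type.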
3457 SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
3458                                           SDValue Op,
3459                                           const SDLoc &DL,
3460                                           unsigned Opc) const {
3461   EVT VT = Op.getValueType();
3462   EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
3463   if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
3464                               LegalVT != MVT::i16))
3465     return SDValue();
3466 
3467   if (VT != MVT::i32)
3468     Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
3469 
3470   SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
3471   if (VT != MVT::i32)
3472     FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
3473 
3474   return FFBX;
3475 }
3476 
3477 // The native instructions return -1 on 0 input. Optimize out a select that
3478 // produces -1 on 0.
3479 //
3480 // TODO: If zero is not undef, we could also do this if the output is compared
3481 // against the bitwidth.
3482 //
3483 // TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
3484 SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
3485                                                  SDValue LHS, SDValue RHS,
3486                                                  DAGCombinerInfo &DCI) const {
3487   ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
3488   if (!CmpRhs || !CmpRhs->isZero())
3489     return SDValue();
3490 
3491   SelectionDAG &DAG = DCI.DAG;
3492   ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
3493   SDValue CmpLHS = Cond.getOperand(0);
3494 
3495   // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
3496   // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
3497   if (CCOpcode == ISD::SETEQ &&
3498       (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
3499       RHS.getOperand(0) == CmpLHS && isNegativeOne(LHS)) {
3500     unsigned Opc =
3501         isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
3502     return getFFBX_U32(DAG, CmpLHS, SL, Opc);
3503   }
3504 
3505   // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
3506   // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
3507   if (CCOpcode == ISD::SETNE &&
3508       (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
3509       LHS.getOperand(0) == CmpLHS && isNegativeOne(RHS)) {
3510     unsigned Opc =
3511         isCttzOpc(LHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
3512 
3513     return getFFBX_U32(DAG, CmpLHS, SL, Opc);
3514   }
3515 
3516   return SDValue();
3517 }
3518 
3519 static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
3520                                          unsigned Op,
3521                                          const SDLoc &SL,
3522                                          SDValue Cond,
3523                                          SDValue N1,
3524                                          SDValue N2) {
3525   SelectionDAG &DAG = DCI.DAG;
3526   EVT VT = N1.getValueType();
3527 
3528   SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
3529                                   N1.getOperand(0), N2.getOperand(0));
3530   DCI.AddToWorklist(NewSelect.getNode());
3531   return DAG.getNode(Op, SL, VT, NewSelect);
3532 }
3533 
3534 // Pull a free FP operation out of a select so it may fold into uses.
3535 //
3536 // select c, (fneg x), (fneg y) -> fneg (select c, x, y)
3537 // select c, (fneg x), k -> fneg (select c, x, (fneg k))
3538 //
3539 // select c, (fabs x), (fabs y) -> fabs (select c, x, y)
3540 // select c, (fabs x), +k -> fabs (select c, x, k)
3541 static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
3542                                     SDValue N) {
3543   SelectionDAG &DAG = DCI.DAG;
3544   SDValue Cond = N.getOperand(0);
3545   SDValue LHS = N.getOperand(1);
3546   SDValue RHS = N.getOperand(2);
3547 
3548   EVT VT = N.getValueType();
3549   if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
3550       (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
3551     return distributeOpThroughSelect(DCI, LHS.getOpcode(),
3552                                      SDLoc(N), Cond, LHS, RHS);
3553   }
3554 
3555   bool Inv = false;
3556   if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
3557     std::swap(LHS, RHS);
3558     Inv = true;
3559   }
3560 
3561   // TODO: Support vector constants.
3562   ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
3563   if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS) {
3564     SDLoc SL(N);
3565     // If one side is an fneg/fabs and the other is a constant, we can push the
3566     // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
3567     SDValue NewLHS = LHS.getOperand(0);
3568     SDValue NewRHS = RHS;
3569 
3570     // Careful: if the neg can be folded up, don't try to pull it back down.
3571     bool ShouldFoldNeg = true;
3572 
3573     if (NewLHS.hasOneUse()) {
3574       unsigned Opc = NewLHS.getOpcode();
3575       if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(Opc))
3576         ShouldFoldNeg = false;
3577       if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
3578         ShouldFoldNeg = false;
3579     }
3580 
3581     if (ShouldFoldNeg) {
3582       if (LHS.getOpcode() == ISD::FNEG)
3583         NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3584       else if (CRHS->isNegative())
3585         return SDValue();
3586 
3587       if (Inv)
3588         std::swap(NewLHS, NewRHS);
3589 
3590       SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
3591                                       Cond, NewLHS, NewRHS);
3592       DCI.AddToWorklist(NewSelect.getNode());
3593       return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
3594     }
3595   }
3596 
3597   return SDValue();
3598 }
3599 
3600 
3601 SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
3602                                                    DAGCombinerInfo &DCI) const {
3603   if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
3604     return Folded;
3605 
3606   SDValue Cond = N->getOperand(0);
3607   if (Cond.getOpcode() != ISD::SETCC)
3608     return SDValue();
3609 
3610   EVT VT = N->getValueType(0);
3611   SDValue LHS = Cond.getOperand(0);
3612   SDValue RHS = Cond.getOperand(1);
3613   SDValue CC = Cond.getOperand(2);
3614 
3615   SDValue True = N->getOperand(1);
3616   SDValue False = N->getOperand(2);
3617 
3618   if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
3619     SelectionDAG &DAG = DCI.DAG;
3620     if (DAG.isConstantValueOfAnyType(True) &&
3621         !DAG.isConstantValueOfAnyType(False)) {
3622       // Swap cmp + select pair to move constant to false input.
3623       // This will allow using VOPC cndmasks more often.
3624       // select (setcc x, y), k, x -> select (setccinv x, y), x, k
3625 
3626       SDLoc SL(N);
3627       ISD::CondCode NewCC =
3628           getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), LHS.getValueType());
3629 
3630       SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
3631       return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
3632     }
3633 
3634     if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
3635       SDValue MinMax
3636         = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
3637       // Revisit this node so we can catch min3/max3/med3 patterns.
3638       //DCI.AddToWorklist(MinMax.getNode());
3639       return MinMax;
3640     }
3641   }
3642 
3643   // There's no reason not to do this even if the condition has other uses.
3644   return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
3645 }
3646 
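// Match the bit patterns of 1.0 / (2.0 * pi) in half, single, and double
// precision.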
3647 static bool isInv2Pi(const APFloat &APF) {
3648   static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
3649   static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
3650   static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
3651 
3652   return APF.bitwiseIsEqual(KF16) ||
3653          APF.bitwiseIsEqual(KF32) ||
3654          APF.bitwiseIsEqual(KF64);
3655 }
3656 
3657 // +0.0 and 1.0 / (2.0 * pi) do not have negated inline immediates, so there
3658 // is an additional cost to negate them.
3659 bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const {
3660   if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N)) {
3661     if (C->isZero() && !C->isNegative())
3662       return true;
3663 
3664     if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
3665       return true;
3666   }
3667 
3668   return false;
3669 }
3670 
3671 static unsigned inverseMinMax(unsigned Opc) {
3672   switch (Opc) {
3673   case ISD::FMAXNUM:
3674     return ISD::FMINNUM;
3675   case ISD::FMINNUM:
3676     return ISD::FMAXNUM;
3677   case ISD::FMAXNUM_IEEE:
3678     return ISD::FMINNUM_IEEE;
3679   case ISD::FMINNUM_IEEE:
3680     return ISD::FMAXNUM_IEEE;
3681   case AMDGPUISD::FMAX_LEGACY:
3682     return AMDGPUISD::FMIN_LEGACY;
3683   case AMDGPUISD::FMIN_LEGACY:
3684     return  AMDGPUISD::FMAX_LEGACY;
3685   default:
3686     llvm_unreachable("invalid min/max opcode");
3687   }
3688 }
3689 
3690 SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
3691                                                  DAGCombinerInfo &DCI) const {
3692   SelectionDAG &DAG = DCI.DAG;
3693   SDValue N0 = N->getOperand(0);
3694   EVT VT = N->getValueType(0);
3695 
3696   unsigned Opc = N0.getOpcode();
3697 
3698   // If the input has multiple uses and we can either fold the negate down, or
3699   // the other uses cannot, give up. This both prevents unprofitable
3700   // transformations and infinite loops: we won't repeatedly try to fold around
3701   // a negate that has no 'good' form.
3702   if (N0.hasOneUse()) {
3703     // This may be able to fold into the source, but at a code size cost. Don't
3704     // fold if the fold into the user is free.
3705     if (allUsesHaveSourceMods(N, 0))
3706       return SDValue();
3707   } else {
3708     if (fnegFoldsIntoOp(Opc) &&
3709         (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode())))
3710       return SDValue();
3711   }
3712 
3713   SDLoc SL(N);
3714   switch (Opc) {
3715   case ISD::FADD: {
3716     if (!mayIgnoreSignedZero(N0))
3717       return SDValue();
3718 
3719     // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
3720     SDValue LHS = N0.getOperand(0);
3721     SDValue RHS = N0.getOperand(1);
3722 
3723     if (LHS.getOpcode() != ISD::FNEG)
3724       LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
3725     else
3726       LHS = LHS.getOperand(0);
3727 
3728     if (RHS.getOpcode() != ISD::FNEG)
3729       RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3730     else
3731       RHS = RHS.getOperand(0);
3732 
3733     SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
3734     if (Res.getOpcode() != ISD::FADD)
3735       return SDValue(); // Op got folded away.
3736     if (!N0.hasOneUse())
3737       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3738     return Res;
3739   }
3740   case ISD::FMUL:
3741   case AMDGPUISD::FMUL_LEGACY: {
3742     // (fneg (fmul x, y)) -> (fmul x, (fneg y))
3743     // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
3744     SDValue LHS = N0.getOperand(0);
3745     SDValue RHS = N0.getOperand(1);
3746 
3747     if (LHS.getOpcode() == ISD::FNEG)
3748       LHS = LHS.getOperand(0);
3749     else if (RHS.getOpcode() == ISD::FNEG)
3750       RHS = RHS.getOperand(0);
3751     else
3752       RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3753 
3754     SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
3755     if (Res.getOpcode() != Opc)
3756       return SDValue(); // Op got folded away.
3757     if (!N0.hasOneUse())
3758       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3759     return Res;
3760   }
3761   case ISD::FMA:
3762   case ISD::FMAD: {
3763     // TODO: handle llvm.amdgcn.fma.legacy
3764     if (!mayIgnoreSignedZero(N0))
3765       return SDValue();
3766 
3767     // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
3768     SDValue LHS = N0.getOperand(0);
3769     SDValue MHS = N0.getOperand(1);
3770     SDValue RHS = N0.getOperand(2);
3771 
3772     if (LHS.getOpcode() == ISD::FNEG)
3773       LHS = LHS.getOperand(0);
3774     else if (MHS.getOpcode() == ISD::FNEG)
3775       MHS = MHS.getOperand(0);
3776     else
3777       MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
3778 
3779     if (RHS.getOpcode() != ISD::FNEG)
3780       RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3781     else
3782       RHS = RHS.getOperand(0);
3783 
3784     SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
3785     if (Res.getOpcode() != Opc)
3786       return SDValue(); // Op got folded away.
3787     if (!N0.hasOneUse())
3788       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3789     return Res;
3790   }
3791   case ISD::FMAXNUM:
3792   case ISD::FMINNUM:
3793   case ISD::FMAXNUM_IEEE:
3794   case ISD::FMINNUM_IEEE:
3795   case AMDGPUISD::FMAX_LEGACY:
3796   case AMDGPUISD::FMIN_LEGACY: {
3797     // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
3798     // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
3799     // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
3800     // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
3801 
3802     SDValue LHS = N0.getOperand(0);
3803     SDValue RHS = N0.getOperand(1);
3804 
3805     // 0 doesn't have a negated inline immediate.
3806     // TODO: This constant check should be generalized to other operations.
3807     if (isConstantCostlierToNegate(RHS))
3808       return SDValue();
3809 
3810     SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
3811     SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3812     unsigned Opposite = inverseMinMax(Opc);
3813 
3814     SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
3815     if (Res.getOpcode() != Opposite)
3816       return SDValue(); // Op got folded away.
3817     if (!N0.hasOneUse())
3818       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3819     return Res;
3820   }
3821   case AMDGPUISD::FMED3: {
3822     SDValue Ops[3];
3823     for (unsigned I = 0; I < 3; ++I)
3824       Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags());
3825 
3826     SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags());
3827     if (Res.getOpcode() != AMDGPUISD::FMED3)
3828       return SDValue(); // Op got folded away.
3829 
3830     if (!N0.hasOneUse()) {
3831       SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Res);
3832       DAG.ReplaceAllUsesWith(N0, Neg);
3833 
3834       for (SDNode *U : Neg->uses())
3835         DCI.AddToWorklist(U);
3836     }
3837 
3838     return Res;
3839   }
3840   case ISD::FP_EXTEND:
3841   case ISD::FTRUNC:
3842   case ISD::FRINT:
3843   case ISD::FNEARBYINT: // XXX - Should fround be handled?
3844   case ISD::FSIN:
3845   case ISD::FCANONICALIZE:
3846   case AMDGPUISD::RCP:
3847   case AMDGPUISD::RCP_LEGACY:
3848   case AMDGPUISD::RCP_IFLAG:
3849   case AMDGPUISD::SIN_HW: {
3850     SDValue CvtSrc = N0.getOperand(0);
3851     if (CvtSrc.getOpcode() == ISD::FNEG) {
3852       // (fneg (fp_extend (fneg x))) -> (fp_extend x)
3853       // (fneg (rcp (fneg x))) -> (rcp x)
3854       return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
3855     }
3856 
3857     if (!N0.hasOneUse())
3858       return SDValue();
3859 
3860     // (fneg (fp_extend x)) -> (fp_extend (fneg x))
3861     // (fneg (rcp x)) -> (rcp (fneg x))
3862     SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
3863     return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
3864   }
3865   case ISD::FP_ROUND: {
3866     SDValue CvtSrc = N0.getOperand(0);
3867 
3868     if (CvtSrc.getOpcode() == ISD::FNEG) {
3869       // (fneg (fp_round (fneg x))) -> (fp_round x)
3870       return DAG.getNode(ISD::FP_ROUND, SL, VT,
3871                          CvtSrc.getOperand(0), N0.getOperand(1));
3872     }
3873 
3874     if (!N0.hasOneUse())
3875       return SDValue();
3876 
3877     // (fneg (fp_round x)) -> (fp_round (fneg x))
3878     SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
3879     return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
3880   }
3881   case ISD::FP16_TO_FP: {
3882     // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
3883     // f16, but legalization of f16 fneg ends up pulling it out of the source.
3884     // Put the fneg back as a legal source operation that can be matched later.
3885     SDLoc SL(N);
3886 
3887     SDValue Src = N0.getOperand(0);
3888     EVT SrcVT = Src.getValueType();
3889 
3890     // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
3891     SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
3892                                   DAG.getConstant(0x8000, SL, SrcVT));
3893     return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
3894   }
3895   default:
3896     return SDValue();
3897   }
3898 }
3899 
3900 SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
3901                                                  DAGCombinerInfo &DCI) const {
3902   SelectionDAG &DAG = DCI.DAG;
3903   SDValue N0 = N->getOperand(0);
3904 
3905   if (!N0.hasOneUse())
3906     return SDValue();
3907 
3908   switch (N0.getOpcode()) {
3909   case ISD::FP16_TO_FP: {
3910     assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
3911     SDLoc SL(N);
3912     SDValue Src = N0.getOperand(0);
3913     EVT SrcVT = Src.getValueType();
3914 
3915     // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
3916     SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
3917                                   DAG.getConstant(0x7fff, SL, SrcVT));
3918     return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
3919   }
3920   default:
3921     return SDValue();
3922   }
3923 }
3924 
3925 SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
3926                                                 DAGCombinerInfo &DCI) const {
3927   const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
3928   if (!CFP)
3929     return SDValue();
3930 
3931   // XXX - Should this flush denormals?
3932   const APFloat &Val = CFP->getValueAPF();
3933   APFloat One(Val.getSemantics(), "1.0");
3934   return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
3935 }
3936 
3937 SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
3938                                                 DAGCombinerInfo &DCI) const {
3939   SelectionDAG &DAG = DCI.DAG;
3940   SDLoc DL(N);
3941 
3942   switch(N->getOpcode()) {
3943   default:
3944     break;
3945   case ISD::BITCAST: {
3946     EVT DestVT = N->getValueType(0);
3947 
3948     // Push casts through vector builds. This helps avoid emitting a large
3949     // number of copies when materializing floating point vector constants.
3950     //
3951     // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
3952     //   vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
3953     if (DestVT.isVector()) {
3954       SDValue Src = N->getOperand(0);
3955       if (Src.getOpcode() == ISD::BUILD_VECTOR) {
3956         EVT SrcVT = Src.getValueType();
3957         unsigned NElts = DestVT.getVectorNumElements();
3958 
3959         if (SrcVT.getVectorNumElements() == NElts) {
3960           EVT DestEltVT = DestVT.getVectorElementType();
3961 
3962           SmallVector<SDValue, 8> CastedElts;
3963           SDLoc SL(N);
3964           for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
3965             SDValue Elt = Src.getOperand(I);
3966             CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
3967           }
3968 
3969           return DAG.getBuildVector(DestVT, SL, CastedElts);
3970         }
3971       }
3972     }
3973 
3974     if (DestVT.getSizeInBits() != 64 || !DestVT.isVector())
3975       break;
3976 
3977     // Fold bitcasts of constants.
3978     //
3979     // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
3980     // TODO: Generalize and move to DAGCombiner
3981     SDValue Src = N->getOperand(0);
3982     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
3983       SDLoc SL(N);
3984       uint64_t CVal = C->getZExtValue();
3985       SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
3986                                DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
3987                                DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
3988       return DAG.getNode(ISD::BITCAST, SL, DestVT, BV);
3989     }
3990 
3991     if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
3992       const APInt &Val = C->getValueAPF().bitcastToAPInt();
3993       SDLoc SL(N);
3994       uint64_t CVal = Val.getZExtValue();
3995       SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
3996                                 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
3997                                 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
3998 
3999       return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
4000     }
4001 
4002     break;
4003   }
4004   case ISD::SHL: {
4005     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
4006       break;
4007 
4008     return performShlCombine(N, DCI);
4009   }
4010   case ISD::SRL: {
4011     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
4012       break;
4013 
4014     return performSrlCombine(N, DCI);
4015   }
4016   case ISD::SRA: {
4017     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
4018       break;
4019 
4020     return performSraCombine(N, DCI);
4021   }
4022   case ISD::TRUNCATE:
4023     return performTruncateCombine(N, DCI);
4024   case ISD::MUL:
4025     return performMulCombine(N, DCI);
4026   case ISD::SMUL_LOHI:
4027   case ISD::UMUL_LOHI:
4028     return performMulLoHiCombine(N, DCI);
4029   case ISD::MULHS:
4030     return performMulhsCombine(N, DCI);
4031   case ISD::MULHU:
4032     return performMulhuCombine(N, DCI);
4033   case AMDGPUISD::MUL_I24:
4034   case AMDGPUISD::MUL_U24:
4035   case AMDGPUISD::MULHI_I24:
4036   case AMDGPUISD::MULHI_U24:
4037     return simplifyMul24(N, DCI);
4038   case ISD::SELECT:
4039     return performSelectCombine(N, DCI);
4040   case ISD::FNEG:
4041     return performFNegCombine(N, DCI);
4042   case ISD::FABS:
4043     return performFAbsCombine(N, DCI);
4044   case AMDGPUISD::BFE_I32:
4045   case AMDGPUISD::BFE_U32: {
4046     assert(!N->getValueType(0).isVector() &&
4047            "Vector handling of BFE not implemented");
4048     ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
4049     if (!Width)
4050       break;
4051 
4052     uint32_t WidthVal = Width->getZExtValue() & 0x1f;
4053     if (WidthVal == 0)
4054       return DAG.getConstant(0, DL, MVT::i32);
4055 
4056     ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
4057     if (!Offset)
4058       break;
4059 
4060     SDValue BitsFrom = N->getOperand(0);
4061     uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
4062 
4063     bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
4064 
4065     if (OffsetVal == 0) {
4066       // This is already sign / zero extended, so try to fold away extra BFEs.
4067       unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
4068 
4069       unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
4070       if (OpSignBits >= SignBits)
4071         return BitsFrom;
4072 
4073       EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
4074       if (Signed) {
4075         // This is a sign_extend_inreg. Replace it to take advantage of existing
4076         // DAG Combines. If not eliminated, we will match back to BFE during
4077         // selection.
4078 
        // TODO: The sext_inreg of extended types is not handled here, although
        // we could handle them in a single BFE.
4081         return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
4082                            DAG.getValueType(SmallVT));
4083       }
4084 
4085       return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
4086     }
4087 
4088     if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
4089       if (Signed) {
        return constantFoldBFE<int32_t>(DAG, CVal->getSExtValue(), OffsetVal,
                                        WidthVal, DL);
      }

      return constantFoldBFE<uint32_t>(DAG, CVal->getZExtValue(), OffsetVal,
                                       WidthVal, DL);
4102     }
4103 
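    // If the field extends through the top bit, the BFE reduces to a plain
    // shift (arithmetic for the signed form). Leave the 16/16 case alone when
    // SDWA can extract the high half directly.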
4104     if ((OffsetVal + WidthVal) >= 32 &&
4105         !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
4106       SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
4107       return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
4108                          BitsFrom, ShiftVal);
4109     }
4110 
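    // With a single use, simplify the source based on only the bits the BFE
    // actually reads.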
4111     if (BitsFrom.hasOneUse()) {
      APInt Demanded = APInt::getBitsSet(32, OffsetVal, OffsetVal + WidthVal);
4115 
4116       KnownBits Known;
4117       TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
4118                                             !DCI.isBeforeLegalizeOps());
4119       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
4120       if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
4121           TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
4122         DCI.CommitTargetLoweringOpt(TLO);
4123       }
4124     }
4125 
4126     break;
4127   }
4128   case ISD::LOAD:
4129     return performLoadCombine(N, DCI);
4130   case ISD::STORE:
4131     return performStoreCombine(N, DCI);
4132   case AMDGPUISD::RCP:
4133   case AMDGPUISD::RCP_IFLAG:
4134     return performRcpCombine(N, DCI);
4135   case ISD::AssertZext:
4136   case ISD::AssertSext:
4137     return performAssertSZExtCombine(N, DCI);
4138   case ISD::INTRINSIC_WO_CHAIN:
4139     return performIntrinsicWOChainCombine(N, DCI);
4140   }
4141   return SDValue();
4142 }
4143 
4144 //===----------------------------------------------------------------------===//
4145 // Helper functions
4146 //===----------------------------------------------------------------------===//
4147 
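// Return the value of the given live-in physical register, creating the
// virtual register and live-in entry on first use. If RawReg is true, return
// the virtual register directly rather than a copy from the entry node.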
4148 SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
4149                                                    const TargetRegisterClass *RC,
4150                                                    Register Reg, EVT VT,
4151                                                    const SDLoc &SL,
4152                                                    bool RawReg) const {
4153   MachineFunction &MF = DAG.getMachineFunction();
4154   MachineRegisterInfo &MRI = MF.getRegInfo();
4155   Register VReg;
4156 
4157   if (!MRI.isLiveIn(Reg)) {
4158     VReg = MRI.createVirtualRegister(RC);
4159     MRI.addLiveIn(Reg, VReg);
4160   } else {
4161     VReg = MRI.getLiveInVirtReg(Reg);
4162   }
4163 
4164   if (RawReg)
4165     return DAG.getRegister(VReg, VT);
4166 
4167   return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
4168 }
4169 
4170 // This may be called multiple times, and nothing prevents creating multiple
4171 // objects at the same offset. See if we already defined this object.
4172 static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size,
4173                                        int64_t Offset) {
4174   for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
4175     if (MFI.getObjectOffset(I) == Offset) {
4176       assert(MFI.getObjectSize(I) == Size);
4177       return I;
4178     }
4179   }
4180 
4181   return MFI.CreateFixedObject(Size, Offset, true);
4182 }
4183 
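// Load an incoming value passed on the stack at a fixed offset, using a fixed
// frame object and a dereferenceable, invariant load.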
4184 SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
4185                                                   EVT VT,
4186                                                   const SDLoc &SL,
4187                                                   int64_t Offset) const {
4188   MachineFunction &MF = DAG.getMachineFunction();
4189   MachineFrameInfo &MFI = MF.getFrameInfo();
4190   int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset);
4191 
4192   auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
4193   SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
4194 
4195   return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, Align(4),
4196                      MachineMemOperand::MODereferenceable |
4197                          MachineMemOperand::MOInvariant);
4198 }
4199 
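// Store an outgoing value to the argument stack area at the given offset from
// the stack pointer.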
4200 SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
4201                                                    const SDLoc &SL,
4202                                                    SDValue Chain,
4203                                                    SDValue ArgVal,
4204                                                    int64_t Offset) const {
4205   MachineFunction &MF = DAG.getMachineFunction();
4206   MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
4207   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4208 
4209   SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
4210   // Stores to the argument stack area are relative to the stack pointer.
4211   SDValue SP =
4212       DAG.getCopyFromReg(Chain, SL, Info->getStackPtrOffsetReg(), MVT::i32);
4213   Ptr = DAG.getNode(ISD::ADD, SL, MVT::i32, SP, Ptr);
4214   SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, Align(4),
4215                                MachineMemOperand::MODereferenceable);
4216   return Store;
4217 }
4218 
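// Load a function input that is passed either in a register or on the stack.
// If the value is packed into only part of its register, shift it down and
// mask off the unused high bits.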
4219 SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
4220                                              const TargetRegisterClass *RC,
4221                                              EVT VT, const SDLoc &SL,
4222                                              const ArgDescriptor &Arg) const {
4223   assert(Arg && "Attempting to load missing argument");
4224 
4225   SDValue V = Arg.isRegister() ?
4226     CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) :
4227     loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
4228 
4229   if (!Arg.isMasked())
4230     return V;
4231 
4232   unsigned Mask = Arg.getMask();
4233   unsigned Shift = countTrailingZeros<unsigned>(Mask);
4234   V = DAG.getNode(ISD::SRL, SL, VT, V,
4235                   DAG.getShiftAmountConstant(Shift, VT, SL));
4236   return DAG.getNode(ISD::AND, SL, VT, V,
4237                      DAG.getConstant(Mask >> Shift, SL, VT));
4238 }
4239 
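// Compute the byte offset of an implicit kernel argument. Implicit arguments
// begin at the first suitably aligned offset past the explicit kernel
// arguments; individual parameters live at fixed offsets beyond that.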
4240 uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
4241     const MachineFunction &MF, const ImplicitParameter Param) const {
4242   const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
4243   const AMDGPUSubtarget &ST =
4244       AMDGPUSubtarget::get(getTargetMachine(), MF.getFunction());
4245   unsigned ExplicitArgOffset = ST.getExplicitKernelArgOffset(MF.getFunction());
4246   const Align Alignment = ST.getAlignmentForImplicitArgPtr();
4247   uint64_t ArgOffset = alignTo(MFI->getExplicitKernArgSize(), Alignment) +
4248                        ExplicitArgOffset;
4249   switch (Param) {
4250   case FIRST_IMPLICIT:
4251     return ArgOffset;
4252   case PRIVATE_BASE:
4253     return ArgOffset + AMDGPU::ImplicitArg::PRIVATE_BASE_OFFSET;
4254   case SHARED_BASE:
4255     return ArgOffset + AMDGPU::ImplicitArg::SHARED_BASE_OFFSET;
4256   case QUEUE_PTR:
4257     return ArgOffset + AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET;
4258   }
4259   llvm_unreachable("unexpected implicit parameter type");
4260 }
4261 
4262 #define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
4263 
4264 const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
4265   switch ((AMDGPUISD::NodeType)Opcode) {
4266   case AMDGPUISD::FIRST_NUMBER: break;
4267   // AMDIL DAG nodes
  NODE_NAME_CASE(UMUL)
  NODE_NAME_CASE(BRANCH_COND)
4270 
4271   // AMDGPU DAG nodes
4272   NODE_NAME_CASE(IF)
4273   NODE_NAME_CASE(ELSE)
4274   NODE_NAME_CASE(LOOP)
4275   NODE_NAME_CASE(CALL)
4276   NODE_NAME_CASE(TC_RETURN)
4277   NODE_NAME_CASE(TRAP)
4278   NODE_NAME_CASE(RET_FLAG)
4279   NODE_NAME_CASE(RETURN_TO_EPILOG)
4280   NODE_NAME_CASE(ENDPGM)
4281   NODE_NAME_CASE(DWORDADDR)
4282   NODE_NAME_CASE(FRACT)
4283   NODE_NAME_CASE(SETCC)
4284   NODE_NAME_CASE(SETREG)
4285   NODE_NAME_CASE(DENORM_MODE)
4286   NODE_NAME_CASE(FMA_W_CHAIN)
4287   NODE_NAME_CASE(FMUL_W_CHAIN)
4288   NODE_NAME_CASE(CLAMP)
4289   NODE_NAME_CASE(COS_HW)
4290   NODE_NAME_CASE(SIN_HW)
4291   NODE_NAME_CASE(FMAX_LEGACY)
4292   NODE_NAME_CASE(FMIN_LEGACY)
4293   NODE_NAME_CASE(FMAX3)
4294   NODE_NAME_CASE(SMAX3)
4295   NODE_NAME_CASE(UMAX3)
4296   NODE_NAME_CASE(FMIN3)
4297   NODE_NAME_CASE(SMIN3)
4298   NODE_NAME_CASE(UMIN3)
4299   NODE_NAME_CASE(FMED3)
4300   NODE_NAME_CASE(SMED3)
4301   NODE_NAME_CASE(UMED3)
4302   NODE_NAME_CASE(FDOT2)
4303   NODE_NAME_CASE(URECIP)
4304   NODE_NAME_CASE(DIV_SCALE)
4305   NODE_NAME_CASE(DIV_FMAS)
4306   NODE_NAME_CASE(DIV_FIXUP)
4307   NODE_NAME_CASE(FMAD_FTZ)
4308   NODE_NAME_CASE(RCP)
4309   NODE_NAME_CASE(RSQ)
4310   NODE_NAME_CASE(RCP_LEGACY)
4311   NODE_NAME_CASE(RCP_IFLAG)
4312   NODE_NAME_CASE(FMUL_LEGACY)
4313   NODE_NAME_CASE(RSQ_CLAMP)
4314   NODE_NAME_CASE(LDEXP)
4315   NODE_NAME_CASE(FP_CLASS)
4316   NODE_NAME_CASE(DOT4)
4317   NODE_NAME_CASE(CARRY)
4318   NODE_NAME_CASE(BORROW)
4319   NODE_NAME_CASE(BFE_U32)
4320   NODE_NAME_CASE(BFE_I32)
4321   NODE_NAME_CASE(BFI)
4322   NODE_NAME_CASE(BFM)
4323   NODE_NAME_CASE(FFBH_U32)
4324   NODE_NAME_CASE(FFBH_I32)
4325   NODE_NAME_CASE(FFBL_B32)
4326   NODE_NAME_CASE(MUL_U24)
4327   NODE_NAME_CASE(MUL_I24)
4328   NODE_NAME_CASE(MULHI_U24)
4329   NODE_NAME_CASE(MULHI_I24)
4330   NODE_NAME_CASE(MAD_U24)
4331   NODE_NAME_CASE(MAD_I24)
4332   NODE_NAME_CASE(MAD_I64_I32)
4333   NODE_NAME_CASE(MAD_U64_U32)
4334   NODE_NAME_CASE(PERM)
4335   NODE_NAME_CASE(TEXTURE_FETCH)
4336   NODE_NAME_CASE(R600_EXPORT)
4337   NODE_NAME_CASE(CONST_ADDRESS)
4338   NODE_NAME_CASE(REGISTER_LOAD)
4339   NODE_NAME_CASE(REGISTER_STORE)
4340   NODE_NAME_CASE(SAMPLE)
4341   NODE_NAME_CASE(SAMPLEB)
4342   NODE_NAME_CASE(SAMPLED)
4343   NODE_NAME_CASE(SAMPLEL)
4344   NODE_NAME_CASE(CVT_F32_UBYTE0)
4345   NODE_NAME_CASE(CVT_F32_UBYTE1)
4346   NODE_NAME_CASE(CVT_F32_UBYTE2)
4347   NODE_NAME_CASE(CVT_F32_UBYTE3)
4348   NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
4349   NODE_NAME_CASE(CVT_PKNORM_I16_F32)
4350   NODE_NAME_CASE(CVT_PKNORM_U16_F32)
4351   NODE_NAME_CASE(CVT_PK_I16_I32)
4352   NODE_NAME_CASE(CVT_PK_U16_U32)
4353   NODE_NAME_CASE(FP_TO_FP16)
4354   NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
4355   NODE_NAME_CASE(CONST_DATA_PTR)
4356   NODE_NAME_CASE(PC_ADD_REL_OFFSET)
4357   NODE_NAME_CASE(LDS)
4358   NODE_NAME_CASE(FPTRUNC_ROUND_UPWARD)
4359   NODE_NAME_CASE(FPTRUNC_ROUND_DOWNWARD)
4360   NODE_NAME_CASE(DUMMY_CHAIN)
4361   case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
4362   NODE_NAME_CASE(LOAD_D16_HI)
4363   NODE_NAME_CASE(LOAD_D16_LO)
4364   NODE_NAME_CASE(LOAD_D16_HI_I8)
4365   NODE_NAME_CASE(LOAD_D16_HI_U8)
4366   NODE_NAME_CASE(LOAD_D16_LO_I8)
4367   NODE_NAME_CASE(LOAD_D16_LO_U8)
4368   NODE_NAME_CASE(STORE_MSKOR)
4369   NODE_NAME_CASE(LOAD_CONSTANT)
4370   NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
4371   NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16)
4372   NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
4373   NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
4374   NODE_NAME_CASE(DS_ORDERED_COUNT)
4375   NODE_NAME_CASE(ATOMIC_CMP_SWAP)
4376   NODE_NAME_CASE(ATOMIC_INC)
4377   NODE_NAME_CASE(ATOMIC_DEC)
4378   NODE_NAME_CASE(ATOMIC_LOAD_FMIN)
4379   NODE_NAME_CASE(ATOMIC_LOAD_FMAX)
4380   NODE_NAME_CASE(BUFFER_LOAD)
4381   NODE_NAME_CASE(BUFFER_LOAD_UBYTE)
4382   NODE_NAME_CASE(BUFFER_LOAD_USHORT)
4383   NODE_NAME_CASE(BUFFER_LOAD_BYTE)
4384   NODE_NAME_CASE(BUFFER_LOAD_SHORT)
4385   NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
4386   NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
4387   NODE_NAME_CASE(SBUFFER_LOAD)
4388   NODE_NAME_CASE(BUFFER_STORE)
4389   NODE_NAME_CASE(BUFFER_STORE_BYTE)
4390   NODE_NAME_CASE(BUFFER_STORE_SHORT)
4391   NODE_NAME_CASE(BUFFER_STORE_FORMAT)
4392   NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)
4393   NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)
4394   NODE_NAME_CASE(BUFFER_ATOMIC_ADD)
4395   NODE_NAME_CASE(BUFFER_ATOMIC_SUB)
4396   NODE_NAME_CASE(BUFFER_ATOMIC_SMIN)
4397   NODE_NAME_CASE(BUFFER_ATOMIC_UMIN)
4398   NODE_NAME_CASE(BUFFER_ATOMIC_SMAX)
4399   NODE_NAME_CASE(BUFFER_ATOMIC_UMAX)
4400   NODE_NAME_CASE(BUFFER_ATOMIC_AND)
4401   NODE_NAME_CASE(BUFFER_ATOMIC_OR)
4402   NODE_NAME_CASE(BUFFER_ATOMIC_XOR)
4403   NODE_NAME_CASE(BUFFER_ATOMIC_INC)
4404   NODE_NAME_CASE(BUFFER_ATOMIC_DEC)
4405   NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
4406   NODE_NAME_CASE(BUFFER_ATOMIC_CSUB)
4407   NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
4408   NODE_NAME_CASE(BUFFER_ATOMIC_FMIN)
4409   NODE_NAME_CASE(BUFFER_ATOMIC_FMAX)
4410 
4411   case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
4412   }
4413   return nullptr;
4414 }
4415 
4416 SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
4417                                               SelectionDAG &DAG, int Enabled,
4418                                               int &RefinementSteps,
4419                                               bool &UseOneConstNR,
4420                                               bool Reciprocal) const {
4421   EVT VT = Operand.getValueType();
4422 
4423   if (VT == MVT::f32) {
4424     RefinementSteps = 0;
4425     return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
4426   }
4427 
  // TODO: There is also an f64 rsq instruction, but the documentation is less
  // clear on its precision.
4430 
4431   return SDValue();
4432 }
4433 
4434 SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
4435                                                SelectionDAG &DAG, int Enabled,
4436                                                int &RefinementSteps) const {
4437   EVT VT = Operand.getValueType();
4438 
4439   if (VT == MVT::f32) {
    // Reciprocal, < 1 ulp error.
    //
    // This reciprocal approximation converges to < 0.5 ulp error with one
    // Newton-Raphson iteration performed with two fused multiply-adds (FMAs).
4444 
4445     RefinementSteps = 0;
4446     return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
4447   }
4448 
  // TODO: There is also an f64 rcp instruction, but the documentation is less
  // clear on its precision.
4451 
4452   return SDValue();
4453 }
4454 
4455 static unsigned workitemIntrinsicDim(unsigned ID) {
4456   switch (ID) {
4457   case Intrinsic::amdgcn_workitem_id_x:
4458     return 0;
4459   case Intrinsic::amdgcn_workitem_id_y:
4460     return 1;
4461   case Intrinsic::amdgcn_workitem_id_z:
4462     return 2;
4463   default:
4464     llvm_unreachable("not a workitem intrinsic");
4465   }
4466 }
4467 
4468 void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
4469     const SDValue Op, KnownBits &Known,
4470     const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
4471 
4472   Known.resetAll(); // Don't know anything.
4473 
4474   unsigned Opc = Op.getOpcode();
4475 
4476   switch (Opc) {
4477   default:
4478     break;
4479   case AMDGPUISD::CARRY:
4480   case AMDGPUISD::BORROW: {
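    // Carry and borrow results are either 0 or 1, so all bits above the low
    // bit are known zero.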
4481     Known.Zero = APInt::getHighBitsSet(32, 31);
4482     break;
4483   }
4484 
4485   case AMDGPUISD::BFE_I32:
4486   case AMDGPUISD::BFE_U32: {
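    // With a constant width, the unsigned form leaves everything above the
    // extracted field known zero; nothing is recorded for the signed form.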
4487     ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4488     if (!CWidth)
4489       return;
4490 
4491     uint32_t Width = CWidth->getZExtValue() & 0x1f;
4492 
4493     if (Opc == AMDGPUISD::BFE_U32)
4494       Known.Zero = APInt::getHighBitsSet(32, 32 - Width);
4495 
4496     break;
4497   }
4498   case AMDGPUISD::FP_TO_FP16: {
4499     unsigned BitWidth = Known.getBitWidth();
4500 
4501     // High bits are zero.
4502     Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
4503     break;
4504   }
4505   case AMDGPUISD::MUL_U24:
4506   case AMDGPUISD::MUL_I24: {
4507     KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
4508     KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
4509     unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
4510                       RHSKnown.countMinTrailingZeros();
4511     Known.Zero.setLowBits(std::min(TrailZ, 32u));
    // Skip the extra checks if all bits are already known to be zero.
4513     if (TrailZ >= 32)
4514       break;
4515 
4516     // Truncate to 24 bits.
4517     LHSKnown = LHSKnown.trunc(24);
4518     RHSKnown = RHSKnown.trunc(24);
4519 
4520     if (Opc == AMDGPUISD::MUL_I24) {
4521       unsigned LHSValBits = LHSKnown.countMaxSignificantBits();
4522       unsigned RHSValBits = RHSKnown.countMaxSignificantBits();
4523       unsigned MaxValBits = LHSValBits + RHSValBits;
4524       if (MaxValBits > 32)
4525         break;
4526       unsigned SignBits = 32 - MaxValBits + 1;
4527       bool LHSNegative = LHSKnown.isNegative();
4528       bool LHSNonNegative = LHSKnown.isNonNegative();
4529       bool LHSPositive = LHSKnown.isStrictlyPositive();
4530       bool RHSNegative = RHSKnown.isNegative();
4531       bool RHSNonNegative = RHSKnown.isNonNegative();
4532       bool RHSPositive = RHSKnown.isStrictlyPositive();
4533 
4534       if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative))
4535         Known.Zero.setHighBits(SignBits);
4536       else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative))
4537         Known.One.setHighBits(SignBits);
4538     } else {
4539       unsigned LHSValBits = LHSKnown.countMaxActiveBits();
4540       unsigned RHSValBits = RHSKnown.countMaxActiveBits();
4541       unsigned MaxValBits = LHSValBits + RHSValBits;
4542       if (MaxValBits >= 32)
4543         break;
4544       Known.Zero.setBitsFrom(MaxValBits);
4545     }
4546     break;
4547   }
4548   case AMDGPUISD::PERM: {
4549     ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4550     if (!CMask)
4551       return;
4552 
4553     KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
4554     KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
4555     unsigned Sel = CMask->getZExtValue();
4556 
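    // Each result byte is chosen by one mask byte: selectors 0-3 take a byte
    // from operand 1, 4-6 take a byte from operand 0, 0x0c yields 0x00, and
    // values above 0x0c yield 0xff. Other selectors leave the byte unknown.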
4557     for (unsigned I = 0; I < 32; I += 8) {
4558       unsigned SelBits = Sel & 0xff;
4559       if (SelBits < 4) {
4560         SelBits *= 8;
4561         Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
4562         Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
4563       } else if (SelBits < 7) {
4564         SelBits = (SelBits & 3) * 8;
4565         Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
4566         Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
4567       } else if (SelBits == 0x0c) {
4568         Known.Zero |= 0xFFull << I;
4569       } else if (SelBits > 0x0c) {
4570         Known.One |= 0xFFull << I;
4571       }
4572       Sel >>= 8;
4573     }
4574     break;
4575   }
  case AMDGPUISD::BUFFER_LOAD_UBYTE: {
4577     Known.Zero.setHighBits(24);
4578     break;
4579   }
4580   case AMDGPUISD::BUFFER_LOAD_USHORT: {
4581     Known.Zero.setHighBits(16);
4582     break;
4583   }
4584   case AMDGPUISD::LDS: {
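    // LDS offsets fit in 16 bits, and the low bits are known zero from the
    // referenced global's alignment.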
4585     auto GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());
4586     Align Alignment = GA->getGlobal()->getPointerAlignment(DAG.getDataLayout());
4587 
4588     Known.Zero.setHighBits(16);
4589     Known.Zero.setLowBits(Log2(Alignment));
4590     break;
4591   }
4592   case ISD::INTRINSIC_WO_CHAIN: {
4593     unsigned IID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
4594     switch (IID) {
4595     case Intrinsic::amdgcn_mbcnt_lo:
4596     case Intrinsic::amdgcn_mbcnt_hi: {
4597       const GCNSubtarget &ST =
4598           DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
4599       // These return at most the wavefront size - 1.
4600       unsigned Size = Op.getValueType().getSizeInBits();
4601       Known.Zero.setHighBits(Size - ST.getWavefrontSizeLog2());
4602       break;
4603     }
4604     case Intrinsic::amdgcn_workitem_id_x:
4605     case Intrinsic::amdgcn_workitem_id_y:
4606     case Intrinsic::amdgcn_workitem_id_z: {
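      // A workitem ID never exceeds the maximum workgroup size in the queried
      // dimension, so the bits above that bound are known zero.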
4607       unsigned MaxValue = Subtarget->getMaxWorkitemID(
4608           DAG.getMachineFunction().getFunction(), workitemIntrinsicDim(IID));
4609       Known.Zero.setHighBits(countLeadingZeros(MaxValue));
4610       break;
4611     }
4612     default:
4613       break;
4614     }
4615   }
4616   }
4617 }
4618 
4619 unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
4620     SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
4621     unsigned Depth) const {
4622   switch (Op.getOpcode()) {
4623   case AMDGPUISD::BFE_I32: {
4624     ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4625     if (!Width)
4626       return 1;
4627 
4628     unsigned SignBits = 32 - Width->getZExtValue() + 1;
4629     if (!isNullConstant(Op.getOperand(1)))
4630       return SignBits;
4631 
4632     // TODO: Could probably figure something out with non-0 offsets.
4633     unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
4634     return std::max(SignBits, Op0SignBits);
4635   }
4636 
4637   case AMDGPUISD::BFE_U32: {
4638     ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
4639     return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
4640   }
4641 
4642   case AMDGPUISD::CARRY:
4643   case AMDGPUISD::BORROW:
4644     return 31;
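  // Sub-dword buffer loads extend to a 32-bit result, so the sign bit count
  // follows from the source width: sign-extending i8 / i16 loads give
  // 32 - 8 + 1 = 25 and 32 - 16 + 1 = 17 bits, zero-extending ones give 24
  // and 16.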
4645   case AMDGPUISD::BUFFER_LOAD_BYTE:
4646     return 25;
4647   case AMDGPUISD::BUFFER_LOAD_SHORT:
4648     return 17;
4649   case AMDGPUISD::BUFFER_LOAD_UBYTE:
4650     return 24;
4651   case AMDGPUISD::BUFFER_LOAD_USHORT:
4652     return 16;
4653   case AMDGPUISD::FP_TO_FP16:
4654     return 16;
4655   default:
4656     return 1;
4657   }
4658 }
4659 
4660 unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr(
4661   GISelKnownBits &Analysis, Register R,
4662   const APInt &DemandedElts, const MachineRegisterInfo &MRI,
4663   unsigned Depth) const {
4664   const MachineInstr *MI = MRI.getVRegDef(R);
4665   if (!MI)
4666     return 1;
4667 
4668   // TODO: Check range metadata on MMO.
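  // As in the DAG path above, extending byte / short buffer loads give
  // 25 / 17 sign bits for the signed forms and 24 / 16 for the unsigned ones.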
4669   switch (MI->getOpcode()) {
4670   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
4671     return 25;
4672   case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
4673     return 17;
4674   case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
4675     return 24;
4676   case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
4677     return 16;
4678   default:
4679     return 1;
4680   }
4681 }
4682 
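// Return true if \p Op is known never to produce a NaN (or, when \p SNaN is
// set, never a signaling NaN). Conversions from unsigned bytes always qualify;
// most other handled nodes defer to their floating-point operands.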
4683 bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
4684                                                         const SelectionDAG &DAG,
4685                                                         bool SNaN,
4686                                                         unsigned Depth) const {
4687   unsigned Opcode = Op.getOpcode();
4688   switch (Opcode) {
4689   case AMDGPUISD::FMIN_LEGACY:
4690   case AMDGPUISD::FMAX_LEGACY: {
4691     if (SNaN)
4692       return true;
4693 
    // TODO: Could check that just one of the operands is never a NaN, but it
    // is not clear which one.
4696     return false;
4697   }
4698   case AMDGPUISD::FMUL_LEGACY:
4699   case AMDGPUISD::CVT_PKRTZ_F16_F32: {
4700     if (SNaN)
4701       return true;
4702     return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
4703            DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
4704   }
4705   case AMDGPUISD::FMED3:
4706   case AMDGPUISD::FMIN3:
4707   case AMDGPUISD::FMAX3:
4708   case AMDGPUISD::FMAD_FTZ: {
4709     if (SNaN)
4710       return true;
4711     return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
4712            DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
4713            DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
4714   }
4715   case AMDGPUISD::CVT_F32_UBYTE0:
4716   case AMDGPUISD::CVT_F32_UBYTE1:
4717   case AMDGPUISD::CVT_F32_UBYTE2:
4718   case AMDGPUISD::CVT_F32_UBYTE3:
4719     return true;
4720 
4721   case AMDGPUISD::RCP:
4722   case AMDGPUISD::RSQ:
4723   case AMDGPUISD::RCP_LEGACY:
4724   case AMDGPUISD::RSQ_CLAMP: {
4725     if (SNaN)
4726       return true;
4727 
    // TODO: Need an is-known-positive check.
4729     return false;
4730   }
4731   case AMDGPUISD::LDEXP:
4732   case AMDGPUISD::FRACT: {
4733     if (SNaN)
4734       return true;
4735     return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
4736   }
4737   case AMDGPUISD::DIV_SCALE:
4738   case AMDGPUISD::DIV_FMAS:
4739   case AMDGPUISD::DIV_FIXUP:
4740     // TODO: Refine on operands.
4741     return SNaN;
4742   case AMDGPUISD::SIN_HW:
4743   case AMDGPUISD::COS_HW: {
    // TODO: Need a check for infinity.
4745     return SNaN;
4746   }
4747   case ISD::INTRINSIC_WO_CHAIN: {
4748     unsigned IntrinsicID
4749       = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
4750     // TODO: Handle more intrinsics
4751     switch (IntrinsicID) {
4752     case Intrinsic::amdgcn_cubeid:
4753       return true;
4754 
4755     case Intrinsic::amdgcn_frexp_mant: {
4756       if (SNaN)
4757         return true;
4758       return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
4759     }
4760     case Intrinsic::amdgcn_cvt_pkrtz: {
4761       if (SNaN)
4762         return true;
4763       return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
4764              DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
4765     }
4766     case Intrinsic::amdgcn_rcp:
4767     case Intrinsic::amdgcn_rsq:
4768     case Intrinsic::amdgcn_rcp_legacy:
4769     case Intrinsic::amdgcn_rsq_legacy:
4770     case Intrinsic::amdgcn_rsq_clamp: {
4771       if (SNaN)
4772         return true;
4773 
      // TODO: Need an is-known-positive check.
4775       return false;
4776     }
4777     case Intrinsic::amdgcn_trig_preop:
4778     case Intrinsic::amdgcn_fdot2:
4779       // TODO: Refine on operand
4780       return SNaN;
4781     case Intrinsic::amdgcn_fma_legacy:
4782       if (SNaN)
4783         return true;
4784       return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
4785              DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1) &&
4786              DAG.isKnownNeverNaN(Op.getOperand(3), SNaN, Depth + 1);
4787     default:
4788       return false;
4789     }
4790   }
4791   default:
4792     return false;
4793   }
4794 }
4795 
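// Nand and the floating-point add/sub atomics have no generic lowering here,
// so expand them to compare-and-swap loops; subclasses may refine this for
// subtargets with native support.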
4796 TargetLowering::AtomicExpansionKind
4797 AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
4798   switch (RMW->getOperation()) {
4799   case AtomicRMWInst::Nand:
4800   case AtomicRMWInst::FAdd:
4801   case AtomicRMWInst::FSub:
4802     return AtomicExpansionKind::CmpXChg;
4803   default:
4804     return AtomicExpansionKind::None;
4805   }
4806 }
4807 
4808 bool AMDGPUTargetLowering::isConstantUnsignedBitfieldExtractLegal(
4809     unsigned Opc, LLT Ty1, LLT Ty2) const {
4810   return (Ty1 == LLT::scalar(32) || Ty1 == LLT::scalar(64)) &&
4811          Ty2 == LLT::scalar(32);
4812 }
4813