1 //===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// \brief This is the parent TargetLowering class for hardware code gen
12 /// targets.
13 //
14 //===----------------------------------------------------------------------===//
15 
16 #include "AMDGPUISelLowering.h"
17 #include "AMDGPU.h"
18 #include "AMDGPUCallLowering.h"
19 #include "AMDGPUFrameLowering.h"
20 #include "AMDGPUIntrinsicInfo.h"
21 #include "AMDGPURegisterInfo.h"
22 #include "AMDGPUSubtarget.h"
23 #include "AMDGPUTargetMachine.h"
24 #include "R600MachineFunctionInfo.h"
25 #include "SIInstrInfo.h"
26 #include "SIMachineFunctionInfo.h"
27 #include "llvm/CodeGen/CallingConvLower.h"
28 #include "llvm/CodeGen/MachineFunction.h"
29 #include "llvm/CodeGen/MachineRegisterInfo.h"
30 #include "llvm/CodeGen/SelectionDAG.h"
31 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
32 #include "llvm/IR/DataLayout.h"
33 #include "llvm/IR/DiagnosticInfo.h"
34 #include "llvm/Support/KnownBits.h"
35 using namespace llvm;
36 
37 static bool allocateKernArg(unsigned ValNo, MVT ValVT, MVT LocVT,
38                             CCValAssign::LocInfo LocInfo,
39                             ISD::ArgFlagsTy ArgFlags, CCState &State) {
40   MachineFunction &MF = State.getMachineFunction();
41   AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
42 
43   uint64_t Offset = MFI->allocateKernArg(LocVT.getStoreSize(),
44                                          ArgFlags.getOrigAlign());
45   State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo));
46   return true;
47 }
48 
49 static bool allocateCCRegs(unsigned ValNo, MVT ValVT, MVT LocVT,
50                            CCValAssign::LocInfo LocInfo,
51                            ISD::ArgFlagsTy ArgFlags, CCState &State,
52                            const TargetRegisterClass *RC,
53                            unsigned NumRegs) {
54   ArrayRef<MCPhysReg> RegList = makeArrayRef(RC->begin(), NumRegs);
55   unsigned RegResult = State.AllocateReg(RegList);
56   if (RegResult == AMDGPU::NoRegister)
57     return false;
58 
59   State.addLoc(CCValAssign::getReg(ValNo, ValVT, RegResult, LocVT, LocInfo));
60   return true;
61 }
62 
63 static bool allocateSGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
64                               CCValAssign::LocInfo LocInfo,
65                               ISD::ArgFlagsTy ArgFlags, CCState &State) {
66   switch (LocVT.SimpleTy) {
67   case MVT::i64:
68   case MVT::f64:
69   case MVT::v2i32:
70   case MVT::v2f32: {
71     // Up to SGPR0-SGPR39
72     return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
73                           &AMDGPU::SGPR_64RegClass, 20);
74   }
75   default:
76     return false;
77   }
78 }
79 
80 // Allocate up to VGPR31.
81 //
82 // TODO: Since there are no VGPR alignent requirements would it be better to
83 // split into individual scalar registers?
84 static bool allocateVGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
85                               CCValAssign::LocInfo LocInfo,
86                               ISD::ArgFlagsTy ArgFlags, CCState &State) {
87   switch (LocVT.SimpleTy) {
88   case MVT::i64:
89   case MVT::f64:
90   case MVT::v2i32:
91   case MVT::v2f32: {
92     return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
93                           &AMDGPU::VReg_64RegClass, 31);
94   }
95   case MVT::v4i32:
96   case MVT::v4f32:
97   case MVT::v2i64:
98   case MVT::v2f64: {
99     return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
100                           &AMDGPU::VReg_128RegClass, 29);
101   }
102   case MVT::v8i32:
103   case MVT::v8f32: {
104     return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
105                           &AMDGPU::VReg_256RegClass, 25);
106 
107   }
108   case MVT::v16i32:
109   case MVT::v16f32: {
110     return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
111                           &AMDGPU::VReg_512RegClass, 17);
112 
113   }
114   default:
115     return false;
116   }
117 }
118 
119 #include "AMDGPUGenCallingConv.inc"
120 
121 // Find a larger type to do a load / store of a vector with.
122 EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
123   unsigned StoreSize = VT.getStoreSizeInBits();
124   if (StoreSize <= 32)
125     return EVT::getIntegerVT(Ctx, StoreSize);
126 
127   assert(StoreSize % 32 == 0 && "Store size not a multiple of 32");
128   return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
129 }
130 
131 bool AMDGPUTargetLowering::isOrEquivalentToAdd(SelectionDAG &DAG, SDValue Op)
132 {
133   assert(Op.getOpcode() == ISD::OR);
134 
135   SDValue N0 = Op->getOperand(0);
136   SDValue N1 = Op->getOperand(1);
137   EVT VT = N0.getValueType();
138 
139   if (VT.isInteger() && !VT.isVector()) {
140     KnownBits LHSKnown, RHSKnown;
141     DAG.computeKnownBits(N0, LHSKnown);
142 
143     if (LHSKnown.Zero.getBoolValue()) {
144       DAG.computeKnownBits(N1, RHSKnown);
145 
146       if (!(~RHSKnown.Zero & ~LHSKnown.Zero))
147         return true;
148     }
149   }
150 
151   return false;
152 }
153 
154 AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
155                                            const AMDGPUSubtarget &STI)
156     : TargetLowering(TM), Subtarget(&STI) {
157   AMDGPUASI = AMDGPU::getAMDGPUAS(TM);
158   // Lower floating point store/load to integer store/load to reduce the number
159   // of patterns in tablegen.
160   setOperationAction(ISD::LOAD, MVT::f32, Promote);
161   AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
162 
163   setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
164   AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
165 
166   setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
167   AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
168 
169   setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
170   AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
171 
172   setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
173   AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
174 
175   setOperationAction(ISD::LOAD, MVT::i64, Promote);
176   AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
177 
178   setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
179   AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
180 
181   setOperationAction(ISD::LOAD, MVT::f64, Promote);
182   AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
183 
184   setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
185   AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
186 
187   // There are no 64-bit extloads. These should be done as a 32-bit extload and
188   // an extension to 64-bit.
189   for (MVT VT : MVT::integer_valuetypes()) {
190     setLoadExtAction(ISD::EXTLOAD, MVT::i64, VT, Expand);
191     setLoadExtAction(ISD::SEXTLOAD, MVT::i64, VT, Expand);
192     setLoadExtAction(ISD::ZEXTLOAD, MVT::i64, VT, Expand);
193   }
194 
195   for (MVT VT : MVT::integer_valuetypes()) {
196     if (VT == MVT::i64)
197       continue;
198 
199     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
200     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Legal);
201     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Legal);
202     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i32, Expand);
203 
204     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
205     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Legal);
206     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Legal);
207     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i32, Expand);
208 
209     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
210     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Legal);
211     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Legal);
212     setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand);
213   }
214 
215   for (MVT VT : MVT::integer_vector_valuetypes()) {
216     setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand);
217     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand);
218     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand);
219     setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Expand);
220     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Expand);
221     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i8, Expand);
222     setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand);
223     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand);
224     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand);
225     setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand);
226     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand);
227     setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand);
228   }
229 
230   setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
231   setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
232   setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
233   setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
234 
235   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
236   setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
237   setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
238   setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
239 
240   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
241   setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
242   setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
243   setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
244 
245   setOperationAction(ISD::STORE, MVT::f32, Promote);
246   AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
247 
248   setOperationAction(ISD::STORE, MVT::v2f32, Promote);
249   AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
250 
251   setOperationAction(ISD::STORE, MVT::v4f32, Promote);
252   AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
253 
254   setOperationAction(ISD::STORE, MVT::v8f32, Promote);
255   AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
256 
257   setOperationAction(ISD::STORE, MVT::v16f32, Promote);
258   AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
259 
260   setOperationAction(ISD::STORE, MVT::i64, Promote);
261   AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
262 
263   setOperationAction(ISD::STORE, MVT::v2i64, Promote);
264   AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
265 
266   setOperationAction(ISD::STORE, MVT::f64, Promote);
267   AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
268 
269   setOperationAction(ISD::STORE, MVT::v2f64, Promote);
270   AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
271 
272   setTruncStoreAction(MVT::i64, MVT::i1, Expand);
273   setTruncStoreAction(MVT::i64, MVT::i8, Expand);
274   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
275   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
276 
277   setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
278   setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
279   setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
280   setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
281 
282   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
283   setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
284   setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
285   setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
286 
287   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
288   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
289 
290   setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
291   setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
292 
293   setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
294   setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
295 
296   setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
297   setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
298 
299 
300   setOperationAction(ISD::Constant, MVT::i32, Legal);
301   setOperationAction(ISD::Constant, MVT::i64, Legal);
302   setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
303   setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
304 
305   setOperationAction(ISD::BR_JT, MVT::Other, Expand);
306   setOperationAction(ISD::BRIND, MVT::Other, Expand);
307 
308   // This is totally unsupported, just custom lower to produce an error.
309   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
310 
311   // Library functions.  These default to Expand, but we have instructions
312   // for them.
313   setOperationAction(ISD::FCEIL,  MVT::f32, Legal);
314   setOperationAction(ISD::FEXP2,  MVT::f32, Legal);
315   setOperationAction(ISD::FPOW,   MVT::f32, Legal);
316   setOperationAction(ISD::FLOG2,  MVT::f32, Legal);
317   setOperationAction(ISD::FABS,   MVT::f32, Legal);
318   setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
319   setOperationAction(ISD::FRINT,  MVT::f32, Legal);
320   setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
321   setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
322   setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
323 
324   setOperationAction(ISD::FROUND, MVT::f32, Custom);
325   setOperationAction(ISD::FROUND, MVT::f64, Custom);
326 
327   setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
328   setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);
329 
330   setOperationAction(ISD::FREM, MVT::f32, Custom);
331   setOperationAction(ISD::FREM, MVT::f64, Custom);
332 
333   // v_mad_f32 does not support denormals according to some sources.
334   if (!Subtarget->hasFP32Denormals())
335     setOperationAction(ISD::FMAD, MVT::f32, Legal);
336 
337   // Expand to fneg + fadd.
338   setOperationAction(ISD::FSUB, MVT::f64, Expand);
339 
340   setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
341   setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom);
342   setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
343   setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f32, Custom);
344   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom);
345   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);
346   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f32, Custom);
347   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i32, Custom);
348   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f32, Custom);
349   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i32, Custom);
350 
351   if (Subtarget->getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
352     setOperationAction(ISD::FCEIL, MVT::f64, Custom);
353     setOperationAction(ISD::FTRUNC, MVT::f64, Custom);
354     setOperationAction(ISD::FRINT, MVT::f64, Custom);
355     setOperationAction(ISD::FFLOOR, MVT::f64, Custom);
356   }
357 
358   if (!Subtarget->hasBFI()) {
359     // fcopysign can be done in a single instruction with BFI.
360     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
361     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
362   }
363 
364   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
365   setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom);
366   setOperationAction(ISD::FP_TO_FP16, MVT::f32, Custom);
367 
368   const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
369   for (MVT VT : ScalarIntVTs) {
370     // These should use [SU]DIVREM, so set them to expand
371     setOperationAction(ISD::SDIV, VT, Expand);
372     setOperationAction(ISD::UDIV, VT, Expand);
373     setOperationAction(ISD::SREM, VT, Expand);
374     setOperationAction(ISD::UREM, VT, Expand);
375 
376     // GPU does not have divrem function for signed or unsigned.
377     setOperationAction(ISD::SDIVREM, VT, Custom);
378     setOperationAction(ISD::UDIVREM, VT, Custom);
379 
380     // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
381     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
382     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
383 
384     setOperationAction(ISD::BSWAP, VT, Expand);
385     setOperationAction(ISD::CTTZ, VT, Expand);
386     setOperationAction(ISD::CTLZ, VT, Expand);
387   }
388 
389   if (!Subtarget->hasBCNT(32))
390     setOperationAction(ISD::CTPOP, MVT::i32, Expand);
391 
392   if (!Subtarget->hasBCNT(64))
393     setOperationAction(ISD::CTPOP, MVT::i64, Expand);
394 
395   // The hardware supports 32-bit ROTR, but not ROTL.
396   setOperationAction(ISD::ROTL, MVT::i32, Expand);
397   setOperationAction(ISD::ROTL, MVT::i64, Expand);
398   setOperationAction(ISD::ROTR, MVT::i64, Expand);
399 
400   setOperationAction(ISD::MUL, MVT::i64, Expand);
401   setOperationAction(ISD::MULHU, MVT::i64, Expand);
402   setOperationAction(ISD::MULHS, MVT::i64, Expand);
403   setOperationAction(ISD::UDIV, MVT::i32, Expand);
404   setOperationAction(ISD::UREM, MVT::i32, Expand);
405   setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
406   setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
407   setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
408   setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
409   setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
410 
411   setOperationAction(ISD::SMIN, MVT::i32, Legal);
412   setOperationAction(ISD::UMIN, MVT::i32, Legal);
413   setOperationAction(ISD::SMAX, MVT::i32, Legal);
414   setOperationAction(ISD::UMAX, MVT::i32, Legal);
415 
416   if (Subtarget->hasFFBH())
417     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
418 
419   if (Subtarget->hasFFBL())
420     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Legal);
421 
422   setOperationAction(ISD::CTLZ, MVT::i64, Custom);
423   setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
424 
425   // We only really have 32-bit BFE instructions (and 16-bit on VI).
426   //
427   // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
428   // effort to match them now. We want this to be false for i64 cases when the
429   // extraction isn't restricted to the upper or lower half. Ideally we would
430   // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
431   // span the midpoint are probably relatively rare, so don't worry about them
432   // for now.
433   if (Subtarget->hasBFE())
434     setHasExtractBitsInsn(true);
435 
436   static const MVT::SimpleValueType VectorIntTypes[] = {
437     MVT::v2i32, MVT::v4i32
438   };
439 
440   for (MVT VT : VectorIntTypes) {
441     // Expand the following operations for the current type by default.
442     setOperationAction(ISD::ADD,  VT, Expand);
443     setOperationAction(ISD::AND,  VT, Expand);
444     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
445     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
446     setOperationAction(ISD::MUL,  VT, Expand);
447     setOperationAction(ISD::MULHU, VT, Expand);
448     setOperationAction(ISD::MULHS, VT, Expand);
449     setOperationAction(ISD::OR,   VT, Expand);
450     setOperationAction(ISD::SHL,  VT, Expand);
451     setOperationAction(ISD::SRA,  VT, Expand);
452     setOperationAction(ISD::SRL,  VT, Expand);
453     setOperationAction(ISD::ROTL, VT, Expand);
454     setOperationAction(ISD::ROTR, VT, Expand);
455     setOperationAction(ISD::SUB,  VT, Expand);
456     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
457     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
458     setOperationAction(ISD::SDIV, VT, Expand);
459     setOperationAction(ISD::UDIV, VT, Expand);
460     setOperationAction(ISD::SREM, VT, Expand);
461     setOperationAction(ISD::UREM, VT, Expand);
462     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
463     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
464     setOperationAction(ISD::SDIVREM, VT, Custom);
465     setOperationAction(ISD::UDIVREM, VT, Expand);
466     setOperationAction(ISD::ADDC, VT, Expand);
467     setOperationAction(ISD::SUBC, VT, Expand);
468     setOperationAction(ISD::ADDE, VT, Expand);
469     setOperationAction(ISD::SUBE, VT, Expand);
470     setOperationAction(ISD::SELECT, VT, Expand);
471     setOperationAction(ISD::VSELECT, VT, Expand);
472     setOperationAction(ISD::SELECT_CC, VT, Expand);
473     setOperationAction(ISD::XOR,  VT, Expand);
474     setOperationAction(ISD::BSWAP, VT, Expand);
475     setOperationAction(ISD::CTPOP, VT, Expand);
476     setOperationAction(ISD::CTTZ, VT, Expand);
477     setOperationAction(ISD::CTLZ, VT, Expand);
478     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
479   }
480 
481   static const MVT::SimpleValueType FloatVectorTypes[] = {
482     MVT::v2f32, MVT::v4f32
483   };
484 
485   for (MVT VT : FloatVectorTypes) {
486     setOperationAction(ISD::FABS, VT, Expand);
487     setOperationAction(ISD::FMINNUM, VT, Expand);
488     setOperationAction(ISD::FMAXNUM, VT, Expand);
489     setOperationAction(ISD::FADD, VT, Expand);
490     setOperationAction(ISD::FCEIL, VT, Expand);
491     setOperationAction(ISD::FCOS, VT, Expand);
492     setOperationAction(ISD::FDIV, VT, Expand);
493     setOperationAction(ISD::FEXP2, VT, Expand);
494     setOperationAction(ISD::FLOG2, VT, Expand);
495     setOperationAction(ISD::FREM, VT, Expand);
496     setOperationAction(ISD::FPOW, VT, Expand);
497     setOperationAction(ISD::FFLOOR, VT, Expand);
498     setOperationAction(ISD::FTRUNC, VT, Expand);
499     setOperationAction(ISD::FMUL, VT, Expand);
500     setOperationAction(ISD::FMA, VT, Expand);
501     setOperationAction(ISD::FRINT, VT, Expand);
502     setOperationAction(ISD::FNEARBYINT, VT, Expand);
503     setOperationAction(ISD::FSQRT, VT, Expand);
504     setOperationAction(ISD::FSIN, VT, Expand);
505     setOperationAction(ISD::FSUB, VT, Expand);
506     setOperationAction(ISD::FNEG, VT, Expand);
507     setOperationAction(ISD::VSELECT, VT, Expand);
508     setOperationAction(ISD::SELECT_CC, VT, Expand);
509     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
510     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
511   }
512 
513   // This causes using an unrolled select operation rather than expansion with
514   // bit operations. This is in general better, but the alternative using BFI
515   // instructions may be better if the select sources are SGPRs.
516   setOperationAction(ISD::SELECT, MVT::v2f32, Promote);
517   AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
518 
519   setOperationAction(ISD::SELECT, MVT::v4f32, Promote);
520   AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
521 
522   // There are no libcalls of any kind.
523   for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
524     setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
525 
526   setBooleanContents(ZeroOrNegativeOneBooleanContent);
527   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
528 
529   setSchedulingPreference(Sched::RegPressure);
530   setJumpIsExpensive(true);
531 
532   // FIXME: This is only partially true. If we have to do vector compares, any
533   // SGPR pair can be a condition register. If we have a uniform condition, we
534   // are better off doing SALU operations, where there is only one SCC. For now,
535   // we don't have a way of knowing during instruction selection if a condition
536   // will be uniform and we always use vector compares. Assume we are using
537   // vector compares until that is fixed.
538   setHasMultipleConditionRegisters(true);
539 
540   // SI at least has hardware support for floating point exceptions, but no way
541   // of using or handling them is implemented. They are also optional in OpenCL
542   // (Section 7.3)
543   setHasFloatingPointExceptions(Subtarget->hasFPExceptions());
544 
545   PredictableSelectIsExpensive = false;
546 
547   // We want to find all load dependencies for long chains of stores to enable
548   // merging into very wide vectors. The problem is with vectors with > 4
549   // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
550   // vectors are a legal type, even though we have to split the loads
551   // usually. When we can more precisely specify load legality per address
552   // space, we should be able to make FindBetterChain/MergeConsecutiveStores
553   // smarter so that they can figure out what to do in 2 iterations without all
554   // N > 4 stores on the same chain.
555   GatherAllAliasesMaxDepth = 16;
556 
557   // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
558   // about these during lowering.
559   MaxStoresPerMemcpy  = 0xffffffff;
560   MaxStoresPerMemmove = 0xffffffff;
561   MaxStoresPerMemset  = 0xffffffff;
562 
563   setTargetDAGCombine(ISD::BITCAST);
564   setTargetDAGCombine(ISD::SHL);
565   setTargetDAGCombine(ISD::SRA);
566   setTargetDAGCombine(ISD::SRL);
567   setTargetDAGCombine(ISD::MUL);
568   setTargetDAGCombine(ISD::MULHU);
569   setTargetDAGCombine(ISD::MULHS);
570   setTargetDAGCombine(ISD::SELECT);
571   setTargetDAGCombine(ISD::SELECT_CC);
572   setTargetDAGCombine(ISD::STORE);
573   setTargetDAGCombine(ISD::FADD);
574   setTargetDAGCombine(ISD::FSUB);
575   setTargetDAGCombine(ISD::FNEG);
576   setTargetDAGCombine(ISD::FABS);
577   setTargetDAGCombine(ISD::AssertZext);
578   setTargetDAGCombine(ISD::AssertSext);
579 }
580 
581 //===----------------------------------------------------------------------===//
582 // Target Information
583 //===----------------------------------------------------------------------===//
584 
585 LLVM_READNONE
586 static bool fnegFoldsIntoOp(unsigned Opc) {
587   switch (Opc) {
588   case ISD::FADD:
589   case ISD::FSUB:
590   case ISD::FMUL:
591   case ISD::FMA:
592   case ISD::FMAD:
593   case ISD::FMINNUM:
594   case ISD::FMAXNUM:
595   case ISD::FSIN:
596   case ISD::FTRUNC:
597   case ISD::FRINT:
598   case ISD::FNEARBYINT:
599   case AMDGPUISD::RCP:
600   case AMDGPUISD::RCP_LEGACY:
601   case AMDGPUISD::SIN_HW:
602   case AMDGPUISD::FMUL_LEGACY:
603   case AMDGPUISD::FMIN_LEGACY:
604   case AMDGPUISD::FMAX_LEGACY:
605     return true;
606   default:
607     return false;
608   }
609 }
610 
611 /// \p returns true if the operation will definitely need to use a 64-bit
612 /// encoding, and thus will use a VOP3 encoding regardless of the source
613 /// modifiers.
614 LLVM_READONLY
615 static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
616   return N->getNumOperands() > 2 || VT == MVT::f64;
617 }
618 
619 // Most FP instructions support source modifiers, but this could be refined
620 // slightly.
621 LLVM_READONLY
622 static bool hasSourceMods(const SDNode *N) {
623   if (isa<MemSDNode>(N))
624     return false;
625 
626   switch (N->getOpcode()) {
627   case ISD::CopyToReg:
628   case ISD::SELECT:
629   case ISD::FDIV:
630   case ISD::FREM:
631   case ISD::INLINEASM:
632   case AMDGPUISD::INTERP_P1:
633   case AMDGPUISD::INTERP_P2:
634   case AMDGPUISD::DIV_SCALE:
635 
636   // TODO: Should really be looking at the users of the bitcast. These are
637   // problematic because bitcasts are used to legalize all stores to integer
638   // types.
639   case ISD::BITCAST:
640     return false;
641   default:
642     return true;
643   }
644 }
645 
646 bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
647                                                  unsigned CostThreshold) {
648   // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
649   // it is truly free to use a source modifier in all cases. If there are
650   // multiple users but for each one will necessitate using VOP3, there will be
651   // a code size increase. Try to avoid increasing code size unless we know it
652   // will save on the instruction count.
653   unsigned NumMayIncreaseSize = 0;
654   MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
655 
656   // XXX - Should this limit number of uses to check?
657   for (const SDNode *U : N->uses()) {
658     if (!hasSourceMods(U))
659       return false;
660 
661     if (!opMustUseVOP3Encoding(U, VT)) {
662       if (++NumMayIncreaseSize > CostThreshold)
663         return false;
664     }
665   }
666 
667   return true;
668 }
669 
670 MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
671   return MVT::i32;
672 }
673 
674 bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType) const {
675   return true;
676 }
677 
678 // The backend supports 32 and 64 bit floating point immediates.
679 // FIXME: Why are we reporting vectors of FP immediates as legal?
680 bool AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
681   EVT ScalarVT = VT.getScalarType();
682   return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
683          (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
684 }
685 
686 // We don't want to shrink f64 / f32 constants.
687 bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
688   EVT ScalarVT = VT.getScalarType();
689   return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
690 }
691 
692 bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
693                                                  ISD::LoadExtType,
694                                                  EVT NewVT) const {
695 
696   unsigned NewSize = NewVT.getStoreSizeInBits();
697 
698   // If we are reducing to a 32-bit load, this is always better.
699   if (NewSize == 32)
700     return true;
701 
702   EVT OldVT = N->getValueType(0);
703   unsigned OldSize = OldVT.getStoreSizeInBits();
704 
705   // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
706   // extloads, so doing one requires using a buffer_load. In cases where we
707   // still couldn't use a scalar load, using the wider load shouldn't really
708   // hurt anything.
709 
710   // If the old size already had to be an extload, there's no harm in continuing
711   // to reduce the width.
712   return (OldSize < 32);
713 }
714 
715 bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy,
716                                                    EVT CastTy) const {
717 
718   assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
719 
720   if (LoadTy.getScalarType() == MVT::i32)
721     return false;
722 
723   unsigned LScalarSize = LoadTy.getScalarSizeInBits();
724   unsigned CastScalarSize = CastTy.getScalarSizeInBits();
725 
726   return (LScalarSize < CastScalarSize) ||
727          (CastScalarSize >= 32);
728 }
729 
730 // SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
731 // profitable with the expansion for 64-bit since it's generally good to
732 // speculate things.
733 // FIXME: These should really have the size as a parameter.
734 bool AMDGPUTargetLowering::isCheapToSpeculateCttz() const {
735   return true;
736 }
737 
738 bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const {
739   return true;
740 }
741 
742 //===---------------------------------------------------------------------===//
743 // Target Properties
744 //===---------------------------------------------------------------------===//
745 
746 bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
747   assert(VT.isFloatingPoint());
748 
749   // Packed operations do not have a fabs modifier.
750   return VT == MVT::f32 || VT == MVT::f64 ||
751          (Subtarget->has16BitInsts() && VT == MVT::f16);
752 }
753 
754 bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
755   assert(VT.isFloatingPoint());
756   return VT == MVT::f32 || VT == MVT::f64 ||
757          (Subtarget->has16BitInsts() && VT == MVT::f16) ||
758          (Subtarget->hasVOP3PInsts() && VT == MVT::v2f16);
759 }
760 
761 bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT,
762                                                          unsigned NumElem,
763                                                          unsigned AS) const {
764   return true;
765 }
766 
767 bool AMDGPUTargetLowering::aggressivelyPreferBuildVectorSources(EVT VecVT) const {
768   // There are few operations which truly have vector input operands. Any vector
769   // operation is going to involve operations on each component, and a
770   // build_vector will be a copy per element, so it always makes sense to use a
771   // build_vector input in place of the extracted element to avoid a copy into a
772   // super register.
773   //
774   // We should probably only do this if all users are extracts only, but this
775   // should be the common case.
776   return true;
777 }
778 
779 bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
780   // Truncate is just accessing a subregister.
781 
782   unsigned SrcSize = Source.getSizeInBits();
783   unsigned DestSize = Dest.getSizeInBits();
784 
785   return DestSize < SrcSize && DestSize % 32 == 0 ;
786 }
787 
788 bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
789   // Truncate is just accessing a subregister.
790 
791   unsigned SrcSize = Source->getScalarSizeInBits();
792   unsigned DestSize = Dest->getScalarSizeInBits();
793 
794   if (DestSize== 16 && Subtarget->has16BitInsts())
795     return SrcSize >= 32;
796 
797   return DestSize < SrcSize && DestSize % 32 == 0;
798 }
799 
800 bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
801   unsigned SrcSize = Src->getScalarSizeInBits();
802   unsigned DestSize = Dest->getScalarSizeInBits();
803 
804   if (SrcSize == 16 && Subtarget->has16BitInsts())
805     return DestSize >= 32;
806 
807   return SrcSize == 32 && DestSize == 64;
808 }
809 
810 bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
811   // Any register load of a 64-bit value really requires 2 32-bit moves. For all
812   // practical purposes, the extra mov 0 to load a 64-bit is free.  As used,
813   // this will enable reducing 64-bit operations the 32-bit, which is always
814   // good.
815 
816   if (Src == MVT::i16)
817     return Dest == MVT::i32 ||Dest == MVT::i64 ;
818 
819   return Src == MVT::i32 && Dest == MVT::i64;
820 }
821 
822 bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
823   return isZExtFree(Val.getValueType(), VT2);
824 }
825 
826 bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
827   // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
828   // limited number of native 64-bit operations. Shrinking an operation to fit
829   // in a single 32-bit register should always be helpful. As currently used,
830   // this is much less general than the name suggests, and is only used in
831   // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
832   // not profitable, and may actually be harmful.
833   return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
834 }
835 
836 //===---------------------------------------------------------------------===//
837 // TargetLowering Callbacks
838 //===---------------------------------------------------------------------===//
839 
840 CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
841                                                   bool IsVarArg) {
842   switch (CC) {
843   case CallingConv::AMDGPU_KERNEL:
844   case CallingConv::SPIR_KERNEL:
845     return CC_AMDGPU_Kernel;
846   case CallingConv::AMDGPU_VS:
847   case CallingConv::AMDGPU_GS:
848   case CallingConv::AMDGPU_PS:
849   case CallingConv::AMDGPU_CS:
850   case CallingConv::AMDGPU_HS:
851     return CC_AMDGPU;
852   case CallingConv::C:
853   case CallingConv::Fast:
854   case CallingConv::Cold:
855     return CC_AMDGPU_Func;
856   default:
857     report_fatal_error("Unsupported calling convention.");
858   }
859 }
860 
861 CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
862                                                     bool IsVarArg) {
863   switch (CC) {
864   case CallingConv::AMDGPU_KERNEL:
865   case CallingConv::SPIR_KERNEL:
866     return CC_AMDGPU_Kernel;
867   case CallingConv::AMDGPU_VS:
868   case CallingConv::AMDGPU_GS:
869   case CallingConv::AMDGPU_PS:
870   case CallingConv::AMDGPU_CS:
871   case CallingConv::AMDGPU_HS:
872     return RetCC_SI_Shader;
873   case CallingConv::C:
874   case CallingConv::Fast:
875   case CallingConv::Cold:
876     return RetCC_AMDGPU_Func;
877   default:
878     report_fatal_error("Unsupported calling convention.");
879   }
880 }
881 
882 /// The SelectionDAGBuilder will automatically promote function arguments
883 /// with illegal types.  However, this does not work for the AMDGPU targets
884 /// since the function arguments are stored in memory as these illegal types.
885 /// In order to handle this properly we need to get the original types sizes
886 /// from the LLVM IR Function and fixup the ISD:InputArg values before
887 /// passing them to AnalyzeFormalArguments()
888 
889 /// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
890 /// input values across multiple registers.  Each item in the Ins array
891 /// represents a single value that will be stored in registers.  Ins[x].VT is
892 /// the value type of the value that will be stored in the register, so
893 /// whatever SDNode we lower the argument to needs to be this type.
894 ///
895 /// In order to correctly lower the arguments we need to know the size of each
896 /// argument.  Since Ins[x].VT gives us the size of the register that will
897 /// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
898 /// for the orignal function argument so that we can deduce the correct memory
899 /// type to use for Ins[x].  In most cases the correct memory type will be
900 /// Ins[x].ArgVT.  However, this will not always be the case.  If, for example,
901 /// we have a kernel argument of type v8i8, this argument will be split into
902 /// 8 parts and each part will be represented by its own item in the Ins array.
903 /// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
904 /// the argument before it was split.  From this, we deduce that the memory type
905 /// for each individual part is i8.  We pass the memory type as LocVT to the
906 /// calling convention analysis function and the register type (Ins[x].VT) as
907 /// the ValVT.
908 void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(CCState &State,
909                              const SmallVectorImpl<ISD::InputArg> &Ins) const {
910   for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
911     const ISD::InputArg &In = Ins[i];
912     EVT MemVT;
913 
914     unsigned NumRegs = getNumRegisters(State.getContext(), In.ArgVT);
915 
916     if (!Subtarget->isAmdHsaOS() &&
917         (In.ArgVT == MVT::i16 || In.ArgVT == MVT::i8 || In.ArgVT == MVT::f16)) {
918       // The ABI says the caller will extend these values to 32-bits.
919       MemVT = In.ArgVT.isInteger() ? MVT::i32 : MVT::f32;
920     } else if (NumRegs == 1) {
921       // This argument is not split, so the IR type is the memory type.
922       assert(!In.Flags.isSplit());
923       if (In.ArgVT.isExtended()) {
924         // We have an extended type, like i24, so we should just use the register type
925         MemVT = In.VT;
926       } else {
927         MemVT = In.ArgVT;
928       }
929     } else if (In.ArgVT.isVector() && In.VT.isVector() &&
930                In.ArgVT.getScalarType() == In.VT.getScalarType()) {
931       assert(In.ArgVT.getVectorNumElements() > In.VT.getVectorNumElements());
932       // We have a vector value which has been split into a vector with
933       // the same scalar type, but fewer elements.  This should handle
934       // all the floating-point vector types.
935       MemVT = In.VT;
936     } else if (In.ArgVT.isVector() &&
937                In.ArgVT.getVectorNumElements() == NumRegs) {
938       // This arg has been split so that each element is stored in a separate
939       // register.
940       MemVT = In.ArgVT.getScalarType();
941     } else if (In.ArgVT.isExtended()) {
942       // We have an extended type, like i65.
943       MemVT = In.VT;
944     } else {
945       unsigned MemoryBits = In.ArgVT.getStoreSizeInBits() / NumRegs;
946       assert(In.ArgVT.getStoreSizeInBits() % NumRegs == 0);
947       if (In.VT.isInteger()) {
948         MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
949       } else if (In.VT.isVector()) {
950         assert(!In.VT.getScalarType().isFloatingPoint());
951         unsigned NumElements = In.VT.getVectorNumElements();
952         assert(MemoryBits % NumElements == 0);
953         // This vector type has been split into another vector type with
954         // a different elements size.
955         EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
956                                          MemoryBits / NumElements);
957         MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
958       } else {
959         llvm_unreachable("cannot deduce memory type.");
960       }
961     }
962 
963     // Convert one element vectors to scalar.
964     if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
965       MemVT = MemVT.getScalarType();
966 
967     if (MemVT.isExtended()) {
968       // This should really only happen if we have vec3 arguments
969       assert(MemVT.isVector() && MemVT.getVectorNumElements() == 3);
970       MemVT = MemVT.getPow2VectorType(State.getContext());
971     }
972 
973     assert(MemVT.isSimple());
974     allocateKernArg(i, In.VT, MemVT.getSimpleVT(), CCValAssign::Full, In.Flags,
975                     State);
976   }
977 }
978 
979 SDValue AMDGPUTargetLowering::LowerReturn(
980   SDValue Chain, CallingConv::ID CallConv,
981   bool isVarArg,
982   const SmallVectorImpl<ISD::OutputArg> &Outs,
983   const SmallVectorImpl<SDValue> &OutVals,
984   const SDLoc &DL, SelectionDAG &DAG) const {
985   // FIXME: Fails for r600 tests
986   //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
987   // "wave terminate should not have return values");
988   return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
989 }
990 
991 //===---------------------------------------------------------------------===//
992 // Target specific lowering
993 //===---------------------------------------------------------------------===//
994 
995 /// Selects the correct CCAssignFn for a given CallingConvention value.
996 CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
997                                                     bool IsVarArg) {
998   return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
999 }
1000 
1001 CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
1002                                                       bool IsVarArg) {
1003   return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
1004 }
1005 
1006 SDValue AMDGPUTargetLowering::addTokenForArgument(SDValue Chain,
1007                                                   SelectionDAG &DAG,
1008                                                   MachineFrameInfo &MFI,
1009                                                   int ClobberedFI) const {
1010   SmallVector<SDValue, 8> ArgChains;
1011   int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
1012   int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
1013 
1014   // Include the original chain at the beginning of the list. When this is
1015   // used by target LowerCall hooks, this helps legalize find the
1016   // CALLSEQ_BEGIN node.
1017   ArgChains.push_back(Chain);
1018 
1019   // Add a chain value for each stack argument corresponding
1020   for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
1021                             UE = DAG.getEntryNode().getNode()->use_end();
1022        U != UE; ++U) {
1023     if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U)) {
1024       if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
1025         if (FI->getIndex() < 0) {
1026           int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
1027           int64_t InLastByte = InFirstByte;
1028           InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
1029 
1030           if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
1031               (FirstByte <= InFirstByte && InFirstByte <= LastByte))
1032             ArgChains.push_back(SDValue(L, 1));
1033         }
1034       }
1035     }
1036   }
1037 
1038   // Build a tokenfactor for all the chains.
1039   return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
1040 }
1041 
1042 SDValue AMDGPUTargetLowering::lowerUnhandledCall(CallLoweringInfo &CLI,
1043                                                  SmallVectorImpl<SDValue> &InVals,
1044                                                  StringRef Reason) const {
1045   SDValue Callee = CLI.Callee;
1046   SelectionDAG &DAG = CLI.DAG;
1047 
1048   const Function &Fn = *DAG.getMachineFunction().getFunction();
1049 
1050   StringRef FuncName("<unknown>");
1051 
1052   if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
1053     FuncName = G->getSymbol();
1054   else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
1055     FuncName = G->getGlobal()->getName();
1056 
1057   DiagnosticInfoUnsupported NoCalls(
1058     Fn, Reason + FuncName, CLI.DL.getDebugLoc());
1059   DAG.getContext()->diagnose(NoCalls);
1060 
1061   if (!CLI.IsTailCall) {
1062     for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
1063       InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
1064   }
1065 
1066   return DAG.getEntryNode();
1067 }
1068 
1069 SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
1070                                         SmallVectorImpl<SDValue> &InVals) const {
1071   return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
1072 }
1073 
1074 SDValue AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
1075                                                       SelectionDAG &DAG) const {
1076   const Function &Fn = *DAG.getMachineFunction().getFunction();
1077 
1078   DiagnosticInfoUnsupported NoDynamicAlloca(Fn, "unsupported dynamic alloca",
1079                                             SDLoc(Op).getDebugLoc());
1080   DAG.getContext()->diagnose(NoDynamicAlloca);
1081   auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
1082   return DAG.getMergeValues(Ops, SDLoc());
1083 }
1084 
1085 SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
1086                                              SelectionDAG &DAG) const {
1087   switch (Op.getOpcode()) {
1088   default:
1089     Op->print(errs(), &DAG);
1090     llvm_unreachable("Custom lowering code for this"
1091                      "instruction is not implemented yet!");
1092     break;
1093   case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
1094   case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
1095   case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
1096   case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
1097   case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
1098   case ISD::FREM: return LowerFREM(Op, DAG);
1099   case ISD::FCEIL: return LowerFCEIL(Op, DAG);
1100   case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
1101   case ISD::FRINT: return LowerFRINT(Op, DAG);
1102   case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
1103   case ISD::FROUND: return LowerFROUND(Op, DAG);
1104   case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
1105   case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
1106   case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
1107   case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
1108   case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
1109   case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
1110   case ISD::CTLZ:
1111   case ISD::CTLZ_ZERO_UNDEF:
1112     return LowerCTLZ(Op, DAG);
1113   case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
1114   }
1115   return Op;
1116 }
1117 
1118 void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
1119                                               SmallVectorImpl<SDValue> &Results,
1120                                               SelectionDAG &DAG) const {
1121   switch (N->getOpcode()) {
1122   case ISD::SIGN_EXTEND_INREG:
1123     // Different parts of legalization seem to interpret which type of
1124     // sign_extend_inreg is the one to check for custom lowering. The extended
1125     // from type is what really matters, but some places check for custom
1126     // lowering of the result type. This results in trying to use
1127     // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
1128     // nothing here and let the illegal result integer be handled normally.
1129     return;
1130   default:
1131     return;
1132   }
1133 }
1134 
1135 static bool hasDefinedInitializer(const GlobalValue *GV) {
1136   const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
1137   if (!GVar || !GVar->hasInitializer())
1138     return false;
1139 
1140   return !isa<UndefValue>(GVar->getInitializer());
1141 }
1142 
1143 SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
1144                                                  SDValue Op,
1145                                                  SelectionDAG &DAG) const {
1146 
1147   const DataLayout &DL = DAG.getDataLayout();
1148   GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
1149   const GlobalValue *GV = G->getGlobal();
1150 
1151   if  (G->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS) {
1152     // XXX: What does the value of G->getOffset() mean?
1153     assert(G->getOffset() == 0 &&
1154          "Do not know what to do with an non-zero offset");
1155 
1156     // TODO: We could emit code to handle the initialization somewhere.
1157     if (!hasDefinedInitializer(GV)) {
1158       unsigned Offset = MFI->allocateLDSGlobal(DL, *GV);
1159       return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
1160     }
1161   }
1162 
1163   const Function &Fn = *DAG.getMachineFunction().getFunction();
1164   DiagnosticInfoUnsupported BadInit(
1165       Fn, "unsupported initializer for address space", SDLoc(Op).getDebugLoc());
1166   DAG.getContext()->diagnose(BadInit);
1167   return SDValue();
1168 }
1169 
1170 SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
1171                                                   SelectionDAG &DAG) const {
1172   SmallVector<SDValue, 8> Args;
1173 
1174   for (const SDUse &U : Op->ops())
1175     DAG.ExtractVectorElements(U.get(), Args);
1176 
1177   return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
1178 }
1179 
1180 SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
1181                                                      SelectionDAG &DAG) const {
1182 
1183   SmallVector<SDValue, 8> Args;
1184   unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
1185   EVT VT = Op.getValueType();
1186   DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
1187                             VT.getVectorNumElements());
1188 
1189   return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
1190 }
1191 
1192 /// \brief Generate Min/Max node
1193 SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
1194                                                    SDValue LHS, SDValue RHS,
1195                                                    SDValue True, SDValue False,
1196                                                    SDValue CC,
1197                                                    DAGCombinerInfo &DCI) const {
1198   if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
1199     return SDValue();
1200 
1201   SelectionDAG &DAG = DCI.DAG;
1202   ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1203   switch (CCOpcode) {
1204   case ISD::SETOEQ:
1205   case ISD::SETONE:
1206   case ISD::SETUNE:
1207   case ISD::SETNE:
1208   case ISD::SETUEQ:
1209   case ISD::SETEQ:
1210   case ISD::SETFALSE:
1211   case ISD::SETFALSE2:
1212   case ISD::SETTRUE:
1213   case ISD::SETTRUE2:
1214   case ISD::SETUO:
1215   case ISD::SETO:
1216     break;
1217   case ISD::SETULE:
1218   case ISD::SETULT: {
1219     if (LHS == True)
1220       return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1221     return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1222   }
1223   case ISD::SETOLE:
1224   case ISD::SETOLT:
1225   case ISD::SETLE:
1226   case ISD::SETLT: {
1227     // Ordered. Assume ordered for undefined.
1228 
1229     // Only do this after legalization to avoid interfering with other combines
1230     // which might occur.
1231     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1232         !DCI.isCalledByLegalizer())
1233       return SDValue();
1234 
1235     // We need to permute the operands to get the correct NaN behavior. The
1236     // selected operand is the second one based on the failing compare with NaN,
1237     // so permute it based on the compare type the hardware uses.
1238     if (LHS == True)
1239       return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1240     return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1241   }
1242   case ISD::SETUGE:
1243   case ISD::SETUGT: {
1244     if (LHS == True)
1245       return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1246     return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1247   }
1248   case ISD::SETGT:
1249   case ISD::SETGE:
1250   case ISD::SETOGE:
1251   case ISD::SETOGT: {
1252     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG &&
1253         !DCI.isCalledByLegalizer())
1254       return SDValue();
1255 
1256     if (LHS == True)
1257       return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1258     return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1259   }
1260   case ISD::SETCC_INVALID:
1261     llvm_unreachable("Invalid setcc condcode!");
1262   }
1263   return SDValue();
1264 }
1265 
1266 std::pair<SDValue, SDValue>
1267 AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
1268   SDLoc SL(Op);
1269 
1270   SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1271 
1272   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1273   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1274 
1275   SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1276   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1277 
1278   return std::make_pair(Lo, Hi);
1279 }
1280 
1281 SDValue AMDGPUTargetLowering::getLoHalf64(SDValue Op, SelectionDAG &DAG) const {
1282   SDLoc SL(Op);
1283 
1284   SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1285   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1286   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1287 }
1288 
1289 SDValue AMDGPUTargetLowering::getHiHalf64(SDValue Op, SelectionDAG &DAG) const {
1290   SDLoc SL(Op);
1291 
1292   SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1293   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1294   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1295 }
1296 
1297 SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
1298                                               SelectionDAG &DAG) const {
1299   LoadSDNode *Load = cast<LoadSDNode>(Op);
1300   EVT VT = Op.getValueType();
1301 
1302 
1303   // If this is a 2 element vector, we really want to scalarize and not create
1304   // weird 1 element vectors.
1305   if (VT.getVectorNumElements() == 2)
1306     return scalarizeVectorLoad(Load, DAG);
1307 
1308   SDValue BasePtr = Load->getBasePtr();
1309   EVT PtrVT = BasePtr.getValueType();
1310   EVT MemVT = Load->getMemoryVT();
1311   SDLoc SL(Op);
1312 
1313   const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1314 
1315   EVT LoVT, HiVT;
1316   EVT LoMemVT, HiMemVT;
1317   SDValue Lo, Hi;
1318 
1319   std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
1320   std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT);
1321   std::tie(Lo, Hi) = DAG.SplitVector(Op, SL, LoVT, HiVT);
1322 
1323   unsigned Size = LoMemVT.getStoreSize();
1324   unsigned BaseAlign = Load->getAlignment();
1325   unsigned HiAlign = MinAlign(BaseAlign, Size);
1326 
1327   SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
1328                                   Load->getChain(), BasePtr, SrcValue, LoMemVT,
1329                                   BaseAlign, Load->getMemOperand()->getFlags());
1330   SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
1331                               DAG.getConstant(Size, SL, PtrVT));
1332   SDValue HiLoad =
1333       DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
1334                      HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
1335                      HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
1336 
1337   SDValue Ops[] = {
1338     DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad),
1339     DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
1340                 LoLoad.getValue(1), HiLoad.getValue(1))
1341   };
1342 
1343   return DAG.getMergeValues(Ops, SL);
1344 }
1345 
1346 SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
1347                                                SelectionDAG &DAG) const {
1348   StoreSDNode *Store = cast<StoreSDNode>(Op);
1349   SDValue Val = Store->getValue();
1350   EVT VT = Val.getValueType();
1351 
1352   // If this is a 2 element vector, we really want to scalarize and not create
1353   // weird 1 element vectors.
1354   if (VT.getVectorNumElements() == 2)
1355     return scalarizeVectorStore(Store, DAG);
1356 
1357   EVT MemVT = Store->getMemoryVT();
1358   SDValue Chain = Store->getChain();
1359   SDValue BasePtr = Store->getBasePtr();
1360   SDLoc SL(Op);
1361 
1362   EVT LoVT, HiVT;
1363   EVT LoMemVT, HiMemVT;
1364   SDValue Lo, Hi;
1365 
1366   std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
1367   std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemVT);
1368   std::tie(Lo, Hi) = DAG.SplitVector(Val, SL, LoVT, HiVT);
1369 
1370   EVT PtrVT = BasePtr.getValueType();
1371   SDValue HiPtr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
1372                               DAG.getConstant(LoMemVT.getStoreSize(), SL,
1373                                               PtrVT));
1374 
1375   const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1376   unsigned BaseAlign = Store->getAlignment();
1377   unsigned Size = LoMemVT.getStoreSize();
1378   unsigned HiAlign = MinAlign(BaseAlign, Size);
1379 
1380   SDValue LoStore =
1381       DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
1382                         Store->getMemOperand()->getFlags());
1383   SDValue HiStore =
1384       DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
1385                         HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
1386 
1387   return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
1388 }
1389 
1390 // This is a shortcut for integer division because we have fast i32<->f32
1391 // conversions, and fast f32 reciprocal instructions. The fractional part of a
1392 // float is enough to accurately represent up to a 24-bit signed integer.
1393 SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
1394                                             bool Sign) const {
1395   SDLoc DL(Op);
1396   EVT VT = Op.getValueType();
1397   SDValue LHS = Op.getOperand(0);
1398   SDValue RHS = Op.getOperand(1);
1399   MVT IntVT = MVT::i32;
1400   MVT FltVT = MVT::f32;
1401 
1402   unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
1403   if (LHSSignBits < 9)
1404     return SDValue();
1405 
1406   unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
1407   if (RHSSignBits < 9)
1408     return SDValue();
1409 
1410   unsigned BitSize = VT.getSizeInBits();
1411   unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1412   unsigned DivBits = BitSize - SignBits;
1413   if (Sign)
1414     ++DivBits;
1415 
1416   ISD::NodeType ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
1417   ISD::NodeType ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
1418 
1419   SDValue jq = DAG.getConstant(1, DL, IntVT);
1420 
1421   if (Sign) {
1422     // char|short jq = ia ^ ib;
1423     jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
1424 
1425     // jq = jq >> (bitsize - 2)
1426     jq = DAG.getNode(ISD::SRA, DL, VT, jq,
1427                      DAG.getConstant(BitSize - 2, DL, VT));
1428 
1429     // jq = jq | 0x1
1430     jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
1431   }
1432 
1433   // int ia = (int)LHS;
1434   SDValue ia = LHS;
1435 
1436   // int ib, (int)RHS;
1437   SDValue ib = RHS;
1438 
1439   // float fa = (float)ia;
1440   SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
1441 
1442   // float fb = (float)ib;
1443   SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
1444 
1445   SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
1446                            fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
1447 
1448   // fq = trunc(fq);
1449   fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
1450 
1451   // float fqneg = -fq;
1452   SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
1453 
1454   // float fr = mad(fqneg, fb, fa);
1455   unsigned OpCode = Subtarget->hasFP32Denormals() ?
1456                     (unsigned)AMDGPUISD::FMAD_FTZ :
1457                     (unsigned)ISD::FMAD;
1458   SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
1459 
1460   // int iq = (int)fq;
1461   SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
1462 
1463   // fr = fabs(fr);
1464   fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
1465 
1466   // fb = fabs(fb);
1467   fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
1468 
1469   EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
1470 
1471   // int cv = fr >= fb;
1472   SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
1473 
1474   // jq = (cv ? jq : 0);
1475   jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
1476 
1477   // dst = iq + jq;
1478   SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
1479 
1480   // Rem needs compensation, it's easier to recompute it
1481   SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
1482   Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
1483 
1484   // Truncate to number of bits this divide really is.
1485   if (Sign) {
1486     SDValue InRegSize
1487       = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
1488     Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
1489     Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
1490   } else {
1491     SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
1492     Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
1493     Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
1494   }
1495 
1496   return DAG.getMergeValues({ Div, Rem }, DL);
1497 }
1498 
1499 void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
1500                                       SelectionDAG &DAG,
1501                                       SmallVectorImpl<SDValue> &Results) const {
1502   assert(Op.getValueType() == MVT::i64);
1503 
1504   SDLoc DL(Op);
1505   EVT VT = Op.getValueType();
1506   EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
1507 
1508   SDValue one = DAG.getConstant(1, DL, HalfVT);
1509   SDValue zero = DAG.getConstant(0, DL, HalfVT);
1510 
1511   //HiLo split
1512   SDValue LHS = Op.getOperand(0);
1513   SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero);
1514   SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one);
1515 
1516   SDValue RHS = Op.getOperand(1);
1517   SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero);
1518   SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one);
1519 
1520   if (VT == MVT::i64 &&
1521     DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
1522     DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
1523 
1524     SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
1525                               LHS_Lo, RHS_Lo);
1526 
1527     SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), zero});
1528     SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), zero});
1529 
1530     Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
1531     Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
1532     return;
1533   }
1534 
1535   // Get Speculative values
1536   SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
1537   SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
1538 
1539   SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ);
1540   SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, zero});
1541   REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
1542 
1543   SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ);
1544   SDValue DIV_Lo = zero;
1545 
1546   const unsigned halfBitWidth = HalfVT.getSizeInBits();
1547 
1548   for (unsigned i = 0; i < halfBitWidth; ++i) {
1549     const unsigned bitPos = halfBitWidth - i - 1;
1550     SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
1551     // Get value of high bit
1552     SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
1553     HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one);
1554     HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
1555 
1556     // Shift
1557     REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
1558     // Add LHS high bit
1559     REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
1560 
1561     SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
1562     SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETUGE);
1563 
1564     DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
1565 
1566     // Update REM
1567     SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
1568     REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
1569   }
1570 
1571   SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
1572   DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
1573   Results.push_back(DIV);
1574   Results.push_back(REM);
1575 }
1576 
1577 SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
1578                                            SelectionDAG &DAG) const {
1579   SDLoc DL(Op);
1580   EVT VT = Op.getValueType();
1581 
1582   if (VT == MVT::i64) {
1583     SmallVector<SDValue, 2> Results;
1584     LowerUDIVREM64(Op, DAG, Results);
1585     return DAG.getMergeValues(Results, DL);
1586   }
1587 
1588   if (VT == MVT::i32) {
1589     if (SDValue Res = LowerDIVREM24(Op, DAG, false))
1590       return Res;
1591   }
1592 
1593   SDValue Num = Op.getOperand(0);
1594   SDValue Den = Op.getOperand(1);
1595 
1596   // RCP =  URECIP(Den) = 2^32 / Den + e
1597   // e is rounding error.
1598   SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den);
1599 
1600   // RCP_LO = mul(RCP, Den) */
1601   SDValue RCP_LO = DAG.getNode(ISD::MUL, DL, VT, RCP, Den);
1602 
1603   // RCP_HI = mulhu (RCP, Den) */
1604   SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den);
1605 
1606   // NEG_RCP_LO = -RCP_LO
1607   SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
1608                                                      RCP_LO);
1609 
1610   // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
1611   SDValue ABS_RCP_LO = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT),
1612                                            NEG_RCP_LO, RCP_LO,
1613                                            ISD::SETEQ);
1614   // Calculate the rounding error from the URECIP instruction
1615   // E = mulhu(ABS_RCP_LO, RCP)
1616   SDValue E = DAG.getNode(ISD::MULHU, DL, VT, ABS_RCP_LO, RCP);
1617 
1618   // RCP_A_E = RCP + E
1619   SDValue RCP_A_E = DAG.getNode(ISD::ADD, DL, VT, RCP, E);
1620 
1621   // RCP_S_E = RCP - E
1622   SDValue RCP_S_E = DAG.getNode(ISD::SUB, DL, VT, RCP, E);
1623 
1624   // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E)
1625   SDValue Tmp0 = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT),
1626                                      RCP_A_E, RCP_S_E,
1627                                      ISD::SETEQ);
1628   // Quotient = mulhu(Tmp0, Num)
1629   SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num);
1630 
1631   // Num_S_Remainder = Quotient * Den
1632   SDValue Num_S_Remainder = DAG.getNode(ISD::MUL, DL, VT, Quotient, Den);
1633 
1634   // Remainder = Num - Num_S_Remainder
1635   SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder);
1636 
1637   // Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
1638   SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den,
1639                                                  DAG.getConstant(-1, DL, VT),
1640                                                  DAG.getConstant(0, DL, VT),
1641                                                  ISD::SETUGE);
1642   // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0)
1643   SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Num,
1644                                                   Num_S_Remainder,
1645                                                   DAG.getConstant(-1, DL, VT),
1646                                                   DAG.getConstant(0, DL, VT),
1647                                                   ISD::SETUGE);
1648   // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
1649   SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den,
1650                                                Remainder_GE_Zero);
1651 
1652   // Calculate Division result:
1653 
1654   // Quotient_A_One = Quotient + 1
1655   SDValue Quotient_A_One = DAG.getNode(ISD::ADD, DL, VT, Quotient,
1656                                        DAG.getConstant(1, DL, VT));
1657 
1658   // Quotient_S_One = Quotient - 1
1659   SDValue Quotient_S_One = DAG.getNode(ISD::SUB, DL, VT, Quotient,
1660                                        DAG.getConstant(1, DL, VT));
1661 
1662   // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
1663   SDValue Div = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT),
1664                                      Quotient, Quotient_A_One, ISD::SETEQ);
1665 
1666   // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
1667   Div = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT),
1668                             Quotient_S_One, Div, ISD::SETEQ);
1669 
1670   // Calculate Rem result:
1671 
1672   // Remainder_S_Den = Remainder - Den
1673   SDValue Remainder_S_Den = DAG.getNode(ISD::SUB, DL, VT, Remainder, Den);
1674 
1675   // Remainder_A_Den = Remainder + Den
1676   SDValue Remainder_A_Den = DAG.getNode(ISD::ADD, DL, VT, Remainder, Den);
1677 
1678   // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
1679   SDValue Rem = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT),
1680                                     Remainder, Remainder_S_Den, ISD::SETEQ);
1681 
1682   // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
1683   Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT),
1684                             Remainder_A_Den, Rem, ISD::SETEQ);
1685   SDValue Ops[2] = {
1686     Div,
1687     Rem
1688   };
1689   return DAG.getMergeValues(Ops, DL);
1690 }
1691 
1692 SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
1693                                            SelectionDAG &DAG) const {
1694   SDLoc DL(Op);
1695   EVT VT = Op.getValueType();
1696 
1697   SDValue LHS = Op.getOperand(0);
1698   SDValue RHS = Op.getOperand(1);
1699 
1700   SDValue Zero = DAG.getConstant(0, DL, VT);
1701   SDValue NegOne = DAG.getConstant(-1, DL, VT);
1702 
1703   if (VT == MVT::i32) {
1704     if (SDValue Res = LowerDIVREM24(Op, DAG, true))
1705       return Res;
1706   }
1707 
1708   if (VT == MVT::i64 &&
1709       DAG.ComputeNumSignBits(LHS) > 32 &&
1710       DAG.ComputeNumSignBits(RHS) > 32) {
1711     EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
1712 
1713     //HiLo split
1714     SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
1715     SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
1716     SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
1717                                  LHS_Lo, RHS_Lo);
1718     SDValue Res[2] = {
1719       DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
1720       DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
1721     };
1722     return DAG.getMergeValues(Res, DL);
1723   }
1724 
1725   SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
1726   SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
1727   SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
1728   SDValue RSign = LHSign; // Remainder sign is the same as LHS
1729 
1730   LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
1731   RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
1732 
1733   LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
1734   RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
1735 
1736   SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
1737   SDValue Rem = Div.getValue(1);
1738 
1739   Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
1740   Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
1741 
1742   Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
1743   Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
1744 
1745   SDValue Res[2] = {
1746     Div,
1747     Rem
1748   };
1749   return DAG.getMergeValues(Res, DL);
1750 }
1751 
1752 // (frem x, y) -> (fsub x, (fmul (ftrunc (fdiv x, y)), y))
1753 SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
1754   SDLoc SL(Op);
1755   EVT VT = Op.getValueType();
1756   SDValue X = Op.getOperand(0);
1757   SDValue Y = Op.getOperand(1);
1758 
1759   // TODO: Should this propagate fast-math-flags?
1760 
1761   SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y);
1762   SDValue Floor = DAG.getNode(ISD::FTRUNC, SL, VT, Div);
1763   SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Floor, Y);
1764 
1765   return DAG.getNode(ISD::FSUB, SL, VT, X, Mul);
1766 }
1767 
1768 SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
1769   SDLoc SL(Op);
1770   SDValue Src = Op.getOperand(0);
1771 
1772   // result = trunc(src)
1773   // if (src > 0.0 && src != result)
1774   //   result += 1.0
1775 
1776   SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
1777 
1778   const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
1779   const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
1780 
1781   EVT SetCCVT =
1782       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
1783 
1784   SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
1785   SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
1786   SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
1787 
1788   SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
1789   // TODO: Should this propagate fast-math-flags?
1790   return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
1791 }
1792 
1793 static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL,
1794                                   SelectionDAG &DAG) {
1795   const unsigned FractBits = 52;
1796   const unsigned ExpBits = 11;
1797 
1798   SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
1799                                 Hi,
1800                                 DAG.getConstant(FractBits - 32, SL, MVT::i32),
1801                                 DAG.getConstant(ExpBits, SL, MVT::i32));
1802   SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
1803                             DAG.getConstant(1023, SL, MVT::i32));
1804 
1805   return Exp;
1806 }
1807 
1808 SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
1809   SDLoc SL(Op);
1810   SDValue Src = Op.getOperand(0);
1811 
1812   assert(Op.getValueType() == MVT::f64);
1813 
1814   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1815   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1816 
1817   SDValue VecSrc = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
1818 
1819   // Extract the upper half, since this is where we will find the sign and
1820   // exponent.
1821   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecSrc, One);
1822 
1823   SDValue Exp = extractF64Exponent(Hi, SL, DAG);
1824 
1825   const unsigned FractBits = 52;
1826 
1827   // Extract the sign bit.
1828   const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
1829   SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
1830 
1831   // Extend back to to 64-bits.
1832   SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
1833   SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
1834 
1835   SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
1836   const SDValue FractMask
1837     = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
1838 
1839   SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
1840   SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
1841   SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
1842 
1843   EVT SetCCVT =
1844       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
1845 
1846   const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
1847 
1848   SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
1849   SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
1850 
1851   SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
1852   SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
1853 
1854   return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
1855 }
1856 
1857 SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
1858   SDLoc SL(Op);
1859   SDValue Src = Op.getOperand(0);
1860 
1861   assert(Op.getValueType() == MVT::f64);
1862 
1863   APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
1864   SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
1865   SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
1866 
1867   // TODO: Should this propagate fast-math-flags?
1868 
1869   SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
1870   SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
1871 
1872   SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
1873 
1874   APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
1875   SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
1876 
1877   EVT SetCCVT =
1878       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
1879   SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
1880 
1881   return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
1882 }
1883 
1884 SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const {
1885   // FNEARBYINT and FRINT are the same, except in their handling of FP
1886   // exceptions. Those aren't really meaningful for us, and OpenCL only has
1887   // rint, so just treat them as equivalent.
1888   return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0));
1889 }
1890 
1891 // XXX - May require not supporting f32 denormals?
1892 
1893 // Don't handle v2f16. The extra instructions to scalarize and repack around the
1894 // compare and vselect end up producing worse code than scalarizing the whole
1895 // operation.
1896 SDValue AMDGPUTargetLowering::LowerFROUND32_16(SDValue Op, SelectionDAG &DAG) const {
1897   SDLoc SL(Op);
1898   SDValue X = Op.getOperand(0);
1899   EVT VT = Op.getValueType();
1900 
1901   SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
1902 
1903   // TODO: Should this propagate fast-math-flags?
1904 
1905   SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
1906 
1907   SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
1908 
1909   const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
1910   const SDValue One = DAG.getConstantFP(1.0, SL, VT);
1911   const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
1912 
1913   SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, VT, One, X);
1914 
1915   EVT SetCCVT =
1916       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
1917 
1918   SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
1919 
1920   SDValue Sel = DAG.getNode(ISD::SELECT, SL, VT, Cmp, SignOne, Zero);
1921 
1922   return DAG.getNode(ISD::FADD, SL, VT, T, Sel);
1923 }
1924 
1925 SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const {
1926   SDLoc SL(Op);
1927   SDValue X = Op.getOperand(0);
1928 
1929   SDValue L = DAG.getNode(ISD::BITCAST, SL, MVT::i64, X);
1930 
1931   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1932   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1933   const SDValue NegOne = DAG.getConstant(-1, SL, MVT::i32);
1934   const SDValue FiftyOne = DAG.getConstant(51, SL, MVT::i32);
1935   EVT SetCCVT =
1936       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
1937 
1938   SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
1939 
1940   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, One);
1941 
1942   SDValue Exp = extractF64Exponent(Hi, SL, DAG);
1943 
1944   const SDValue Mask = DAG.getConstant(INT64_C(0x000fffffffffffff), SL,
1945                                        MVT::i64);
1946 
1947   SDValue M = DAG.getNode(ISD::SRA, SL, MVT::i64, Mask, Exp);
1948   SDValue D = DAG.getNode(ISD::SRA, SL, MVT::i64,
1949                           DAG.getConstant(INT64_C(0x0008000000000000), SL,
1950                                           MVT::i64),
1951                           Exp);
1952 
1953   SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, L, M);
1954   SDValue Tmp1 = DAG.getSetCC(SL, SetCCVT,
1955                               DAG.getConstant(0, SL, MVT::i64), Tmp0,
1956                               ISD::SETNE);
1957 
1958   SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, Tmp1,
1959                              D, DAG.getConstant(0, SL, MVT::i64));
1960   SDValue K = DAG.getNode(ISD::ADD, SL, MVT::i64, L, Tmp2);
1961 
1962   K = DAG.getNode(ISD::AND, SL, MVT::i64, K, DAG.getNOT(SL, M, MVT::i64));
1963   K = DAG.getNode(ISD::BITCAST, SL, MVT::f64, K);
1964 
1965   SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
1966   SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
1967   SDValue ExpEqNegOne = DAG.getSetCC(SL, SetCCVT, NegOne, Exp, ISD::SETEQ);
1968 
1969   SDValue Mag = DAG.getNode(ISD::SELECT, SL, MVT::f64,
1970                             ExpEqNegOne,
1971                             DAG.getConstantFP(1.0, SL, MVT::f64),
1972                             DAG.getConstantFP(0.0, SL, MVT::f64));
1973 
1974   SDValue S = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, Mag, X);
1975 
1976   K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpLt0, S, K);
1977   K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpGt51, X, K);
1978 
1979   return K;
1980 }
1981 
1982 SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
1983   EVT VT = Op.getValueType();
1984 
1985   if (VT == MVT::f32 || VT == MVT::f16)
1986     return LowerFROUND32_16(Op, DAG);
1987 
1988   if (VT == MVT::f64)
1989     return LowerFROUND64(Op, DAG);
1990 
1991   llvm_unreachable("unhandled type");
1992 }
1993 
1994 SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
1995   SDLoc SL(Op);
1996   SDValue Src = Op.getOperand(0);
1997 
1998   // result = trunc(src);
1999   // if (src < 0.0 && src != result)
2000   //   result += -1.0.
2001 
2002   SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2003 
2004   const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2005   const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2006 
2007   EVT SetCCVT =
2008       getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2009 
2010   SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2011   SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2012   SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2013 
2014   SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2015   // TODO: Should this propagate fast-math-flags?
2016   return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2017 }
2018 
2019 SDValue AMDGPUTargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const {
2020   SDLoc SL(Op);
2021   SDValue Src = Op.getOperand(0);
2022   bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF;
2023 
2024   if (ZeroUndef && Src.getValueType() == MVT::i32)
2025     return DAG.getNode(AMDGPUISD::FFBH_U32, SL, MVT::i32, Src);
2026 
2027   SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
2028 
2029   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2030   const SDValue One = DAG.getConstant(1, SL, MVT::i32);
2031 
2032   SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
2033   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
2034 
2035   EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
2036                                    *DAG.getContext(), MVT::i32);
2037 
2038   SDValue Hi0 = DAG.getSetCC(SL, SetCCVT, Hi, Zero, ISD::SETEQ);
2039 
2040   SDValue CtlzLo = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i32, Lo);
2041   SDValue CtlzHi = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i32, Hi);
2042 
2043   const SDValue Bits32 = DAG.getConstant(32, SL, MVT::i32);
2044   SDValue Add = DAG.getNode(ISD::ADD, SL, MVT::i32, CtlzLo, Bits32);
2045 
2046   // ctlz(x) = hi_32(x) == 0 ? ctlz(lo_32(x)) + 32 : ctlz(hi_32(x))
2047   SDValue NewCtlz = DAG.getNode(ISD::SELECT, SL, MVT::i32, Hi0, Add, CtlzHi);
2048 
2049   if (!ZeroUndef) {
2050     // Test if the full 64-bit input is zero.
2051 
2052     // FIXME: DAG combines turn what should be an s_and_b64 into a v_or_b32,
2053     // which we probably don't want.
2054     SDValue Lo0 = DAG.getSetCC(SL, SetCCVT, Lo, Zero, ISD::SETEQ);
2055     SDValue SrcIsZero = DAG.getNode(ISD::AND, SL, SetCCVT, Lo0, Hi0);
2056 
2057     // TODO: If i64 setcc is half rate, it can result in 1 fewer instruction
2058     // with the same cycles, otherwise it is slower.
2059     // SDValue SrcIsZero = DAG.getSetCC(SL, SetCCVT, Src,
2060     // DAG.getConstant(0, SL, MVT::i64), ISD::SETEQ);
2061 
2062     const SDValue Bits32 = DAG.getConstant(64, SL, MVT::i32);
2063 
2064     // The instruction returns -1 for 0 input, but the defined intrinsic
2065     // behavior is to return the number of bits.
2066     NewCtlz = DAG.getNode(ISD::SELECT, SL, MVT::i32,
2067                           SrcIsZero, Bits32, NewCtlz);
2068   }
2069 
2070   return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewCtlz);
2071 }
2072 
2073 SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
2074                                                bool Signed) const {
2075   // Unsigned
2076   // cul2f(ulong u)
2077   //{
2078   //  uint lz = clz(u);
2079   //  uint e = (u != 0) ? 127U + 63U - lz : 0;
2080   //  u = (u << lz) & 0x7fffffffffffffffUL;
2081   //  ulong t = u & 0xffffffffffUL;
2082   //  uint v = (e << 23) | (uint)(u >> 40);
2083   //  uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U);
2084   //  return as_float(v + r);
2085   //}
2086   // Signed
2087   // cl2f(long l)
2088   //{
2089   //  long s = l >> 63;
2090   //  float r = cul2f((l + s) ^ s);
2091   //  return s ? -r : r;
2092   //}
2093 
2094   SDLoc SL(Op);
2095   SDValue Src = Op.getOperand(0);
2096   SDValue L = Src;
2097 
2098   SDValue S;
2099   if (Signed) {
2100     const SDValue SignBit = DAG.getConstant(63, SL, MVT::i64);
2101     S = DAG.getNode(ISD::SRA, SL, MVT::i64, L, SignBit);
2102 
2103     SDValue LPlusS = DAG.getNode(ISD::ADD, SL, MVT::i64, L, S);
2104     L = DAG.getNode(ISD::XOR, SL, MVT::i64, LPlusS, S);
2105   }
2106 
2107   EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(),
2108                                    *DAG.getContext(), MVT::f32);
2109 
2110 
2111   SDValue ZeroI32 = DAG.getConstant(0, SL, MVT::i32);
2112   SDValue ZeroI64 = DAG.getConstant(0, SL, MVT::i64);
2113   SDValue LZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SL, MVT::i64, L);
2114   LZ = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LZ);
2115 
2116   SDValue K = DAG.getConstant(127U + 63U, SL, MVT::i32);
2117   SDValue E = DAG.getSelect(SL, MVT::i32,
2118     DAG.getSetCC(SL, SetCCVT, L, ZeroI64, ISD::SETNE),
2119     DAG.getNode(ISD::SUB, SL, MVT::i32, K, LZ),
2120     ZeroI32);
2121 
2122   SDValue U = DAG.getNode(ISD::AND, SL, MVT::i64,
2123     DAG.getNode(ISD::SHL, SL, MVT::i64, L, LZ),
2124     DAG.getConstant((-1ULL) >> 1, SL, MVT::i64));
2125 
2126   SDValue T = DAG.getNode(ISD::AND, SL, MVT::i64, U,
2127                           DAG.getConstant(0xffffffffffULL, SL, MVT::i64));
2128 
2129   SDValue UShl = DAG.getNode(ISD::SRL, SL, MVT::i64,
2130                              U, DAG.getConstant(40, SL, MVT::i64));
2131 
2132   SDValue V = DAG.getNode(ISD::OR, SL, MVT::i32,
2133     DAG.getNode(ISD::SHL, SL, MVT::i32, E, DAG.getConstant(23, SL, MVT::i32)),
2134     DAG.getNode(ISD::TRUNCATE, SL, MVT::i32,  UShl));
2135 
2136   SDValue C = DAG.getConstant(0x8000000000ULL, SL, MVT::i64);
2137   SDValue RCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETUGT);
2138   SDValue TCmp = DAG.getSetCC(SL, SetCCVT, T, C, ISD::SETEQ);
2139 
2140   SDValue One = DAG.getConstant(1, SL, MVT::i32);
2141 
2142   SDValue VTrunc1 = DAG.getNode(ISD::AND, SL, MVT::i32, V, One);
2143 
2144   SDValue R = DAG.getSelect(SL, MVT::i32,
2145     RCmp,
2146     One,
2147     DAG.getSelect(SL, MVT::i32, TCmp, VTrunc1, ZeroI32));
2148   R = DAG.getNode(ISD::ADD, SL, MVT::i32, V, R);
2149   R = DAG.getNode(ISD::BITCAST, SL, MVT::f32, R);
2150 
2151   if (!Signed)
2152     return R;
2153 
2154   SDValue RNeg = DAG.getNode(ISD::FNEG, SL, MVT::f32, R);
2155   return DAG.getSelect(SL, MVT::f32, DAG.getSExtOrTrunc(S, SL, SetCCVT), RNeg, R);
2156 }
2157 
2158 SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
2159                                                bool Signed) const {
2160   SDLoc SL(Op);
2161   SDValue Src = Op.getOperand(0);
2162 
2163   SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Src);
2164 
2165   SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC,
2166                            DAG.getConstant(0, SL, MVT::i32));
2167   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC,
2168                            DAG.getConstant(1, SL, MVT::i32));
2169 
2170   SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
2171                               SL, MVT::f64, Hi);
2172 
2173   SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
2174 
2175   SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi,
2176                               DAG.getConstant(32, SL, MVT::i32));
2177   // TODO: Should this propagate fast-math-flags?
2178   return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
2179 }
2180 
2181 SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
2182                                                SelectionDAG &DAG) const {
2183   assert(Op.getOperand(0).getValueType() == MVT::i64 &&
2184          "operation should be legal");
2185 
2186   // TODO: Factor out code common with LowerSINT_TO_FP.
2187 
2188   EVT DestVT = Op.getValueType();
2189   if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
2190     SDLoc DL(Op);
2191     SDValue Src = Op.getOperand(0);
2192 
2193     SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
2194     SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
2195     SDValue FPRound =
2196         DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
2197 
2198     return FPRound;
2199   }
2200 
2201   if (DestVT == MVT::f32)
2202     return LowerINT_TO_FP32(Op, DAG, false);
2203 
2204   assert(DestVT == MVT::f64);
2205   return LowerINT_TO_FP64(Op, DAG, false);
2206 }
2207 
2208 SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
2209                                               SelectionDAG &DAG) const {
2210   assert(Op.getOperand(0).getValueType() == MVT::i64 &&
2211          "operation should be legal");
2212 
2213   // TODO: Factor out code common with LowerUINT_TO_FP.
2214 
2215   EVT DestVT = Op.getValueType();
2216   if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
2217     SDLoc DL(Op);
2218     SDValue Src = Op.getOperand(0);
2219 
2220     SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
2221     SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SDLoc(Op));
2222     SDValue FPRound =
2223         DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
2224 
2225     return FPRound;
2226   }
2227 
2228   if (DestVT == MVT::f32)
2229     return LowerINT_TO_FP32(Op, DAG, true);
2230 
2231   assert(DestVT == MVT::f64);
2232   return LowerINT_TO_FP64(Op, DAG, true);
2233 }
2234 
2235 SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG,
2236                                                bool Signed) const {
2237   SDLoc SL(Op);
2238 
2239   SDValue Src = Op.getOperand(0);
2240 
2241   SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2242 
2243   SDValue K0 = DAG.getConstantFP(BitsToDouble(UINT64_C(0x3df0000000000000)), SL,
2244                                  MVT::f64);
2245   SDValue K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(0xc1f0000000000000)), SL,
2246                                  MVT::f64);
2247   // TODO: Should this propagate fast-math-flags?
2248   SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, Trunc, K0);
2249 
2250   SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, MVT::f64, Mul);
2251 
2252 
2253   SDValue Fma = DAG.getNode(ISD::FMA, SL, MVT::f64, FloorMul, K1, Trunc);
2254 
2255   SDValue Hi = DAG.getNode(Signed ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, SL,
2256                            MVT::i32, FloorMul);
2257   SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
2258 
2259   SDValue Result = DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi});
2260 
2261   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Result);
2262 }
2263 
2264 SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
2265   SDLoc DL(Op);
2266   SDValue N0 = Op.getOperand(0);
2267 
2268   // Convert to target node to get known bits
2269   if (N0.getValueType() == MVT::f32)
2270     return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
2271 
2272   if (getTargetMachine().Options.UnsafeFPMath) {
2273     // There is a generic expand for FP_TO_FP16 with unsafe fast math.
2274     return SDValue();
2275   }
2276 
2277   assert(N0.getSimpleValueType() == MVT::f64);
2278 
2279   // f64 -> f16 conversion using round-to-nearest-even rounding mode.
2280   const unsigned ExpMask = 0x7ff;
2281   const unsigned ExpBiasf64 = 1023;
2282   const unsigned ExpBiasf16 = 15;
2283   SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
2284   SDValue One = DAG.getConstant(1, DL, MVT::i32);
2285   SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, N0);
2286   SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
2287                            DAG.getConstant(32, DL, MVT::i64));
2288   UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
2289   U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
2290   SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2291                           DAG.getConstant(20, DL, MVT::i64));
2292   E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
2293                   DAG.getConstant(ExpMask, DL, MVT::i32));
2294   // Subtract the fp64 exponent bias (1023) to get the real exponent and
2295   // add the f16 bias (15) to get the biased exponent for the f16 format.
2296   E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
2297                   DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
2298 
2299   SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2300                           DAG.getConstant(8, DL, MVT::i32));
2301   M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
2302                   DAG.getConstant(0xffe, DL, MVT::i32));
2303 
2304   SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
2305                                   DAG.getConstant(0x1ff, DL, MVT::i32));
2306   MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
2307 
2308   SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
2309   M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
2310 
2311   // (M != 0 ? 0x0200 : 0) | 0x7c00;
2312   SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
2313       DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
2314                       Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
2315 
2316   // N = M | (E << 12);
2317   SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2318       DAG.getNode(ISD::SHL, DL, MVT::i32, E,
2319                   DAG.getConstant(12, DL, MVT::i32)));
2320 
2321   // B = clamp(1-E, 0, 13);
2322   SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
2323                                   One, E);
2324   SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
2325   B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
2326                   DAG.getConstant(13, DL, MVT::i32));
2327 
2328   SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
2329                                    DAG.getConstant(0x1000, DL, MVT::i32));
2330 
2331   SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
2332   SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
2333   SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
2334   D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
2335 
2336   SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
2337   SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
2338                               DAG.getConstant(0x7, DL, MVT::i32));
2339   V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
2340                   DAG.getConstant(2, DL, MVT::i32));
2341   SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
2342                                One, Zero, ISD::SETEQ);
2343   SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
2344                                One, Zero, ISD::SETGT);
2345   V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
2346   V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
2347 
2348   V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
2349                       DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
2350   V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
2351                       I, V, ISD::SETEQ);
2352 
2353   // Extract the sign bit.
2354   SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
2355                             DAG.getConstant(16, DL, MVT::i32));
2356   Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
2357                      DAG.getConstant(0x8000, DL, MVT::i32));
2358 
2359   V = DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
2360   return DAG.getZExtOrTrunc(V, DL, Op.getValueType());
2361 }
2362 
2363 SDValue AMDGPUTargetLowering::LowerFP_TO_SINT(SDValue Op,
2364                                               SelectionDAG &DAG) const {
2365   SDValue Src = Op.getOperand(0);
2366 
2367   // TODO: Factor out code common with LowerFP_TO_UINT.
2368 
2369   EVT SrcVT = Src.getValueType();
2370   if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) {
2371     SDLoc DL(Op);
2372 
2373     SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
2374     SDValue FpToInt32 =
2375         DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend);
2376 
2377     return FpToInt32;
2378   }
2379 
2380   if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
2381     return LowerFP64_TO_INT(Op, DAG, true);
2382 
2383   return SDValue();
2384 }
2385 
2386 SDValue AMDGPUTargetLowering::LowerFP_TO_UINT(SDValue Op,
2387                                               SelectionDAG &DAG) const {
2388   SDValue Src = Op.getOperand(0);
2389 
2390   // TODO: Factor out code common with LowerFP_TO_SINT.
2391 
2392   EVT SrcVT = Src.getValueType();
2393   if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) {
2394     SDLoc DL(Op);
2395 
2396     SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
2397     SDValue FpToInt32 =
2398         DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend);
2399 
2400     return FpToInt32;
2401   }
2402 
2403   if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
2404     return LowerFP64_TO_INT(Op, DAG, false);
2405 
2406   return SDValue();
2407 }
2408 
2409 SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
2410                                                      SelectionDAG &DAG) const {
2411   EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
2412   MVT VT = Op.getSimpleValueType();
2413   MVT ScalarVT = VT.getScalarType();
2414 
2415   assert(VT.isVector());
2416 
2417   SDValue Src = Op.getOperand(0);
2418   SDLoc DL(Op);
2419 
2420   // TODO: Don't scalarize on Evergreen?
2421   unsigned NElts = VT.getVectorNumElements();
2422   SmallVector<SDValue, 8> Args;
2423   DAG.ExtractVectorElements(Src, Args, 0, NElts);
2424 
2425   SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
2426   for (unsigned I = 0; I < NElts; ++I)
2427     Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
2428 
2429   return DAG.getBuildVector(VT, DL, Args);
2430 }
2431 
2432 //===----------------------------------------------------------------------===//
2433 // Custom DAG optimizations
2434 //===----------------------------------------------------------------------===//
2435 
2436 static bool isU24(SDValue Op, SelectionDAG &DAG) {
2437   KnownBits Known;
2438   EVT VT = Op.getValueType();
2439   DAG.computeKnownBits(Op, Known);
2440 
2441   return (VT.getSizeInBits() - Known.countMinLeadingZeros()) <= 24;
2442 }
2443 
2444 static bool isI24(SDValue Op, SelectionDAG &DAG) {
2445   EVT VT = Op.getValueType();
2446 
2447   // In order for this to be a signed 24-bit value, bit 23, must
2448   // be a sign bit.
2449   return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
2450                                      // as unsigned 24-bit values.
2451          (VT.getSizeInBits() - DAG.ComputeNumSignBits(Op)) < 24;
2452 }
2453 
2454 static bool simplifyI24(SDNode *Node24, unsigned OpIdx,
2455                         TargetLowering::DAGCombinerInfo &DCI) {
2456 
2457   SelectionDAG &DAG = DCI.DAG;
2458   SDValue Op = Node24->getOperand(OpIdx);
2459   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
2460   EVT VT = Op.getValueType();
2461 
2462   APInt Demanded = APInt::getLowBitsSet(VT.getSizeInBits(), 24);
2463   APInt KnownZero, KnownOne;
2464   TargetLowering::TargetLoweringOpt TLO(DAG, true, true);
2465   if (TLI.SimplifyDemandedBits(Node24, OpIdx, Demanded, DCI, TLO))
2466     return true;
2467 
2468   return false;
2469 }
2470 
2471 template <typename IntTy>
2472 static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
2473                                uint32_t Width, const SDLoc &DL) {
2474   if (Width + Offset < 32) {
2475     uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
2476     IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
2477     return DAG.getConstant(Result, DL, MVT::i32);
2478   }
2479 
2480   return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
2481 }
2482 
2483 static bool hasVolatileUser(SDNode *Val) {
2484   for (SDNode *U : Val->uses()) {
2485     if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
2486       if (M->isVolatile())
2487         return true;
2488     }
2489   }
2490 
2491   return false;
2492 }
2493 
2494 bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
2495   // i32 vectors are the canonical memory type.
2496   if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
2497     return false;
2498 
2499   if (!VT.isByteSized())
2500     return false;
2501 
2502   unsigned Size = VT.getStoreSize();
2503 
2504   if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
2505     return false;
2506 
2507   if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
2508     return false;
2509 
2510   return true;
2511 }
2512 
2513 // Replace load of an illegal type with a store of a bitcast to a friendlier
2514 // type.
2515 SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
2516                                                  DAGCombinerInfo &DCI) const {
2517   if (!DCI.isBeforeLegalize())
2518     return SDValue();
2519 
2520   LoadSDNode *LN = cast<LoadSDNode>(N);
2521   if (LN->isVolatile() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
2522     return SDValue();
2523 
2524   SDLoc SL(N);
2525   SelectionDAG &DAG = DCI.DAG;
2526   EVT VT = LN->getMemoryVT();
2527 
2528   unsigned Size = VT.getStoreSize();
2529   unsigned Align = LN->getAlignment();
2530   if (Align < Size && isTypeLegal(VT)) {
2531     bool IsFast;
2532     unsigned AS = LN->getAddressSpace();
2533 
2534     // Expand unaligned loads earlier than legalization. Due to visitation order
2535     // problems during legalization, the emitted instructions to pack and unpack
2536     // the bytes again are not eliminated in the case of an unaligned copy.
2537     if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)) {
2538       if (VT.isVector())
2539         return scalarizeVectorLoad(LN, DAG);
2540 
2541       SDValue Ops[2];
2542       std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
2543       return DAG.getMergeValues(Ops, SDLoc(N));
2544     }
2545 
2546     if (!IsFast)
2547       return SDValue();
2548   }
2549 
2550   if (!shouldCombineMemoryType(VT))
2551     return SDValue();
2552 
2553   EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
2554 
2555   SDValue NewLoad
2556     = DAG.getLoad(NewVT, SL, LN->getChain(),
2557                   LN->getBasePtr(), LN->getMemOperand());
2558 
2559   SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
2560   DCI.CombineTo(N, BC, NewLoad.getValue(1));
2561   return SDValue(N, 0);
2562 }
2563 
2564 // Replace store of an illegal type with a store of a bitcast to a friendlier
2565 // type.
2566 SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
2567                                                   DAGCombinerInfo &DCI) const {
2568   if (!DCI.isBeforeLegalize())
2569     return SDValue();
2570 
2571   StoreSDNode *SN = cast<StoreSDNode>(N);
2572   if (SN->isVolatile() || !ISD::isNormalStore(SN))
2573     return SDValue();
2574 
2575   EVT VT = SN->getMemoryVT();
2576   unsigned Size = VT.getStoreSize();
2577 
2578   SDLoc SL(N);
2579   SelectionDAG &DAG = DCI.DAG;
2580   unsigned Align = SN->getAlignment();
2581   if (Align < Size && isTypeLegal(VT)) {
2582     bool IsFast;
2583     unsigned AS = SN->getAddressSpace();
2584 
2585     // Expand unaligned stores earlier than legalization. Due to visitation
2586     // order problems during legalization, the emitted instructions to pack and
2587     // unpack the bytes again are not eliminated in the case of an unaligned
2588     // copy.
2589     if (!allowsMisalignedMemoryAccesses(VT, AS, Align, &IsFast)) {
2590       if (VT.isVector())
2591         return scalarizeVectorStore(SN, DAG);
2592 
2593       return expandUnalignedStore(SN, DAG);
2594     }
2595 
2596     if (!IsFast)
2597       return SDValue();
2598   }
2599 
2600   if (!shouldCombineMemoryType(VT))
2601     return SDValue();
2602 
2603   EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
2604   SDValue Val = SN->getValue();
2605 
2606   //DCI.AddToWorklist(Val.getNode());
2607 
2608   bool OtherUses = !Val.hasOneUse();
2609   SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
2610   if (OtherUses) {
2611     SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
2612     DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
2613   }
2614 
2615   return DAG.getStore(SN->getChain(), SL, CastVal,
2616                       SN->getBasePtr(), SN->getMemOperand());
2617 }
2618 
2619 SDValue AMDGPUTargetLowering::performClampCombine(SDNode *N,
2620                                                   DAGCombinerInfo &DCI) const {
2621   ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
2622   if (!CSrc)
2623     return SDValue();
2624 
2625   const APFloat &F = CSrc->getValueAPF();
2626   APFloat Zero = APFloat::getZero(F.getSemantics());
2627   APFloat::cmpResult Cmp0 = F.compare(Zero);
2628   if (Cmp0 == APFloat::cmpLessThan ||
2629       (Cmp0 == APFloat::cmpUnordered && Subtarget->enableDX10Clamp())) {
2630     return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
2631   }
2632 
2633   APFloat One(F.getSemantics(), "1.0");
2634   APFloat::cmpResult Cmp1 = F.compare(One);
2635   if (Cmp1 == APFloat::cmpGreaterThan)
2636     return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
2637 
2638   return SDValue(CSrc, 0);
2639 }
2640 
2641 // FIXME: This should go in generic DAG combiner with an isTruncateFree check,
2642 // but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
2643 // issues.
2644 SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
2645                                                         DAGCombinerInfo &DCI) const {
2646   SelectionDAG &DAG = DCI.DAG;
2647   SDValue N0 = N->getOperand(0);
2648 
2649   // (vt2 (assertzext (truncate vt0:x), vt1)) ->
2650   //     (vt2 (truncate (assertzext vt0:x, vt1)))
2651   if (N0.getOpcode() == ISD::TRUNCATE) {
2652     SDValue N1 = N->getOperand(1);
2653     EVT ExtVT = cast<VTSDNode>(N1)->getVT();
2654     SDLoc SL(N);
2655 
2656     SDValue Src = N0.getOperand(0);
2657     EVT SrcVT = Src.getValueType();
2658     if (SrcVT.bitsGE(ExtVT)) {
2659       SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
2660       return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
2661     }
2662   }
2663 
2664   return SDValue();
2665 }
2666 /// Split the 64-bit value \p LHS into two 32-bit components, and perform the
2667 /// binary operation \p Opc to it with the corresponding constant operands.
2668 SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
2669   DAGCombinerInfo &DCI, const SDLoc &SL,
2670   unsigned Opc, SDValue LHS,
2671   uint32_t ValLo, uint32_t ValHi) const {
2672   SelectionDAG &DAG = DCI.DAG;
2673   SDValue Lo, Hi;
2674   std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
2675 
2676   SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
2677   SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
2678 
2679   SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
2680   SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
2681 
2682   // Re-visit the ands. It's possible we eliminated one of them and it could
2683   // simplify the vector.
2684   DCI.AddToWorklist(Lo.getNode());
2685   DCI.AddToWorklist(Hi.getNode());
2686 
2687   SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
2688   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
2689 }
2690 
2691 SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
2692                                                 DAGCombinerInfo &DCI) const {
2693   EVT VT = N->getValueType(0);
2694 
2695   ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
2696   if (!RHS)
2697     return SDValue();
2698 
2699   SDValue LHS = N->getOperand(0);
2700   unsigned RHSVal = RHS->getZExtValue();
2701   if (!RHSVal)
2702     return LHS;
2703 
2704   SDLoc SL(N);
2705   SelectionDAG &DAG = DCI.DAG;
2706 
2707   switch (LHS->getOpcode()) {
2708   default:
2709     break;
2710   case ISD::ZERO_EXTEND:
2711   case ISD::SIGN_EXTEND:
2712   case ISD::ANY_EXTEND: {
2713     SDValue X = LHS->getOperand(0);
2714 
2715     if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
2716         isTypeLegal(MVT::v2i16)) {
2717       // Prefer build_vector as the canonical form if packed types are legal.
2718       // (shl ([asz]ext i16:x), 16 -> build_vector 0, x
2719       SDValue Vec = DAG.getBuildVector(MVT::v2i16, SL,
2720        { DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0) });
2721       return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
2722     }
2723 
2724     // shl (ext x) => zext (shl x), if shift does not overflow int
2725     if (VT != MVT::i64)
2726       break;
2727     KnownBits Known;
2728     DAG.computeKnownBits(X, Known);
2729     unsigned LZ = Known.countMinLeadingZeros();
2730     if (LZ < RHSVal)
2731       break;
2732     EVT XVT = X.getValueType();
2733     SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0));
2734     return DAG.getZExtOrTrunc(Shl, SL, VT);
2735   }
2736   case ISD::OR:
2737     if (!isOrEquivalentToAdd(DAG, LHS))
2738       break;
2739     LLVM_FALLTHROUGH;
2740   case ISD::ADD: {
2741     // shl (or|add x, c2), c1 => or|add (shl x, c1), (c2 << c1)
2742     if (ConstantSDNode *C2 = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
2743       SDValue Shl = DAG.getNode(ISD::SHL, SL, VT, LHS->getOperand(0),
2744                                 SDValue(RHS, 0));
2745       SDValue C2V = DAG.getConstant(C2->getAPIntValue() << RHSVal,
2746                                     SDLoc(C2), VT);
2747       return DAG.getNode(LHS->getOpcode(), SL, VT, Shl, C2V);
2748     }
2749     break;
2750   }
2751   }
2752 
2753   if (VT != MVT::i64)
2754     return SDValue();
2755 
2756   // i64 (shl x, C) -> (build_pair 0, (shl x, C -32))
2757 
2758   // On some subtargets, 64-bit shift is a quarter rate instruction. In the
2759   // common case, splitting this into a move and a 32-bit shift is faster and
2760   // the same code size.
2761   if (RHSVal < 32)
2762     return SDValue();
2763 
2764   SDValue ShiftAmt = DAG.getConstant(RHSVal - 32, SL, MVT::i32);
2765 
2766   SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
2767   SDValue NewShift = DAG.getNode(ISD::SHL, SL, MVT::i32, Lo, ShiftAmt);
2768 
2769   const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2770 
2771   SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {Zero, NewShift});
2772   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
2773 }
2774 
2775 SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
2776                                                 DAGCombinerInfo &DCI) const {
2777   if (N->getValueType(0) != MVT::i64)
2778     return SDValue();
2779 
2780   const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
2781   if (!RHS)
2782     return SDValue();
2783 
2784   SelectionDAG &DAG = DCI.DAG;
2785   SDLoc SL(N);
2786   unsigned RHSVal = RHS->getZExtValue();
2787 
2788   // (sra i64:x, 32) -> build_pair x, (sra hi_32(x), 31)
2789   if (RHSVal == 32) {
2790     SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
2791     SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
2792                                    DAG.getConstant(31, SL, MVT::i32));
2793 
2794     SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {Hi, NewShift});
2795     return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
2796   }
2797 
2798   // (sra i64:x, 63) -> build_pair (sra hi_32(x), 31), (sra hi_32(x), 31)
2799   if (RHSVal == 63) {
2800     SDValue Hi = getHiHalf64(N->getOperand(0), DAG);
2801     SDValue NewShift = DAG.getNode(ISD::SRA, SL, MVT::i32, Hi,
2802                                    DAG.getConstant(31, SL, MVT::i32));
2803     SDValue BuildVec = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, NewShift});
2804     return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildVec);
2805   }
2806 
2807   return SDValue();
2808 }
2809 
2810 SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
2811                                                 DAGCombinerInfo &DCI) const {
2812   if (N->getValueType(0) != MVT::i64)
2813     return SDValue();
2814 
2815   const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
2816   if (!RHS)
2817     return SDValue();
2818 
2819   unsigned ShiftAmt = RHS->getZExtValue();
2820   if (ShiftAmt < 32)
2821     return SDValue();
2822 
2823   // srl i64:x, C for C >= 32
2824   // =>
2825   //   build_pair (srl hi_32(x), C - 32), 0
2826 
2827   SelectionDAG &DAG = DCI.DAG;
2828   SDLoc SL(N);
2829 
2830   SDValue One = DAG.getConstant(1, SL, MVT::i32);
2831   SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2832 
2833   SDValue VecOp = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, N->getOperand(0));
2834   SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
2835                            VecOp, One);
2836 
2837   SDValue NewConst = DAG.getConstant(ShiftAmt - 32, SL, MVT::i32);
2838   SDValue NewShift = DAG.getNode(ISD::SRL, SL, MVT::i32, Hi, NewConst);
2839 
2840   SDValue BuildPair = DAG.getBuildVector(MVT::v2i32, SL, {NewShift, Zero});
2841 
2842   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, BuildPair);
2843 }
2844 
2845 // We need to specifically handle i64 mul here to avoid unnecessary conversion
2846 // instructions. If we only match on the legalized i64 mul expansion,
2847 // SimplifyDemandedBits will be unable to remove them because there will be
2848 // multiple uses due to the separate mul + mulh[su].
2849 static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
2850                         SDValue N0, SDValue N1, unsigned Size, bool Signed) {
2851   if (Size <= 32) {
2852     unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
2853     return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
2854   }
2855 
2856   // Because we want to eliminate extension instructions before the
2857   // operation, we need to create a single user here (i.e. not the separate
2858   // mul_lo + mul_hi) so that SimplifyDemandedBits will deal with it.
2859 
2860   unsigned MulOpc = Signed ? AMDGPUISD::MUL_LOHI_I24 : AMDGPUISD::MUL_LOHI_U24;
2861 
2862   SDValue Mul = DAG.getNode(MulOpc, SL,
2863                             DAG.getVTList(MVT::i32, MVT::i32), N0, N1);
2864 
2865   return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64,
2866                      Mul.getValue(0), Mul.getValue(1));
2867 }
2868 
2869 SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
2870                                                 DAGCombinerInfo &DCI) const {
2871   EVT VT = N->getValueType(0);
2872 
2873   unsigned Size = VT.getSizeInBits();
2874   if (VT.isVector() || Size > 64)
2875     return SDValue();
2876 
2877   // There are i16 integer mul/mad.
2878   if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
2879     return SDValue();
2880 
2881   SelectionDAG &DAG = DCI.DAG;
2882   SDLoc DL(N);
2883 
2884   SDValue N0 = N->getOperand(0);
2885   SDValue N1 = N->getOperand(1);
2886   SDValue Mul;
2887 
2888   if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
2889     N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
2890     N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
2891     Mul = getMul24(DAG, DL, N0, N1, Size, false);
2892   } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
2893     N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
2894     N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
2895     Mul = getMul24(DAG, DL, N0, N1, Size, true);
2896   } else {
2897     return SDValue();
2898   }
2899 
2900   // We need to use sext even for MUL_U24, because MUL_U24 is used
2901   // for signed multiply of 8 and 16-bit types.
2902   return DAG.getSExtOrTrunc(Mul, DL, VT);
2903 }
2904 
2905 SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
2906                                                   DAGCombinerInfo &DCI) const {
2907   EVT VT = N->getValueType(0);
2908 
2909   if (!Subtarget->hasMulI24() || VT.isVector())
2910     return SDValue();
2911 
2912   SelectionDAG &DAG = DCI.DAG;
2913   SDLoc DL(N);
2914 
2915   SDValue N0 = N->getOperand(0);
2916   SDValue N1 = N->getOperand(1);
2917 
2918   if (!isI24(N0, DAG) || !isI24(N1, DAG))
2919     return SDValue();
2920 
2921   N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
2922   N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
2923 
2924   SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
2925   DCI.AddToWorklist(Mulhi.getNode());
2926   return DAG.getSExtOrTrunc(Mulhi, DL, VT);
2927 }
2928 
2929 SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
2930                                                   DAGCombinerInfo &DCI) const {
2931   EVT VT = N->getValueType(0);
2932 
2933   if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
2934     return SDValue();
2935 
2936   SelectionDAG &DAG = DCI.DAG;
2937   SDLoc DL(N);
2938 
2939   SDValue N0 = N->getOperand(0);
2940   SDValue N1 = N->getOperand(1);
2941 
2942   if (!isU24(N0, DAG) || !isU24(N1, DAG))
2943     return SDValue();
2944 
2945   N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
2946   N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
2947 
2948   SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
2949   DCI.AddToWorklist(Mulhi.getNode());
2950   return DAG.getZExtOrTrunc(Mulhi, DL, VT);
2951 }
2952 
2953 SDValue AMDGPUTargetLowering::performMulLoHi24Combine(
2954   SDNode *N, DAGCombinerInfo &DCI) const {
2955   SelectionDAG &DAG = DCI.DAG;
2956 
2957   // Simplify demanded bits before splitting into multiple users.
2958   if (simplifyI24(N, 0, DCI) || simplifyI24(N, 1, DCI))
2959     return SDValue();
2960 
2961   SDValue N0 = N->getOperand(0);
2962   SDValue N1 = N->getOperand(1);
2963 
2964   bool Signed = (N->getOpcode() == AMDGPUISD::MUL_LOHI_I24);
2965 
2966   unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
2967   unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
2968 
2969   SDLoc SL(N);
2970 
2971   SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
2972   SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
2973   return DAG.getMergeValues({ MulLo, MulHi }, SL);
2974 }
2975 
2976 static bool isNegativeOne(SDValue Val) {
2977   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val))
2978     return C->isAllOnesValue();
2979   return false;
2980 }
2981 
2982 static bool isCtlzOpc(unsigned Opc) {
2983   return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
2984 }
2985 
2986 SDValue AMDGPUTargetLowering::getFFBH_U32(SelectionDAG &DAG,
2987                                           SDValue Op,
2988                                           const SDLoc &DL) const {
2989   EVT VT = Op.getValueType();
2990   EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
2991   if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
2992                               LegalVT != MVT::i16))
2993     return SDValue();
2994 
2995   if (VT != MVT::i32)
2996     Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
2997 
2998   SDValue FFBH = DAG.getNode(AMDGPUISD::FFBH_U32, DL, MVT::i32, Op);
2999   if (VT != MVT::i32)
3000     FFBH = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBH);
3001 
3002   return FFBH;
3003 }
3004 
3005 // The native instructions return -1 on 0 input. Optimize out a select that
3006 // produces -1 on 0.
3007 //
3008 // TODO: If zero is not undef, we could also do this if the output is compared
3009 // against the bitwidth.
3010 //
3011 // TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
3012 SDValue AMDGPUTargetLowering::performCtlzCombine(const SDLoc &SL, SDValue Cond,
3013                                                  SDValue LHS, SDValue RHS,
3014                                                  DAGCombinerInfo &DCI) const {
3015   ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
3016   if (!CmpRhs || !CmpRhs->isNullValue())
3017     return SDValue();
3018 
3019   SelectionDAG &DAG = DCI.DAG;
3020   ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
3021   SDValue CmpLHS = Cond.getOperand(0);
3022 
3023   // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
3024   if (CCOpcode == ISD::SETEQ &&
3025       isCtlzOpc(RHS.getOpcode()) &&
3026       RHS.getOperand(0) == CmpLHS &&
3027       isNegativeOne(LHS)) {
3028     return getFFBH_U32(DAG, CmpLHS, SL);
3029   }
3030 
3031   // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
3032   if (CCOpcode == ISD::SETNE &&
3033       isCtlzOpc(LHS.getOpcode()) &&
3034       LHS.getOperand(0) == CmpLHS &&
3035       isNegativeOne(RHS)) {
3036     return getFFBH_U32(DAG, CmpLHS, SL);
3037   }
3038 
3039   return SDValue();
3040 }
3041 
3042 static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
3043                                          unsigned Op,
3044                                          const SDLoc &SL,
3045                                          SDValue Cond,
3046                                          SDValue N1,
3047                                          SDValue N2) {
3048   SelectionDAG &DAG = DCI.DAG;
3049   EVT VT = N1.getValueType();
3050 
3051   SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
3052                                   N1.getOperand(0), N2.getOperand(0));
3053   DCI.AddToWorklist(NewSelect.getNode());
3054   return DAG.getNode(Op, SL, VT, NewSelect);
3055 }
3056 
3057 // Pull a free FP operation out of a select so it may fold into uses.
3058 //
3059 // select c, (fneg x), (fneg y) -> fneg (select c, x, y)
3060 // select c, (fneg x), k -> fneg (select c, x, (fneg k))
3061 //
3062 // select c, (fabs x), (fabs y) -> fabs (select c, x, y)
3063 // select c, (fabs x), +k -> fabs (select c, x, k)
3064 static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
3065                                     SDValue N) {
3066   SelectionDAG &DAG = DCI.DAG;
3067   SDValue Cond = N.getOperand(0);
3068   SDValue LHS = N.getOperand(1);
3069   SDValue RHS = N.getOperand(2);
3070 
3071   EVT VT = N.getValueType();
3072   if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
3073       (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
3074     return distributeOpThroughSelect(DCI, LHS.getOpcode(),
3075                                      SDLoc(N), Cond, LHS, RHS);
3076   }
3077 
3078   bool Inv = false;
3079   if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
3080     std::swap(LHS, RHS);
3081     Inv = true;
3082   }
3083 
3084   // TODO: Support vector constants.
3085   ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
3086   if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS) {
3087     SDLoc SL(N);
3088     // If one side is an fneg/fabs and the other is a constant, we can push the
3089     // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
3090     SDValue NewLHS = LHS.getOperand(0);
3091     SDValue NewRHS = RHS;
3092 
3093     // Careful: if the neg can be folded up, don't try to pull it back down.
3094     bool ShouldFoldNeg = true;
3095 
3096     if (NewLHS.hasOneUse()) {
3097       unsigned Opc = NewLHS.getOpcode();
3098       if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(Opc))
3099         ShouldFoldNeg = false;
3100       if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
3101         ShouldFoldNeg = false;
3102     }
3103 
3104     if (ShouldFoldNeg) {
3105       if (LHS.getOpcode() == ISD::FNEG)
3106         NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3107       else if (CRHS->isNegative())
3108         return SDValue();
3109 
3110       if (Inv)
3111         std::swap(NewLHS, NewRHS);
3112 
3113       SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
3114                                       Cond, NewLHS, NewRHS);
3115       DCI.AddToWorklist(NewSelect.getNode());
3116       return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
3117     }
3118   }
3119 
3120   return SDValue();
3121 }
3122 
3123 
3124 SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
3125                                                    DAGCombinerInfo &DCI) const {
3126   if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
3127     return Folded;
3128 
3129   SDValue Cond = N->getOperand(0);
3130   if (Cond.getOpcode() != ISD::SETCC)
3131     return SDValue();
3132 
3133   EVT VT = N->getValueType(0);
3134   SDValue LHS = Cond.getOperand(0);
3135   SDValue RHS = Cond.getOperand(1);
3136   SDValue CC = Cond.getOperand(2);
3137 
3138   SDValue True = N->getOperand(1);
3139   SDValue False = N->getOperand(2);
3140 
3141   if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
3142     SelectionDAG &DAG = DCI.DAG;
3143     if ((DAG.isConstantValueOfAnyType(True) ||
3144          DAG.isConstantValueOfAnyType(True)) &&
3145         (!DAG.isConstantValueOfAnyType(False) &&
3146          !DAG.isConstantValueOfAnyType(False))) {
3147       // Swap cmp + select pair to move constant to false input.
3148       // This will allow using VOPC cndmasks more often.
3149       // select (setcc x, y), k, x -> select (setcc y, x) x, x
3150 
3151       SDLoc SL(N);
3152       ISD::CondCode NewCC = getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
3153                                             LHS.getValueType().isInteger());
3154 
3155       SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
3156       return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
3157     }
3158 
3159     if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
3160       SDValue MinMax
3161         = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
3162       // Revisit this node so we can catch min3/max3/med3 patterns.
3163       //DCI.AddToWorklist(MinMax.getNode());
3164       return MinMax;
3165     }
3166   }
3167 
3168   // There's no reason to not do this if the condition has other uses.
3169   return performCtlzCombine(SDLoc(N), Cond, True, False, DCI);
3170 }
3171 
3172 static bool isConstantFPZero(SDValue N) {
3173   if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
3174     return C->isZero() && !C->isNegative();
3175   return false;
3176 }
3177 
3178 static unsigned inverseMinMax(unsigned Opc) {
3179   switch (Opc) {
3180   case ISD::FMAXNUM:
3181     return ISD::FMINNUM;
3182   case ISD::FMINNUM:
3183     return ISD::FMAXNUM;
3184   case AMDGPUISD::FMAX_LEGACY:
3185     return AMDGPUISD::FMIN_LEGACY;
3186   case AMDGPUISD::FMIN_LEGACY:
3187     return  AMDGPUISD::FMAX_LEGACY;
3188   default:
3189     llvm_unreachable("invalid min/max opcode");
3190   }
3191 }
3192 
3193 SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
3194                                                  DAGCombinerInfo &DCI) const {
3195   SelectionDAG &DAG = DCI.DAG;
3196   SDValue N0 = N->getOperand(0);
3197   EVT VT = N->getValueType(0);
3198 
3199   unsigned Opc = N0.getOpcode();
3200 
3201   // If the input has multiple uses and we can either fold the negate down, or
3202   // the other uses cannot, give up. This both prevents unprofitable
3203   // transformations and infinite loops: we won't repeatedly try to fold around
3204   // a negate that has no 'good' form.
3205   if (N0.hasOneUse()) {
3206     // This may be able to fold into the source, but at a code size cost. Don't
3207     // fold if the fold into the user is free.
3208     if (allUsesHaveSourceMods(N, 0))
3209       return SDValue();
3210   } else {
3211     if (fnegFoldsIntoOp(Opc) &&
3212         (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode())))
3213       return SDValue();
3214   }
3215 
3216   SDLoc SL(N);
3217   switch (Opc) {
3218   case ISD::FADD: {
3219     if (!mayIgnoreSignedZero(N0))
3220       return SDValue();
3221 
3222     // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
3223     SDValue LHS = N0.getOperand(0);
3224     SDValue RHS = N0.getOperand(1);
3225 
3226     if (LHS.getOpcode() != ISD::FNEG)
3227       LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
3228     else
3229       LHS = LHS.getOperand(0);
3230 
3231     if (RHS.getOpcode() != ISD::FNEG)
3232       RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3233     else
3234       RHS = RHS.getOperand(0);
3235 
3236     SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
3237     if (!N0.hasOneUse())
3238       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3239     return Res;
3240   }
3241   case ISD::FMUL:
3242   case AMDGPUISD::FMUL_LEGACY: {
3243     // (fneg (fmul x, y)) -> (fmul x, (fneg y))
3244     // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
3245     SDValue LHS = N0.getOperand(0);
3246     SDValue RHS = N0.getOperand(1);
3247 
3248     if (LHS.getOpcode() == ISD::FNEG)
3249       LHS = LHS.getOperand(0);
3250     else if (RHS.getOpcode() == ISD::FNEG)
3251       RHS = RHS.getOperand(0);
3252     else
3253       RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3254 
3255     SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
3256     if (!N0.hasOneUse())
3257       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3258     return Res;
3259   }
3260   case ISD::FMA:
3261   case ISD::FMAD: {
3262     if (!mayIgnoreSignedZero(N0))
3263       return SDValue();
3264 
3265     // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
3266     SDValue LHS = N0.getOperand(0);
3267     SDValue MHS = N0.getOperand(1);
3268     SDValue RHS = N0.getOperand(2);
3269 
3270     if (LHS.getOpcode() == ISD::FNEG)
3271       LHS = LHS.getOperand(0);
3272     else if (MHS.getOpcode() == ISD::FNEG)
3273       MHS = MHS.getOperand(0);
3274     else
3275       MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
3276 
3277     if (RHS.getOpcode() != ISD::FNEG)
3278       RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3279     else
3280       RHS = RHS.getOperand(0);
3281 
3282     SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
3283     if (!N0.hasOneUse())
3284       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3285     return Res;
3286   }
3287   case ISD::FMAXNUM:
3288   case ISD::FMINNUM:
3289   case AMDGPUISD::FMAX_LEGACY:
3290   case AMDGPUISD::FMIN_LEGACY: {
3291     // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
3292     // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
3293     // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
3294     // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
3295 
3296     SDValue LHS = N0.getOperand(0);
3297     SDValue RHS = N0.getOperand(1);
3298 
3299     // 0 doesn't have a negated inline immediate.
3300     // TODO: Shouldn't fold 1/2pi either, and should be generalized to other
3301     // operations.
3302     if (isConstantFPZero(RHS))
3303       return SDValue();
3304 
3305     SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
3306     SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
3307     unsigned Opposite = inverseMinMax(Opc);
3308 
3309     SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
3310     if (!N0.hasOneUse())
3311       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
3312     return Res;
3313   }
3314   case ISD::FP_EXTEND:
3315   case ISD::FTRUNC:
3316   case ISD::FRINT:
3317   case ISD::FNEARBYINT: // XXX - Should fround be handled?
3318   case ISD::FSIN:
3319   case AMDGPUISD::RCP:
3320   case AMDGPUISD::RCP_LEGACY:
3321   case AMDGPUISD::SIN_HW: {
3322     SDValue CvtSrc = N0.getOperand(0);
3323     if (CvtSrc.getOpcode() == ISD::FNEG) {
3324       // (fneg (fp_extend (fneg x))) -> (fp_extend x)
3325       // (fneg (rcp (fneg x))) -> (rcp x)
3326       return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
3327     }
3328 
3329     if (!N0.hasOneUse())
3330       return SDValue();
3331 
3332     // (fneg (fp_extend x)) -> (fp_extend (fneg x))
3333     // (fneg (rcp x)) -> (rcp (fneg x))
3334     SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
3335     return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
3336   }
3337   case ISD::FP_ROUND: {
3338     SDValue CvtSrc = N0.getOperand(0);
3339 
3340     if (CvtSrc.getOpcode() == ISD::FNEG) {
3341       // (fneg (fp_round (fneg x))) -> (fp_round x)
3342       return DAG.getNode(ISD::FP_ROUND, SL, VT,
3343                          CvtSrc.getOperand(0), N0.getOperand(1));
3344     }
3345 
3346     if (!N0.hasOneUse())
3347       return SDValue();
3348 
3349     // (fneg (fp_round x)) -> (fp_round (fneg x))
3350     SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
3351     return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
3352   }
3353   case ISD::FP16_TO_FP: {
3354     // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
3355     // f16, but legalization of f16 fneg ends up pulling it out of the source.
3356     // Put the fneg back as a legal source operation that can be matched later.
3357     SDLoc SL(N);
3358 
3359     SDValue Src = N0.getOperand(0);
3360     EVT SrcVT = Src.getValueType();
3361 
3362     // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
3363     SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
3364                                   DAG.getConstant(0x8000, SL, SrcVT));
3365     return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
3366   }
3367   default:
3368     return SDValue();
3369   }
3370 }
3371 
3372 SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
3373                                                  DAGCombinerInfo &DCI) const {
3374   SelectionDAG &DAG = DCI.DAG;
3375   SDValue N0 = N->getOperand(0);
3376 
3377   if (!N0.hasOneUse())
3378     return SDValue();
3379 
3380   switch (N0.getOpcode()) {
3381   case ISD::FP16_TO_FP: {
3382     assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
3383     SDLoc SL(N);
3384     SDValue Src = N0.getOperand(0);
3385     EVT SrcVT = Src.getValueType();
3386 
3387     // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
3388     SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
3389                                   DAG.getConstant(0x7fff, SL, SrcVT));
3390     return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
3391   }
3392   default:
3393     return SDValue();
3394   }
3395 }
3396 
3397 SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
3398                                                 DAGCombinerInfo &DCI) const {
3399   SelectionDAG &DAG = DCI.DAG;
3400   SDLoc DL(N);
3401 
3402   switch(N->getOpcode()) {
3403   default:
3404     break;
3405   case ISD::BITCAST: {
3406     EVT DestVT = N->getValueType(0);
3407 
3408     // Push casts through vector builds. This helps avoid emitting a large
3409     // number of copies when materializing floating point vector constants.
3410     //
3411     // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
3412     //   vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
3413     if (DestVT.isVector()) {
3414       SDValue Src = N->getOperand(0);
3415       if (Src.getOpcode() == ISD::BUILD_VECTOR) {
3416         EVT SrcVT = Src.getValueType();
3417         unsigned NElts = DestVT.getVectorNumElements();
3418 
3419         if (SrcVT.getVectorNumElements() == NElts) {
3420           EVT DestEltVT = DestVT.getVectorElementType();
3421 
3422           SmallVector<SDValue, 8> CastedElts;
3423           SDLoc SL(N);
3424           for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
3425             SDValue Elt = Src.getOperand(I);
3426             CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
3427           }
3428 
3429           return DAG.getBuildVector(DestVT, SL, CastedElts);
3430         }
3431       }
3432     }
3433 
3434     if (DestVT.getSizeInBits() != 64 && !DestVT.isVector())
3435       break;
3436 
3437     // Fold bitcasts of constants.
3438     //
3439     // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
3440     // TODO: Generalize and move to DAGCombiner
3441     SDValue Src = N->getOperand(0);
3442     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
3443       assert(Src.getValueType() == MVT::i64);
3444       SDLoc SL(N);
3445       uint64_t CVal = C->getZExtValue();
3446       return DAG.getNode(ISD::BUILD_VECTOR, SL, DestVT,
3447                          DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
3448                          DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
3449     }
3450 
3451     if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
3452       const APInt &Val = C->getValueAPF().bitcastToAPInt();
3453       SDLoc SL(N);
3454       uint64_t CVal = Val.getZExtValue();
3455       SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
3456                                 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
3457                                 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
3458 
3459       return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
3460     }
3461 
3462     break;
3463   }
3464   case ISD::SHL: {
3465     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
3466       break;
3467 
3468     return performShlCombine(N, DCI);
3469   }
3470   case ISD::SRL: {
3471     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
3472       break;
3473 
3474     return performSrlCombine(N, DCI);
3475   }
3476   case ISD::SRA: {
3477     if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
3478       break;
3479 
3480     return performSraCombine(N, DCI);
3481   }
3482   case ISD::MUL:
3483     return performMulCombine(N, DCI);
3484   case ISD::MULHS:
3485     return performMulhsCombine(N, DCI);
3486   case ISD::MULHU:
3487     return performMulhuCombine(N, DCI);
3488   case AMDGPUISD::MUL_I24:
3489   case AMDGPUISD::MUL_U24:
3490   case AMDGPUISD::MULHI_I24:
3491   case AMDGPUISD::MULHI_U24: {
3492     // If the first call to simplify is successfull, then N may end up being
3493     // deleted, so we shouldn't call simplifyI24 again.
3494     simplifyI24(N, 0, DCI) || simplifyI24(N, 1, DCI);
3495     return SDValue();
3496   }
3497   case AMDGPUISD::MUL_LOHI_I24:
3498   case AMDGPUISD::MUL_LOHI_U24:
3499     return performMulLoHi24Combine(N, DCI);
3500   case ISD::SELECT:
3501     return performSelectCombine(N, DCI);
3502   case ISD::FNEG:
3503     return performFNegCombine(N, DCI);
3504   case ISD::FABS:
3505     return performFAbsCombine(N, DCI);
3506   case AMDGPUISD::BFE_I32:
3507   case AMDGPUISD::BFE_U32: {
3508     assert(!N->getValueType(0).isVector() &&
3509            "Vector handling of BFE not implemented");
3510     ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
3511     if (!Width)
3512       break;
3513 
3514     uint32_t WidthVal = Width->getZExtValue() & 0x1f;
3515     if (WidthVal == 0)
3516       return DAG.getConstant(0, DL, MVT::i32);
3517 
3518     ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
3519     if (!Offset)
3520       break;
3521 
3522     SDValue BitsFrom = N->getOperand(0);
3523     uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
3524 
3525     bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
3526 
3527     if (OffsetVal == 0) {
3528       // This is already sign / zero extended, so try to fold away extra BFEs.
3529       unsigned SignBits =  Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
3530 
3531       unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
3532       if (OpSignBits >= SignBits)
3533         return BitsFrom;
3534 
3535       EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
3536       if (Signed) {
3537         // This is a sign_extend_inreg. Replace it to take advantage of existing
3538         // DAG Combines. If not eliminated, we will match back to BFE during
3539         // selection.
3540 
3541         // TODO: The sext_inreg of extended types ends, although we can could
3542         // handle them in a single BFE.
3543         return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
3544                            DAG.getValueType(SmallVT));
3545       }
3546 
3547       return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
3548     }
3549 
3550     if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
3551       if (Signed) {
3552         return constantFoldBFE<int32_t>(DAG,
3553                                         CVal->getSExtValue(),
3554                                         OffsetVal,
3555                                         WidthVal,
3556                                         DL);
3557       }
3558 
3559       return constantFoldBFE<uint32_t>(DAG,
3560                                        CVal->getZExtValue(),
3561                                        OffsetVal,
3562                                        WidthVal,
3563                                        DL);
3564     }
3565 
3566     if ((OffsetVal + WidthVal) >= 32 &&
3567         !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
3568       SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
3569       return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
3570                          BitsFrom, ShiftVal);
3571     }
3572 
3573     if (BitsFrom.hasOneUse()) {
3574       APInt Demanded = APInt::getBitsSet(32,
3575                                          OffsetVal,
3576                                          OffsetVal + WidthVal);
3577 
3578       KnownBits Known;
3579       TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
3580                                             !DCI.isBeforeLegalizeOps());
3581       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
3582       if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
3583           TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
3584         DCI.CommitTargetLoweringOpt(TLO);
3585       }
3586     }
3587 
3588     break;
3589   }
3590   case ISD::LOAD:
3591     return performLoadCombine(N, DCI);
3592   case ISD::STORE:
3593     return performStoreCombine(N, DCI);
3594   case AMDGPUISD::CLAMP:
3595     return performClampCombine(N, DCI);
3596   case AMDGPUISD::RCP: {
3597     if (const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) {
3598       // XXX - Should this flush denormals?
3599       const APFloat &Val = CFP->getValueAPF();
3600       APFloat One(Val.getSemantics(), "1.0");
3601       return DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
3602     }
3603 
3604     break;
3605   }
3606   case ISD::AssertZext:
3607   case ISD::AssertSext:
3608     return performAssertSZExtCombine(N, DCI);
3609   }
3610   return SDValue();
3611 }
3612 
3613 //===----------------------------------------------------------------------===//
3614 // Helper functions
3615 //===----------------------------------------------------------------------===//
3616 
3617 SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
3618                                                    const TargetRegisterClass *RC,
3619                                                    unsigned Reg, EVT VT,
3620                                                    const SDLoc &SL,
3621                                                    bool RawReg) const {
3622   MachineFunction &MF = DAG.getMachineFunction();
3623   MachineRegisterInfo &MRI = MF.getRegInfo();
3624   unsigned VReg;
3625 
3626   if (!MRI.isLiveIn(Reg)) {
3627     VReg = MRI.createVirtualRegister(RC);
3628     MRI.addLiveIn(Reg, VReg);
3629   } else {
3630     VReg = MRI.getLiveInVirtReg(Reg);
3631   }
3632 
3633   if (RawReg)
3634     return DAG.getRegister(VReg, VT);
3635 
3636   return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
3637 }
3638 
3639 SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
3640                                                   EVT VT,
3641                                                   const SDLoc &SL,
3642                                                   int64_t Offset) const {
3643   MachineFunction &MF = DAG.getMachineFunction();
3644   MachineFrameInfo &MFI = MF.getFrameInfo();
3645 
3646   int FI = MFI.CreateFixedObject(VT.getStoreSize(), Offset, true);
3647   auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
3648   SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
3649 
3650   return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, 4,
3651                      MachineMemOperand::MODereferenceable |
3652                      MachineMemOperand::MOInvariant);
3653 }
3654 
3655 SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
3656                                                    const SDLoc &SL,
3657                                                    SDValue Chain,
3658                                                    SDValue StackPtr,
3659                                                    SDValue ArgVal,
3660                                                    int64_t Offset) const {
3661   MachineFunction &MF = DAG.getMachineFunction();
3662   MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
3663   SDValue PtrOffset = DAG.getConstant(Offset, SL, MVT::i32);
3664   SDValue Ptr = DAG.getNode(ISD::ADD, SL, MVT::i32, StackPtr, PtrOffset);
3665 
3666   SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, 4,
3667                                MachineMemOperand::MODereferenceable);
3668   return Store;
3669 }
3670 
3671 SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
3672                                              const TargetRegisterClass *RC,
3673                                              EVT VT, const SDLoc &SL,
3674                                              const ArgDescriptor &Arg) const {
3675   assert(Arg && "Attempting to load missing argument");
3676 
3677   if (Arg.isRegister())
3678     return CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL);
3679   return loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
3680 }
3681 
3682 uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
3683     const AMDGPUMachineFunction *MFI, const ImplicitParameter Param) const {
3684   unsigned Alignment = Subtarget->getAlignmentForImplicitArgPtr();
3685   uint64_t ArgOffset = alignTo(MFI->getABIArgOffset(), Alignment);
3686   switch (Param) {
3687   case GRID_DIM:
3688     return ArgOffset;
3689   case GRID_OFFSET:
3690     return ArgOffset + 4;
3691   }
3692   llvm_unreachable("unexpected implicit parameter type");
3693 }
3694 
3695 #define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
3696 
3697 const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
3698   switch ((AMDGPUISD::NodeType)Opcode) {
3699   case AMDGPUISD::FIRST_NUMBER: break;
3700   // AMDIL DAG nodes
3701   NODE_NAME_CASE(UMUL);
3702   NODE_NAME_CASE(BRANCH_COND);
3703 
3704   // AMDGPU DAG nodes
3705   NODE_NAME_CASE(IF)
3706   NODE_NAME_CASE(ELSE)
3707   NODE_NAME_CASE(LOOP)
3708   NODE_NAME_CASE(CALL)
3709   NODE_NAME_CASE(TC_RETURN)
3710   NODE_NAME_CASE(TRAP)
3711   NODE_NAME_CASE(RET_FLAG)
3712   NODE_NAME_CASE(RETURN_TO_EPILOG)
3713   NODE_NAME_CASE(ENDPGM)
3714   NODE_NAME_CASE(DWORDADDR)
3715   NODE_NAME_CASE(FRACT)
3716   NODE_NAME_CASE(SETCC)
3717   NODE_NAME_CASE(SETREG)
3718   NODE_NAME_CASE(FMA_W_CHAIN)
3719   NODE_NAME_CASE(FMUL_W_CHAIN)
3720   NODE_NAME_CASE(CLAMP)
3721   NODE_NAME_CASE(COS_HW)
3722   NODE_NAME_CASE(SIN_HW)
3723   NODE_NAME_CASE(FMAX_LEGACY)
3724   NODE_NAME_CASE(FMIN_LEGACY)
3725   NODE_NAME_CASE(FMAX3)
3726   NODE_NAME_CASE(SMAX3)
3727   NODE_NAME_CASE(UMAX3)
3728   NODE_NAME_CASE(FMIN3)
3729   NODE_NAME_CASE(SMIN3)
3730   NODE_NAME_CASE(UMIN3)
3731   NODE_NAME_CASE(FMED3)
3732   NODE_NAME_CASE(SMED3)
3733   NODE_NAME_CASE(UMED3)
3734   NODE_NAME_CASE(URECIP)
3735   NODE_NAME_CASE(DIV_SCALE)
3736   NODE_NAME_CASE(DIV_FMAS)
3737   NODE_NAME_CASE(DIV_FIXUP)
3738   NODE_NAME_CASE(FMAD_FTZ)
3739   NODE_NAME_CASE(TRIG_PREOP)
3740   NODE_NAME_CASE(RCP)
3741   NODE_NAME_CASE(RSQ)
3742   NODE_NAME_CASE(RCP_LEGACY)
3743   NODE_NAME_CASE(RSQ_LEGACY)
3744   NODE_NAME_CASE(FMUL_LEGACY)
3745   NODE_NAME_CASE(RSQ_CLAMP)
3746   NODE_NAME_CASE(LDEXP)
3747   NODE_NAME_CASE(FP_CLASS)
3748   NODE_NAME_CASE(DOT4)
3749   NODE_NAME_CASE(CARRY)
3750   NODE_NAME_CASE(BORROW)
3751   NODE_NAME_CASE(BFE_U32)
3752   NODE_NAME_CASE(BFE_I32)
3753   NODE_NAME_CASE(BFI)
3754   NODE_NAME_CASE(BFM)
3755   NODE_NAME_CASE(FFBH_U32)
3756   NODE_NAME_CASE(FFBH_I32)
3757   NODE_NAME_CASE(MUL_U24)
3758   NODE_NAME_CASE(MUL_I24)
3759   NODE_NAME_CASE(MULHI_U24)
3760   NODE_NAME_CASE(MULHI_I24)
3761   NODE_NAME_CASE(MUL_LOHI_U24)
3762   NODE_NAME_CASE(MUL_LOHI_I24)
3763   NODE_NAME_CASE(MAD_U24)
3764   NODE_NAME_CASE(MAD_I24)
3765   NODE_NAME_CASE(TEXTURE_FETCH)
3766   NODE_NAME_CASE(EXPORT)
3767   NODE_NAME_CASE(EXPORT_DONE)
3768   NODE_NAME_CASE(R600_EXPORT)
3769   NODE_NAME_CASE(CONST_ADDRESS)
3770   NODE_NAME_CASE(REGISTER_LOAD)
3771   NODE_NAME_CASE(REGISTER_STORE)
3772   NODE_NAME_CASE(SAMPLE)
3773   NODE_NAME_CASE(SAMPLEB)
3774   NODE_NAME_CASE(SAMPLED)
3775   NODE_NAME_CASE(SAMPLEL)
3776   NODE_NAME_CASE(CVT_F32_UBYTE0)
3777   NODE_NAME_CASE(CVT_F32_UBYTE1)
3778   NODE_NAME_CASE(CVT_F32_UBYTE2)
3779   NODE_NAME_CASE(CVT_F32_UBYTE3)
3780   NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
3781   NODE_NAME_CASE(FP_TO_FP16)
3782   NODE_NAME_CASE(FP16_ZEXT)
3783   NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
3784   NODE_NAME_CASE(CONST_DATA_PTR)
3785   NODE_NAME_CASE(PC_ADD_REL_OFFSET)
3786   NODE_NAME_CASE(KILL)
3787   NODE_NAME_CASE(DUMMY_CHAIN)
3788   case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
3789   NODE_NAME_CASE(INIT_EXEC)
3790   NODE_NAME_CASE(INIT_EXEC_FROM_INPUT)
3791   NODE_NAME_CASE(SENDMSG)
3792   NODE_NAME_CASE(SENDMSGHALT)
3793   NODE_NAME_CASE(INTERP_MOV)
3794   NODE_NAME_CASE(INTERP_P1)
3795   NODE_NAME_CASE(INTERP_P2)
3796   NODE_NAME_CASE(STORE_MSKOR)
3797   NODE_NAME_CASE(LOAD_CONSTANT)
3798   NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
3799   NODE_NAME_CASE(TBUFFER_STORE_FORMAT_X3)
3800   NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
3801   NODE_NAME_CASE(ATOMIC_CMP_SWAP)
3802   NODE_NAME_CASE(ATOMIC_INC)
3803   NODE_NAME_CASE(ATOMIC_DEC)
3804   NODE_NAME_CASE(BUFFER_LOAD)
3805   NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
3806   case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
3807   }
3808   return nullptr;
3809 }
3810 
3811 SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
3812                                               SelectionDAG &DAG, int Enabled,
3813                                               int &RefinementSteps,
3814                                               bool &UseOneConstNR,
3815                                               bool Reciprocal) const {
3816   EVT VT = Operand.getValueType();
3817 
3818   if (VT == MVT::f32) {
3819     RefinementSteps = 0;
3820     return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
3821   }
3822 
3823   // TODO: There is also f64 rsq instruction, but the documentation is less
3824   // clear on its precision.
3825 
3826   return SDValue();
3827 }
3828 
3829 SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
3830                                                SelectionDAG &DAG, int Enabled,
3831                                                int &RefinementSteps) const {
3832   EVT VT = Operand.getValueType();
3833 
3834   if (VT == MVT::f32) {
3835     // Reciprocal, < 1 ulp error.
3836     //
3837     // This reciprocal approximation converges to < 0.5 ulp error with one
3838     // newton rhapson performed with two fused multiple adds (FMAs).
3839 
3840     RefinementSteps = 0;
3841     return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
3842   }
3843 
3844   // TODO: There is also f64 rcp instruction, but the documentation is less
3845   // clear on its precision.
3846 
3847   return SDValue();
3848 }
3849 
3850 void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
3851     const SDValue Op, KnownBits &Known,
3852     const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
3853 
3854   Known.resetAll(); // Don't know anything.
3855 
3856   unsigned Opc = Op.getOpcode();
3857 
3858   switch (Opc) {
3859   default:
3860     break;
3861   case AMDGPUISD::CARRY:
3862   case AMDGPUISD::BORROW: {
3863     Known.Zero = APInt::getHighBitsSet(32, 31);
3864     break;
3865   }
3866 
3867   case AMDGPUISD::BFE_I32:
3868   case AMDGPUISD::BFE_U32: {
3869     ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
3870     if (!CWidth)
3871       return;
3872 
3873     uint32_t Width = CWidth->getZExtValue() & 0x1f;
3874 
3875     if (Opc == AMDGPUISD::BFE_U32)
3876       Known.Zero = APInt::getHighBitsSet(32, 32 - Width);
3877 
3878     break;
3879   }
3880   case AMDGPUISD::FP_TO_FP16:
3881   case AMDGPUISD::FP16_ZEXT: {
3882     unsigned BitWidth = Known.getBitWidth();
3883 
3884     // High bits are zero.
3885     Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
3886     break;
3887   }
3888   case AMDGPUISD::MUL_U24:
3889   case AMDGPUISD::MUL_I24: {
3890     KnownBits LHSKnown, RHSKnown;
3891     DAG.computeKnownBits(Op.getOperand(0), LHSKnown, Depth + 1);
3892     DAG.computeKnownBits(Op.getOperand(1), RHSKnown, Depth + 1);
3893 
3894     unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
3895                       RHSKnown.countMinTrailingZeros();
3896     Known.Zero.setLowBits(std::min(TrailZ, 32u));
3897 
3898     unsigned LHSValBits = 32 - std::max(LHSKnown.countMinSignBits(), 8u);
3899     unsigned RHSValBits = 32 - std::max(RHSKnown.countMinSignBits(), 8u);
3900     unsigned MaxValBits = std::min(LHSValBits + RHSValBits, 32u);
3901     if (MaxValBits >= 32)
3902       break;
3903     bool Negative = false;
3904     if (Opc == AMDGPUISD::MUL_I24) {
3905       bool LHSNegative = !!(LHSKnown.One  & (1 << 23));
3906       bool LHSPositive = !!(LHSKnown.Zero & (1 << 23));
3907       bool RHSNegative = !!(RHSKnown.One  & (1 << 23));
3908       bool RHSPositive = !!(RHSKnown.Zero & (1 << 23));
3909       if ((!LHSNegative && !LHSPositive) || (!RHSNegative && !RHSPositive))
3910         break;
3911       Negative = (LHSNegative && RHSPositive) || (LHSPositive && RHSNegative);
3912     }
3913     if (Negative)
3914       Known.One.setHighBits(32 - MaxValBits);
3915     else
3916       Known.Zero.setHighBits(32 - MaxValBits);
3917     break;
3918   }
3919   }
3920 }
3921 
3922 unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
3923     SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
3924     unsigned Depth) const {
3925   switch (Op.getOpcode()) {
3926   case AMDGPUISD::BFE_I32: {
3927     ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
3928     if (!Width)
3929       return 1;
3930 
3931     unsigned SignBits = 32 - Width->getZExtValue() + 1;
3932     if (!isNullConstant(Op.getOperand(1)))
3933       return SignBits;
3934 
3935     // TODO: Could probably figure something out with non-0 offsets.
3936     unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
3937     return std::max(SignBits, Op0SignBits);
3938   }
3939 
3940   case AMDGPUISD::BFE_U32: {
3941     ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
3942     return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
3943   }
3944 
3945   case AMDGPUISD::CARRY:
3946   case AMDGPUISD::BORROW:
3947     return 31;
3948   case AMDGPUISD::FP_TO_FP16:
3949   case AMDGPUISD::FP16_ZEXT:
3950     return 16;
3951   default:
3952     return 1;
3953   }
3954 }
3955