1 //===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// \brief Custom DAG lowering for SI
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #ifdef _MSC_VER
16 // Provide M_PI.
17 #define _USE_MATH_DEFINES
18 #endif
19 
20 #include "SIISelLowering.h"
21 #include "AMDGPU.h"
22 #include "AMDGPUIntrinsicInfo.h"
23 #include "AMDGPUSubtarget.h"
24 #include "AMDGPUTargetMachine.h"
25 #include "SIDefines.h"
26 #include "SIInstrInfo.h"
27 #include "SIMachineFunctionInfo.h"
28 #include "SIRegisterInfo.h"
29 #include "Utils/AMDGPUBaseInfo.h"
30 #include "llvm/ADT/APFloat.h"
31 #include "llvm/ADT/APInt.h"
32 #include "llvm/ADT/ArrayRef.h"
33 #include "llvm/ADT/BitVector.h"
34 #include "llvm/ADT/SmallVector.h"
35 #include "llvm/ADT/Statistic.h"
36 #include "llvm/ADT/StringRef.h"
37 #include "llvm/ADT/StringSwitch.h"
38 #include "llvm/ADT/Twine.h"
39 #include "llvm/CodeGen/Analysis.h"
40 #include "llvm/CodeGen/CallingConvLower.h"
41 #include "llvm/CodeGen/DAGCombine.h"
42 #include "llvm/CodeGen/ISDOpcodes.h"
43 #include "llvm/CodeGen/MachineBasicBlock.h"
44 #include "llvm/CodeGen/MachineFrameInfo.h"
45 #include "llvm/CodeGen/MachineFunction.h"
46 #include "llvm/CodeGen/MachineInstr.h"
47 #include "llvm/CodeGen/MachineInstrBuilder.h"
48 #include "llvm/CodeGen/MachineMemOperand.h"
49 #include "llvm/CodeGen/MachineModuleInfo.h"
50 #include "llvm/CodeGen/MachineOperand.h"
51 #include "llvm/CodeGen/MachineRegisterInfo.h"
52 #include "llvm/CodeGen/MachineValueType.h"
53 #include "llvm/CodeGen/SelectionDAG.h"
54 #include "llvm/CodeGen/SelectionDAGNodes.h"
55 #include "llvm/CodeGen/ValueTypes.h"
56 #include "llvm/IR/Constants.h"
57 #include "llvm/IR/DataLayout.h"
58 #include "llvm/IR/DebugLoc.h"
59 #include "llvm/IR/DerivedTypes.h"
60 #include "llvm/IR/DiagnosticInfo.h"
61 #include "llvm/IR/Function.h"
62 #include "llvm/IR/GlobalValue.h"
63 #include "llvm/IR/InstrTypes.h"
64 #include "llvm/IR/Instruction.h"
65 #include "llvm/IR/Instructions.h"
66 #include "llvm/IR/IntrinsicInst.h"
67 #include "llvm/IR/Type.h"
68 #include "llvm/Support/Casting.h"
69 #include "llvm/Support/CodeGen.h"
70 #include "llvm/Support/CommandLine.h"
71 #include "llvm/Support/Compiler.h"
72 #include "llvm/Support/ErrorHandling.h"
73 #include "llvm/Support/KnownBits.h"
74 #include "llvm/Support/MathExtras.h"
75 #include "llvm/Target/TargetCallingConv.h"
76 #include "llvm/Target/TargetOptions.h"
77 #include "llvm/Target/TargetRegisterInfo.h"
78 #include <cassert>
79 #include <cmath>
80 #include <cstdint>
81 #include <iterator>
82 #include <tuple>
83 #include <utility>
84 #include <vector>
85 
86 using namespace llvm;
87 
88 #define DEBUG_TYPE "si-lower"
89 
90 STATISTIC(NumTailCalls, "Number of tail calls");
91 
92 static cl::opt<bool> EnableVGPRIndexMode(
93   "amdgpu-vgpr-index-mode",
94   cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
95   cl::init(false));
96 
97 static unsigned findFirstFreeSGPR(CCState &CCInfo) {
98   unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
99   for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
100     if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
101       return AMDGPU::SGPR0 + Reg;
102     }
103   }
104   llvm_unreachable("Cannot allocate sgpr");
105 }
106 
107 SITargetLowering::SITargetLowering(const TargetMachine &TM,
108                                    const SISubtarget &STI)
109     : AMDGPUTargetLowering(TM, STI) {
110   addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
111   addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
112 
113   addRegisterClass(MVT::i32, &AMDGPU::SReg_32_XM0RegClass);
114   addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
115 
116   addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
117   addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
118   addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);
119 
120   addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass);
121   addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass);
122 
123   addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
124   addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
125 
126   addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass);
127   addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);
128 
129   addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass);
130   addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);
131 
132   if (Subtarget->has16BitInsts()) {
133     addRegisterClass(MVT::i16, &AMDGPU::SReg_32_XM0RegClass);
134     addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass);
135   }
136 
137   if (Subtarget->hasVOP3PInsts()) {
138     addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32_XM0RegClass);
139     addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32_XM0RegClass);
140   }
141 
142   computeRegisterProperties(STI.getRegisterInfo());
143 
144   // We need to custom lower vector stores from local memory
145   setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
146   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
147   setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
148   setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
149   setOperationAction(ISD::LOAD, MVT::i1, Custom);
150 
151   setOperationAction(ISD::STORE, MVT::v2i32, Custom);
152   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
153   setOperationAction(ISD::STORE, MVT::v8i32, Custom);
154   setOperationAction(ISD::STORE, MVT::v16i32, Custom);
155   setOperationAction(ISD::STORE, MVT::i1, Custom);
156 
157   setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
158   setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
159   setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
160   setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
161   setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
162   setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
163   setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
164   setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
165   setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
166   setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
167 
168   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
169   setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
170   setOperationAction(ISD::ConstantPool, MVT::v2i64, Expand);
171 
172   setOperationAction(ISD::SELECT, MVT::i1, Promote);
173   setOperationAction(ISD::SELECT, MVT::i64, Custom);
174   setOperationAction(ISD::SELECT, MVT::f64, Promote);
175   AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
176 
177   setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
178   setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
179   setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
180   setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
181   setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
182 
183   setOperationAction(ISD::SETCC, MVT::i1, Promote);
184   setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
185   setOperationAction(ISD::SETCC, MVT::v4i1, Expand);
186   AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
187 
188   setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand);
189   setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
190 
191   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
192   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);
193   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
194   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom);
195   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
196   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);
197   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);
198 
199   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
200   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
201   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
202   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom);
203 
204   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
205 
206   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
207   setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom);
208   setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom);
209 
210   setOperationAction(ISD::BRCOND, MVT::Other, Custom);
211   setOperationAction(ISD::BR_CC, MVT::i1, Expand);
212   setOperationAction(ISD::BR_CC, MVT::i32, Expand);
213   setOperationAction(ISD::BR_CC, MVT::i64, Expand);
214   setOperationAction(ISD::BR_CC, MVT::f32, Expand);
215   setOperationAction(ISD::BR_CC, MVT::f64, Expand);
216 
217   setOperationAction(ISD::UADDO, MVT::i32, Legal);
218   setOperationAction(ISD::USUBO, MVT::i32, Legal);
219 
220   setOperationAction(ISD::ADDCARRY, MVT::i32, Legal);
221   setOperationAction(ISD::SUBCARRY, MVT::i32, Legal);
222 
223   // We only support LOAD/STORE and vector manipulation ops for vectors
224   // with > 4 elements.
225   for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
226         MVT::v2i64, MVT::v2f64}) {
227     for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
228       switch (Op) {
229       case ISD::LOAD:
230       case ISD::STORE:
231       case ISD::BUILD_VECTOR:
232       case ISD::BITCAST:
233       case ISD::EXTRACT_VECTOR_ELT:
234       case ISD::INSERT_VECTOR_ELT:
235       case ISD::INSERT_SUBVECTOR:
236       case ISD::EXTRACT_SUBVECTOR:
237       case ISD::SCALAR_TO_VECTOR:
238         break;
239       case ISD::CONCAT_VECTORS:
240         setOperationAction(Op, VT, Custom);
241         break;
242       default:
243         setOperationAction(Op, VT, Expand);
244         break;
245       }
246     }
247   }
248 
249   // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
250   // is expanded to avoid having two separate loops in case the index is a VGPR.
251 
252   // Most operations are naturally 32-bit vector operations. We only support
253   // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
254   for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
255     setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
256     AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
257 
258     setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
259     AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
260 
261     setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
262     AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
263 
264     setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
265     AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
266   }
267 
268   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
269   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
270   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
271   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);
272 
273   // Avoid stack access for these.
274   // TODO: Generalize to more vector types.
275   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Custom);
276   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Custom);
277   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
278   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
279 
280   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
281   // and output demarshalling
282   setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
283   setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
284 
285   // We can't return success/failure, only the old value,
286   // let LLVM add the comparison
287   setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand);
288   setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand);
289 
290   if (getSubtarget()->hasFlatAddressSpace()) {
291     setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
292     setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
293   }
294 
295   setOperationAction(ISD::BSWAP, MVT::i32, Legal);
296   setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
297 
298   // On SI this is s_memtime and s_memrealtime on VI.
299   setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
300   setOperationAction(ISD::TRAP, MVT::Other, Custom);
301   setOperationAction(ISD::DEBUGTRAP, MVT::Other, Custom);
302 
303   setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
304   setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
305 
306   if (Subtarget->getGeneration() >= SISubtarget::SEA_ISLANDS) {
307     setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
308     setOperationAction(ISD::FCEIL, MVT::f64, Legal);
309     setOperationAction(ISD::FRINT, MVT::f64, Legal);
310   }
311 
312   setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
313 
314   setOperationAction(ISD::FSIN, MVT::f32, Custom);
315   setOperationAction(ISD::FCOS, MVT::f32, Custom);
316   setOperationAction(ISD::FDIV, MVT::f32, Custom);
317   setOperationAction(ISD::FDIV, MVT::f64, Custom);
318 
319   if (Subtarget->has16BitInsts()) {
320     setOperationAction(ISD::Constant, MVT::i16, Legal);
321 
322     setOperationAction(ISD::SMIN, MVT::i16, Legal);
323     setOperationAction(ISD::SMAX, MVT::i16, Legal);
324 
325     setOperationAction(ISD::UMIN, MVT::i16, Legal);
326     setOperationAction(ISD::UMAX, MVT::i16, Legal);
327 
328     setOperationAction(ISD::SIGN_EXTEND, MVT::i16, Promote);
329     AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
330 
331     setOperationAction(ISD::ROTR, MVT::i16, Promote);
332     setOperationAction(ISD::ROTL, MVT::i16, Promote);
333 
334     setOperationAction(ISD::SDIV, MVT::i16, Promote);
335     setOperationAction(ISD::UDIV, MVT::i16, Promote);
336     setOperationAction(ISD::SREM, MVT::i16, Promote);
337     setOperationAction(ISD::UREM, MVT::i16, Promote);
338 
339     setOperationAction(ISD::BSWAP, MVT::i16, Promote);
340     setOperationAction(ISD::BITREVERSE, MVT::i16, Promote);
341 
342     setOperationAction(ISD::CTTZ, MVT::i16, Promote);
343     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Promote);
344     setOperationAction(ISD::CTLZ, MVT::i16, Promote);
345     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Promote);
346 
347     setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);
348 
349     setOperationAction(ISD::BR_CC, MVT::i16, Expand);
350 
351     setOperationAction(ISD::LOAD, MVT::i16, Custom);
352 
353     setTruncStoreAction(MVT::i64, MVT::i16, Expand);
354 
355     setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
356     AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
357     setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
358     AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
359 
360     setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
361     setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
362     setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
363     setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
364 
365     // F16 - Constant Actions.
366     setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
367 
368     // F16 - Load/Store Actions.
369     setOperationAction(ISD::LOAD, MVT::f16, Promote);
370     AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
371     setOperationAction(ISD::STORE, MVT::f16, Promote);
372     AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
373 
374     // F16 - VOP1 Actions.
375     setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
376     setOperationAction(ISD::FCOS, MVT::f16, Promote);
377     setOperationAction(ISD::FSIN, MVT::f16, Promote);
378     setOperationAction(ISD::FP_TO_SINT, MVT::f16, Promote);
379     setOperationAction(ISD::FP_TO_UINT, MVT::f16, Promote);
380     setOperationAction(ISD::SINT_TO_FP, MVT::f16, Promote);
381     setOperationAction(ISD::UINT_TO_FP, MVT::f16, Promote);
382     setOperationAction(ISD::FROUND, MVT::f16, Custom);
383 
384     // F16 - VOP2 Actions.
385     setOperationAction(ISD::BR_CC, MVT::f16, Expand);
386     setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
387     setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
388     setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
389     setOperationAction(ISD::FDIV, MVT::f16, Custom);
390 
391     // F16 - VOP3 Actions.
392     setOperationAction(ISD::FMA, MVT::f16, Legal);
393     if (!Subtarget->hasFP16Denormals())
394       setOperationAction(ISD::FMAD, MVT::f16, Legal);
395   }
396 
397   if (Subtarget->hasVOP3PInsts()) {
398     for (MVT VT : {MVT::v2i16, MVT::v2f16}) {
399       for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
400         switch (Op) {
401         case ISD::LOAD:
402         case ISD::STORE:
403         case ISD::BUILD_VECTOR:
404         case ISD::BITCAST:
405         case ISD::EXTRACT_VECTOR_ELT:
406         case ISD::INSERT_VECTOR_ELT:
407         case ISD::INSERT_SUBVECTOR:
408         case ISD::EXTRACT_SUBVECTOR:
409         case ISD::SCALAR_TO_VECTOR:
410           break;
411         case ISD::CONCAT_VECTORS:
412           setOperationAction(Op, VT, Custom);
413           break;
414         default:
415           setOperationAction(Op, VT, Expand);
416           break;
417         }
418       }
419     }
420 
421     // XXX - Do these do anything? Vector constants turn into build_vector.
422     setOperationAction(ISD::Constant, MVT::v2i16, Legal);
423     setOperationAction(ISD::ConstantFP, MVT::v2f16, Legal);
424 
425     setOperationAction(ISD::STORE, MVT::v2i16, Promote);
426     AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
427     setOperationAction(ISD::STORE, MVT::v2f16, Promote);
428     AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
429 
430     setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
431     AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
432     setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
433     AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
434 
435     setOperationAction(ISD::AND, MVT::v2i16, Promote);
436     AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
437     setOperationAction(ISD::OR, MVT::v2i16, Promote);
438     AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
439     setOperationAction(ISD::XOR, MVT::v2i16, Promote);
440     AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
441     setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
442     AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
443     setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
444     AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
445 
446     setOperationAction(ISD::ADD, MVT::v2i16, Legal);
447     setOperationAction(ISD::SUB, MVT::v2i16, Legal);
448     setOperationAction(ISD::MUL, MVT::v2i16, Legal);
449     setOperationAction(ISD::SHL, MVT::v2i16, Legal);
450     setOperationAction(ISD::SRL, MVT::v2i16, Legal);
451     setOperationAction(ISD::SRA, MVT::v2i16, Legal);
452     setOperationAction(ISD::SMIN, MVT::v2i16, Legal);
453     setOperationAction(ISD::UMIN, MVT::v2i16, Legal);
454     setOperationAction(ISD::SMAX, MVT::v2i16, Legal);
455     setOperationAction(ISD::UMAX, MVT::v2i16, Legal);
456 
457     setOperationAction(ISD::FADD, MVT::v2f16, Legal);
458     setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
459     setOperationAction(ISD::FMUL, MVT::v2f16, Legal);
460     setOperationAction(ISD::FMA, MVT::v2f16, Legal);
461     setOperationAction(ISD::FMINNUM, MVT::v2f16, Legal);
462     setOperationAction(ISD::FMAXNUM, MVT::v2f16, Legal);
463 
464     // This isn't really legal, but this avoids the legalizer unrolling it (and
465     // allows matching fneg (fabs x) patterns)
466     setOperationAction(ISD::FABS, MVT::v2f16, Legal);
467 
468     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
469     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
470 
471     setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand);
472     setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand);
473     setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand);
474     setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
475   } else {
476     setOperationAction(ISD::SELECT, MVT::v2i16, Custom);
477     setOperationAction(ISD::SELECT, MVT::v2f16, Custom);
478   }
479 
480   for (MVT VT : { MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8 }) {
481     setOperationAction(ISD::SELECT, VT, Custom);
482   }
483 
484   setTargetDAGCombine(ISD::ADD);
485   setTargetDAGCombine(ISD::ADDCARRY);
486   setTargetDAGCombine(ISD::SUB);
487   setTargetDAGCombine(ISD::SUBCARRY);
488   setTargetDAGCombine(ISD::FADD);
489   setTargetDAGCombine(ISD::FSUB);
490   setTargetDAGCombine(ISD::FMINNUM);
491   setTargetDAGCombine(ISD::FMAXNUM);
492   setTargetDAGCombine(ISD::SMIN);
493   setTargetDAGCombine(ISD::SMAX);
494   setTargetDAGCombine(ISD::UMIN);
495   setTargetDAGCombine(ISD::UMAX);
496   setTargetDAGCombine(ISD::SETCC);
497   setTargetDAGCombine(ISD::AND);
498   setTargetDAGCombine(ISD::OR);
499   setTargetDAGCombine(ISD::XOR);
500   setTargetDAGCombine(ISD::SINT_TO_FP);
501   setTargetDAGCombine(ISD::UINT_TO_FP);
502   setTargetDAGCombine(ISD::FCANONICALIZE);
503   setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
504   setTargetDAGCombine(ISD::ZERO_EXTEND);
505   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
506   setTargetDAGCombine(ISD::BUILD_VECTOR);
507 
508   // All memory operations. Some folding on the pointer operand is done to help
509   // matching the constant offsets in the addressing modes.
510   setTargetDAGCombine(ISD::LOAD);
511   setTargetDAGCombine(ISD::STORE);
512   setTargetDAGCombine(ISD::ATOMIC_LOAD);
513   setTargetDAGCombine(ISD::ATOMIC_STORE);
514   setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP);
515   setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS);
516   setTargetDAGCombine(ISD::ATOMIC_SWAP);
517   setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD);
518   setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB);
519   setTargetDAGCombine(ISD::ATOMIC_LOAD_AND);
520   setTargetDAGCombine(ISD::ATOMIC_LOAD_OR);
521   setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR);
522   setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND);
523   setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN);
524   setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX);
525   setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN);
526   setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX);
527 
528   setSchedulingPreference(Sched::RegPressure);
529 }
530 
531 const SISubtarget *SITargetLowering::getSubtarget() const {
532   return static_cast<const SISubtarget *>(Subtarget);
533 }
534 
535 //===----------------------------------------------------------------------===//
536 // TargetLowering queries
537 //===----------------------------------------------------------------------===//
538 
539 bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
540   // SI has some legal vector types, but no legal vector operations. Say no
541   // shuffles are legal in order to prefer scalarizing some vector operations.
542   return false;
543 }
544 
545 bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
546                                           const CallInst &CI,
547                                           unsigned IntrID) const {
548   switch (IntrID) {
549   case Intrinsic::amdgcn_atomic_inc:
550   case Intrinsic::amdgcn_atomic_dec: {
551     Info.opc = ISD::INTRINSIC_W_CHAIN;
552     Info.memVT = MVT::getVT(CI.getType());
553     Info.ptrVal = CI.getOperand(0);
554     Info.align = 0;
555 
556     const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4));
557     Info.vol = !Vol || !Vol->isZero();
558     Info.readMem = true;
559     Info.writeMem = true;
560     return true;
561   }
562   default:
563     return false;
564   }
565 }
566 
567 bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
568                                             SmallVectorImpl<Value*> &Ops,
569                                             Type *&AccessTy) const {
570   switch (II->getIntrinsicID()) {
571   case Intrinsic::amdgcn_atomic_inc:
572   case Intrinsic::amdgcn_atomic_dec: {
573     Value *Ptr = II->getArgOperand(0);
574     AccessTy = II->getType();
575     Ops.push_back(Ptr);
576     return true;
577   }
578   default:
579     return false;
580   }
581 }
582 
583 bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
584   if (!Subtarget->hasFlatInstOffsets()) {
585     // Flat instructions do not have offsets, and only have the register
586     // address.
587     return AM.BaseOffs == 0 && AM.Scale == 0;
588   }
589 
590   // GFX9 added a 13-bit signed offset. When using regular flat instructions,
591   // the sign bit is ignored and is treated as a 12-bit unsigned offset.
592 
593   // Just r + i
594   return isUInt<12>(AM.BaseOffs) && AM.Scale == 0;
595 }
596 
597 bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
598   if (Subtarget->hasFlatGlobalInsts())
599     return isInt<13>(AM.BaseOffs) && AM.Scale == 0;
600 
601   if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
602       // Assume the we will use FLAT for all global memory accesses
603       // on VI.
604       // FIXME: This assumption is currently wrong.  On VI we still use
605       // MUBUF instructions for the r + i addressing mode.  As currently
606       // implemented, the MUBUF instructions only work on buffer < 4GB.
607       // It may be possible to support > 4GB buffers with MUBUF instructions,
608       // by setting the stride value in the resource descriptor which would
609       // increase the size limit to (stride * 4GB).  However, this is risky,
610       // because it has never been validated.
611     return isLegalFlatAddressingMode(AM);
612   }
613 
614   return isLegalMUBUFAddressingMode(AM);
615 }
616 
617 bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
618   // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
619   // additionally can do r + r + i with addr64. 32-bit has more addressing
620   // mode options. Depending on the resource constant, it can also do
621   // (i64 r0) + (i32 r1) * (i14 i).
622   //
623   // Private arrays end up using a scratch buffer most of the time, so also
624   // assume those use MUBUF instructions. Scratch loads / stores are currently
625   // implemented as mubuf instructions with offen bit set, so slightly
626   // different than the normal addr64.
627   if (!isUInt<12>(AM.BaseOffs))
628     return false;
629 
630   // FIXME: Since we can split immediate into soffset and immediate offset,
631   // would it make sense to allow any immediate?
632 
633   switch (AM.Scale) {
634   case 0: // r + i or just i, depending on HasBaseReg.
635     return true;
636   case 1:
637     return true; // We have r + r or r + i.
638   case 2:
639     if (AM.HasBaseReg) {
640       // Reject 2 * r + r.
641       return false;
642     }
643 
644     // Allow 2 * r as r + r
645     // Or  2 * r + i is allowed as r + r + i.
646     return true;
647   default: // Don't allow n * r
648     return false;
649   }
650 }
651 
652 bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
653                                              const AddrMode &AM, Type *Ty,
654                                              unsigned AS, Instruction *I) const {
655   // No global is ever allowed as a base.
656   if (AM.BaseGV)
657     return false;
658 
659   if (AS == AMDGPUASI.GLOBAL_ADDRESS)
660     return isLegalGlobalAddressingMode(AM);
661 
662   if (AS == AMDGPUASI.CONSTANT_ADDRESS) {
663     // If the offset isn't a multiple of 4, it probably isn't going to be
664     // correctly aligned.
665     // FIXME: Can we get the real alignment here?
666     if (AM.BaseOffs % 4 != 0)
667       return isLegalMUBUFAddressingMode(AM);
668 
669     // There are no SMRD extloads, so if we have to do a small type access we
670     // will use a MUBUF load.
671     // FIXME?: We also need to do this if unaligned, but we don't know the
672     // alignment here.
673     if (DL.getTypeStoreSize(Ty) < 4)
674       return isLegalGlobalAddressingMode(AM);
675 
676     if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) {
677       // SMRD instructions have an 8-bit, dword offset on SI.
678       if (!isUInt<8>(AM.BaseOffs / 4))
679         return false;
680     } else if (Subtarget->getGeneration() == SISubtarget::SEA_ISLANDS) {
681       // On CI+, this can also be a 32-bit literal constant offset. If it fits
682       // in 8-bits, it can use a smaller encoding.
683       if (!isUInt<32>(AM.BaseOffs / 4))
684         return false;
685     } else if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
686       // On VI, these use the SMEM format and the offset is 20-bit in bytes.
687       if (!isUInt<20>(AM.BaseOffs))
688         return false;
689     } else
690       llvm_unreachable("unhandled generation");
691 
692     if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
693       return true;
694 
695     if (AM.Scale == 1 && AM.HasBaseReg)
696       return true;
697 
698     return false;
699 
700   } else if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
701     return isLegalMUBUFAddressingMode(AM);
702   } else if (AS == AMDGPUASI.LOCAL_ADDRESS ||
703              AS == AMDGPUASI.REGION_ADDRESS) {
704     // Basic, single offset DS instructions allow a 16-bit unsigned immediate
705     // field.
706     // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
707     // an 8-bit dword offset but we don't know the alignment here.
708     if (!isUInt<16>(AM.BaseOffs))
709       return false;
710 
711     if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
712       return true;
713 
714     if (AM.Scale == 1 && AM.HasBaseReg)
715       return true;
716 
717     return false;
718   } else if (AS == AMDGPUASI.FLAT_ADDRESS ||
719              AS == AMDGPUASI.UNKNOWN_ADDRESS_SPACE) {
720     // For an unknown address space, this usually means that this is for some
721     // reason being used for pure arithmetic, and not based on some addressing
722     // computation. We don't have instructions that compute pointers with any
723     // addressing modes, so treat them as having no offset like flat
724     // instructions.
725     return isLegalFlatAddressingMode(AM);
726   } else {
727     llvm_unreachable("unhandled address space");
728   }
729 }
730 
731 bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
732                                         const SelectionDAG &DAG) const {
733   if (AS == AMDGPUASI.GLOBAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS) {
734     return (MemVT.getSizeInBits() <= 4 * 32);
735   } else if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
736     unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
737     return (MemVT.getSizeInBits() <= MaxPrivateBits);
738   } else if (AS == AMDGPUASI.LOCAL_ADDRESS) {
739     return (MemVT.getSizeInBits() <= 2 * 32);
740   }
741   return true;
742 }
743 
744 bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
745                                                       unsigned AddrSpace,
746                                                       unsigned Align,
747                                                       bool *IsFast) const {
748   if (IsFast)
749     *IsFast = false;
750 
751   // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
752   // which isn't a simple VT.
753   // Until MVT is extended to handle this, simply check for the size and
754   // rely on the condition below: allow accesses if the size is a multiple of 4.
755   if (VT == MVT::Other || (VT != MVT::Other && VT.getSizeInBits() > 1024 &&
756                            VT.getStoreSize() > 16)) {
757     return false;
758   }
759 
760   if (AddrSpace == AMDGPUASI.LOCAL_ADDRESS ||
761       AddrSpace == AMDGPUASI.REGION_ADDRESS) {
762     // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
763     // aligned, 8 byte access in a single operation using ds_read2/write2_b32
764     // with adjacent offsets.
765     bool AlignedBy4 = (Align % 4 == 0);
766     if (IsFast)
767       *IsFast = AlignedBy4;
768 
769     return AlignedBy4;
770   }
771 
772   // FIXME: We have to be conservative here and assume that flat operations
773   // will access scratch.  If we had access to the IR function, then we
774   // could determine if any private memory was used in the function.
775   if (!Subtarget->hasUnalignedScratchAccess() &&
776       (AddrSpace == AMDGPUASI.PRIVATE_ADDRESS ||
777        AddrSpace == AMDGPUASI.FLAT_ADDRESS)) {
778     return false;
779   }
780 
781   if (Subtarget->hasUnalignedBufferAccess()) {
782     // If we have an uniform constant load, it still requires using a slow
783     // buffer instruction if unaligned.
784     if (IsFast) {
785       *IsFast = (AddrSpace == AMDGPUASI.CONSTANT_ADDRESS) ?
786         (Align % 4 == 0) : true;
787     }
788 
789     return true;
790   }
791 
792   // Smaller than dword value must be aligned.
793   if (VT.bitsLT(MVT::i32))
794     return false;
795 
796   // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
797   // byte-address are ignored, thus forcing Dword alignment.
798   // This applies to private, global, and constant memory.
799   if (IsFast)
800     *IsFast = true;
801 
802   return VT.bitsGT(MVT::i32) && Align % 4 == 0;
803 }
804 
805 EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
806                                           unsigned SrcAlign, bool IsMemset,
807                                           bool ZeroMemset,
808                                           bool MemcpyStrSrc,
809                                           MachineFunction &MF) const {
810   // FIXME: Should account for address space here.
811 
812   // The default fallback uses the private pointer size as a guess for a type to
813   // use. Make sure we switch these to 64-bit accesses.
814 
815   if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global
816     return MVT::v4i32;
817 
818   if (Size >= 8 && DstAlign >= 4)
819     return MVT::v2i32;
820 
821   // Use the default.
822   return MVT::Other;
823 }
824 
825 static bool isFlatGlobalAddrSpace(unsigned AS, AMDGPUAS AMDGPUASI) {
826   return AS == AMDGPUASI.GLOBAL_ADDRESS ||
827          AS == AMDGPUASI.FLAT_ADDRESS ||
828          AS == AMDGPUASI.CONSTANT_ADDRESS;
829 }
830 
831 bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
832                                            unsigned DestAS) const {
833   return isFlatGlobalAddrSpace(SrcAS, AMDGPUASI) &&
834          isFlatGlobalAddrSpace(DestAS, AMDGPUASI);
835 }
836 
837 bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
838   const MemSDNode *MemNode = cast<MemSDNode>(N);
839   const Value *Ptr = MemNode->getMemOperand()->getValue();
840   const Instruction *I = dyn_cast<Instruction>(Ptr);
841   return I && I->getMetadata("amdgpu.noclobber");
842 }
843 
844 bool SITargetLowering::isCheapAddrSpaceCast(unsigned SrcAS,
845                                             unsigned DestAS) const {
846   // Flat -> private/local is a simple truncate.
847   // Flat -> global is no-op
848   if (SrcAS == AMDGPUASI.FLAT_ADDRESS)
849     return true;
850 
851   return isNoopAddrSpaceCast(SrcAS, DestAS);
852 }
853 
854 bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
855   const MemSDNode *MemNode = cast<MemSDNode>(N);
856 
857   return AMDGPU::isUniformMMO(MemNode->getMemOperand());
858 }
859 
860 TargetLoweringBase::LegalizeTypeAction
861 SITargetLowering::getPreferredVectorAction(EVT VT) const {
862   if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16))
863     return TypeSplitVector;
864 
865   return TargetLoweringBase::getPreferredVectorAction(VT);
866 }
867 
868 bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
869                                                          Type *Ty) const {
870   // FIXME: Could be smarter if called for vector constants.
871   return true;
872 }
873 
874 bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
875   if (Subtarget->has16BitInsts() && VT == MVT::i16) {
876     switch (Op) {
877     case ISD::LOAD:
878     case ISD::STORE:
879 
880     // These operations are done with 32-bit instructions anyway.
881     case ISD::AND:
882     case ISD::OR:
883     case ISD::XOR:
884     case ISD::SELECT:
885       // TODO: Extensions?
886       return true;
887     default:
888       return false;
889     }
890   }
891 
892   // SimplifySetCC uses this function to determine whether or not it should
893   // create setcc with i1 operands.  We don't have instructions for i1 setcc.
894   if (VT == MVT::i1 && Op == ISD::SETCC)
895     return false;
896 
897   return TargetLowering::isTypeDesirableForOp(Op, VT);
898 }
899 
900 SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
901                                                    const SDLoc &SL,
902                                                    SDValue Chain,
903                                                    uint64_t Offset) const {
904   const DataLayout &DL = DAG.getDataLayout();
905   MachineFunction &MF = DAG.getMachineFunction();
906   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
907 
908   const ArgDescriptor *InputPtrReg;
909   const TargetRegisterClass *RC;
910 
911   std::tie(InputPtrReg, RC)
912     = Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
913 
914   MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
915   MVT PtrVT = getPointerTy(DL, AMDGPUASI.CONSTANT_ADDRESS);
916   SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
917     MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
918 
919   return DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
920                      DAG.getConstant(Offset, SL, PtrVT));
921 }
922 
923 SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
924                                             const SDLoc &SL) const {
925   auto MFI = DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
926   uint64_t Offset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT);
927   return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
928 }
929 
930 SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
931                                          const SDLoc &SL, SDValue Val,
932                                          bool Signed,
933                                          const ISD::InputArg *Arg) const {
934   if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
935       VT.bitsLT(MemVT)) {
936     unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
937     Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
938   }
939 
940   if (MemVT.isFloatingPoint())
941     Val = getFPExtOrFPTrunc(DAG, Val, SL, VT);
942   else if (Signed)
943     Val = DAG.getSExtOrTrunc(Val, SL, VT);
944   else
945     Val = DAG.getZExtOrTrunc(Val, SL, VT);
946 
947   return Val;
948 }
949 
950 SDValue SITargetLowering::lowerKernargMemParameter(
951   SelectionDAG &DAG, EVT VT, EVT MemVT,
952   const SDLoc &SL, SDValue Chain,
953   uint64_t Offset, bool Signed,
954   const ISD::InputArg *Arg) const {
955   const DataLayout &DL = DAG.getDataLayout();
956   Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
957   PointerType *PtrTy = PointerType::get(Ty, AMDGPUASI.CONSTANT_ADDRESS);
958   MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
959 
960   unsigned Align = DL.getABITypeAlignment(Ty);
961 
962   SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
963   SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align,
964                              MachineMemOperand::MONonTemporal |
965                              MachineMemOperand::MODereferenceable |
966                              MachineMemOperand::MOInvariant);
967 
968   SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
969   return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
970 }
971 
972 SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
973                                               const SDLoc &SL, SDValue Chain,
974                                               const ISD::InputArg &Arg) const {
975   MachineFunction &MF = DAG.getMachineFunction();
976   MachineFrameInfo &MFI = MF.getFrameInfo();
977 
978   if (Arg.Flags.isByVal()) {
979     unsigned Size = Arg.Flags.getByValSize();
980     int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
981     return DAG.getFrameIndex(FrameIdx, MVT::i32);
982   }
983 
984   unsigned ArgOffset = VA.getLocMemOffset();
985   unsigned ArgSize = VA.getValVT().getStoreSize();
986 
987   int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
988 
989   // Create load nodes to retrieve arguments from the stack.
990   SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
991   SDValue ArgValue;
992 
993   // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
994   ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
995   MVT MemVT = VA.getValVT();
996 
997   switch (VA.getLocInfo()) {
998   default:
999     break;
1000   case CCValAssign::BCvt:
1001     MemVT = VA.getLocVT();
1002     break;
1003   case CCValAssign::SExt:
1004     ExtType = ISD::SEXTLOAD;
1005     break;
1006   case CCValAssign::ZExt:
1007     ExtType = ISD::ZEXTLOAD;
1008     break;
1009   case CCValAssign::AExt:
1010     ExtType = ISD::EXTLOAD;
1011     break;
1012   }
1013 
1014   ArgValue = DAG.getExtLoad(
1015     ExtType, SL, VA.getLocVT(), Chain, FIN,
1016     MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
1017     MemVT);
1018   return ArgValue;
1019 }
1020 
1021 SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
1022   const SIMachineFunctionInfo &MFI,
1023   EVT VT,
1024   AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
1025   const ArgDescriptor *Reg;
1026   const TargetRegisterClass *RC;
1027 
1028   std::tie(Reg, RC) = MFI.getPreloadedValue(PVID);
1029   return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT);
1030 }
1031 
1032 static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
1033                                    CallingConv::ID CallConv,
1034                                    ArrayRef<ISD::InputArg> Ins,
1035                                    BitVector &Skipped,
1036                                    FunctionType *FType,
1037                                    SIMachineFunctionInfo *Info) {
1038   for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
1039     const ISD::InputArg &Arg = Ins[I];
1040 
1041     // First check if it's a PS input addr.
1042     if (CallConv == CallingConv::AMDGPU_PS && !Arg.Flags.isInReg() &&
1043         !Arg.Flags.isByVal() && PSInputNum <= 15) {
1044 
1045       if (!Arg.Used && !Info->isPSInputAllocated(PSInputNum)) {
1046         // We can safely skip PS inputs.
1047         Skipped.set(I);
1048         ++PSInputNum;
1049         continue;
1050       }
1051 
1052       Info->markPSInputAllocated(PSInputNum);
1053       if (Arg.Used)
1054         Info->markPSInputEnabled(PSInputNum);
1055 
1056       ++PSInputNum;
1057     }
1058 
1059     // Second split vertices into their elements.
1060     if (Arg.VT.isVector()) {
1061       ISD::InputArg NewArg = Arg;
1062       NewArg.Flags.setSplit();
1063       NewArg.VT = Arg.VT.getVectorElementType();
1064 
1065       // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
1066       // three or five element vertex only needs three or five registers,
1067       // NOT four or eight.
1068       Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
1069       unsigned NumElements = ParamType->getVectorNumElements();
1070 
1071       for (unsigned J = 0; J != NumElements; ++J) {
1072         Splits.push_back(NewArg);
1073         NewArg.PartOffset += NewArg.VT.getStoreSize();
1074       }
1075     } else {
1076       Splits.push_back(Arg);
1077     }
1078   }
1079 }
1080 
1081 // Allocate special inputs passed in VGPRs.
1082 static void allocateSpecialEntryInputVGPRs(CCState &CCInfo,
1083                                            MachineFunction &MF,
1084                                            const SIRegisterInfo &TRI,
1085                                            SIMachineFunctionInfo &Info) {
1086   if (Info.hasWorkItemIDX()) {
1087     unsigned Reg = AMDGPU::VGPR0;
1088     MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1089 
1090     CCInfo.AllocateReg(Reg);
1091     Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg));
1092   }
1093 
1094   if (Info.hasWorkItemIDY()) {
1095     unsigned Reg = AMDGPU::VGPR1;
1096     MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1097 
1098     CCInfo.AllocateReg(Reg);
1099     Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
1100   }
1101 
1102   if (Info.hasWorkItemIDZ()) {
1103     unsigned Reg = AMDGPU::VGPR2;
1104     MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1105 
1106     CCInfo.AllocateReg(Reg);
1107     Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
1108   }
1109 }
1110 
1111 // Try to allocate a VGPR at the end of the argument list, or if no argument
1112 // VGPRs are left allocating a stack slot.
1113 static ArgDescriptor allocateVGPR32Input(CCState &CCInfo) {
1114   ArrayRef<MCPhysReg> ArgVGPRs
1115     = makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
1116   unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
1117   if (RegIdx == ArgVGPRs.size()) {
1118     // Spill to stack required.
1119     int64_t Offset = CCInfo.AllocateStack(4, 4);
1120 
1121     return ArgDescriptor::createStack(Offset);
1122   }
1123 
1124   unsigned Reg = ArgVGPRs[RegIdx];
1125   Reg = CCInfo.AllocateReg(Reg);
1126   assert(Reg != AMDGPU::NoRegister);
1127 
1128   MachineFunction &MF = CCInfo.getMachineFunction();
1129   MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1130   return ArgDescriptor::createRegister(Reg);
1131 }
1132 
1133 static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
1134                                              const TargetRegisterClass *RC,
1135                                              unsigned NumArgRegs) {
1136   ArrayRef<MCPhysReg> ArgSGPRs = makeArrayRef(RC->begin(), 32);
1137   unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
1138   if (RegIdx == ArgSGPRs.size())
1139     report_fatal_error("ran out of SGPRs for arguments");
1140 
1141   unsigned Reg = ArgSGPRs[RegIdx];
1142   Reg = CCInfo.AllocateReg(Reg);
1143   assert(Reg != AMDGPU::NoRegister);
1144 
1145   MachineFunction &MF = CCInfo.getMachineFunction();
1146   MF.addLiveIn(Reg, RC);
1147   return ArgDescriptor::createRegister(Reg);
1148 }
1149 
1150 static ArgDescriptor allocateSGPR32Input(CCState &CCInfo) {
1151   return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
1152 }
1153 
1154 static ArgDescriptor allocateSGPR64Input(CCState &CCInfo) {
1155   return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
1156 }
1157 
1158 static void allocateSpecialInputVGPRs(CCState &CCInfo,
1159                                       MachineFunction &MF,
1160                                       const SIRegisterInfo &TRI,
1161                                       SIMachineFunctionInfo &Info) {
1162   if (Info.hasWorkItemIDX())
1163     Info.setWorkItemIDX(allocateVGPR32Input(CCInfo));
1164 
1165   if (Info.hasWorkItemIDY())
1166     Info.setWorkItemIDY(allocateVGPR32Input(CCInfo));
1167 
1168   if (Info.hasWorkItemIDZ())
1169     Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo));
1170 }
1171 
1172 static void allocateSpecialInputSGPRs(CCState &CCInfo,
1173                                       MachineFunction &MF,
1174                                       const SIRegisterInfo &TRI,
1175                                       SIMachineFunctionInfo &Info) {
1176   auto &ArgInfo = Info.getArgInfo();
1177 
1178   // TODO: Unify handling with private memory pointers.
1179 
1180   if (Info.hasDispatchPtr())
1181     ArgInfo.DispatchPtr = allocateSGPR64Input(CCInfo);
1182 
1183   if (Info.hasQueuePtr())
1184     ArgInfo.QueuePtr = allocateSGPR64Input(CCInfo);
1185 
1186   if (Info.hasKernargSegmentPtr())
1187     ArgInfo.KernargSegmentPtr = allocateSGPR64Input(CCInfo);
1188 
1189   if (Info.hasDispatchID())
1190     ArgInfo.DispatchID = allocateSGPR64Input(CCInfo);
1191 
1192   // flat_scratch_init is not applicable for non-kernel functions.
1193 
1194   if (Info.hasWorkGroupIDX())
1195     ArgInfo.WorkGroupIDX = allocateSGPR32Input(CCInfo);
1196 
1197   if (Info.hasWorkGroupIDY())
1198     ArgInfo.WorkGroupIDY = allocateSGPR32Input(CCInfo);
1199 
1200   if (Info.hasWorkGroupIDZ())
1201     ArgInfo.WorkGroupIDZ = allocateSGPR32Input(CCInfo);
1202 
1203   if (Info.hasImplicitArgPtr())
1204     ArgInfo.ImplicitArgPtr = allocateSGPR64Input(CCInfo);
1205 }
1206 
1207 // Allocate special inputs passed in user SGPRs.
1208 static void allocateHSAUserSGPRs(CCState &CCInfo,
1209                                  MachineFunction &MF,
1210                                  const SIRegisterInfo &TRI,
1211                                  SIMachineFunctionInfo &Info) {
1212   if (Info.hasImplicitBufferPtr()) {
1213     unsigned ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
1214     MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
1215     CCInfo.AllocateReg(ImplicitBufferPtrReg);
1216   }
1217 
1218   // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
1219   if (Info.hasPrivateSegmentBuffer()) {
1220     unsigned PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
1221     MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
1222     CCInfo.AllocateReg(PrivateSegmentBufferReg);
1223   }
1224 
1225   if (Info.hasDispatchPtr()) {
1226     unsigned DispatchPtrReg = Info.addDispatchPtr(TRI);
1227     MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
1228     CCInfo.AllocateReg(DispatchPtrReg);
1229   }
1230 
1231   if (Info.hasQueuePtr()) {
1232     unsigned QueuePtrReg = Info.addQueuePtr(TRI);
1233     MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
1234     CCInfo.AllocateReg(QueuePtrReg);
1235   }
1236 
1237   if (Info.hasKernargSegmentPtr()) {
1238     unsigned InputPtrReg = Info.addKernargSegmentPtr(TRI);
1239     MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
1240     CCInfo.AllocateReg(InputPtrReg);
1241   }
1242 
1243   if (Info.hasDispatchID()) {
1244     unsigned DispatchIDReg = Info.addDispatchID(TRI);
1245     MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
1246     CCInfo.AllocateReg(DispatchIDReg);
1247   }
1248 
1249   if (Info.hasFlatScratchInit()) {
1250     unsigned FlatScratchInitReg = Info.addFlatScratchInit(TRI);
1251     MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
1252     CCInfo.AllocateReg(FlatScratchInitReg);
1253   }
1254 
1255   // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
1256   // these from the dispatch pointer.
1257 }
1258 
1259 // Allocate special input registers that are initialized per-wave.
1260 static void allocateSystemSGPRs(CCState &CCInfo,
1261                                 MachineFunction &MF,
1262                                 SIMachineFunctionInfo &Info,
1263                                 CallingConv::ID CallConv,
1264                                 bool IsShader) {
1265   if (Info.hasWorkGroupIDX()) {
1266     unsigned Reg = Info.addWorkGroupIDX();
1267     MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1268     CCInfo.AllocateReg(Reg);
1269   }
1270 
1271   if (Info.hasWorkGroupIDY()) {
1272     unsigned Reg = Info.addWorkGroupIDY();
1273     MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1274     CCInfo.AllocateReg(Reg);
1275   }
1276 
1277   if (Info.hasWorkGroupIDZ()) {
1278     unsigned Reg = Info.addWorkGroupIDZ();
1279     MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1280     CCInfo.AllocateReg(Reg);
1281   }
1282 
1283   if (Info.hasWorkGroupInfo()) {
1284     unsigned Reg = Info.addWorkGroupInfo();
1285     MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1286     CCInfo.AllocateReg(Reg);
1287   }
1288 
1289   if (Info.hasPrivateSegmentWaveByteOffset()) {
1290     // Scratch wave offset passed in system SGPR.
1291     unsigned PrivateSegmentWaveByteOffsetReg;
1292 
1293     if (IsShader) {
1294       PrivateSegmentWaveByteOffsetReg =
1295         Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
1296 
1297       // This is true if the scratch wave byte offset doesn't have a fixed
1298       // location.
1299       if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
1300         PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
1301         Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
1302       }
1303     } else
1304       PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
1305 
1306     MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
1307     CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
1308   }
1309 }
1310 
// Choose the scratch resource register and the scratch wave offset register
// for MF and record them in Info.
//
// Depending on whether the function is compiled for the Code Object V2 ABI,
// whether it has (or is assumed to have) stack objects, and whether it makes
// calls, each register is either the corresponding preloaded input register
// or a register reserved through TRI.
static void reservePrivateMemoryRegs(const TargetMachine &TM,
                                     MachineFunction &MF,
                                     const SIRegisterInfo &TRI,
                                     SIMachineFunctionInfo &Info) {
  // Now that we've figured out where the scratch register inputs are, see if
  // we should reserve the arguments and use them directly.
  MachineFrameInfo &MFI = MF.getFrameInfo();
  bool HasStackObjects = MFI.hasStackObjects();

  // Record that we know we have non-spill stack objects so we don't need to
  // check all stack objects later.
  if (HasStackObjects)
    Info.setHasNonSpillStackObjects(true);

  // Everything live out of a block is spilled with fast regalloc, so it's
  // almost certain that spilling will be required.
  if (TM.getOptLevel() == CodeGenOpt::None)
    HasStackObjects = true;

  // For now assume stack access is needed in any callee functions, so we need
  // the scratch registers to pass in.
  bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();

  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
  if (ST.isAmdCodeObjectV2(MF)) {
    if (RequiresStackAccess) {
      // If we have stack objects, we unquestionably need the private buffer
      // resource. For the Code Object V2 ABI, this will be the first 4 user
      // SGPR inputs. We can reserve those and use them directly.

      unsigned PrivateSegmentBufferReg = Info.getPreloadedReg(
        AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
      Info.setScratchRSrcReg(PrivateSegmentBufferReg);

      if (MFI.hasCalls()) {
        // If we have calls, we need to keep the frame register in a register
        // that won't be clobbered by a call, so ensure it is copied somewhere.

        // This is not a problem for the scratch wave offset, because the same
        // registers are reserved in all functions.

        // FIXME: Nothing is really ensuring this is a call preserved register,
        // it's just selected from the end so it happens to be.
        unsigned ReservedOffsetReg
          = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
        Info.setScratchWaveOffsetReg(ReservedOffsetReg);
      } else {
        // No calls: the preloaded wave byte offset input is used directly.
        unsigned PrivateSegmentWaveByteOffsetReg = Info.getPreloadedReg(
          AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
        Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
      }
    } else {
      unsigned ReservedBufferReg
        = TRI.reservedPrivateSegmentBufferReg(MF);
      unsigned ReservedOffsetReg
        = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);

      // We tentatively reserve the last registers (skipping the last two
      // which may contain VCC). After register allocation, we'll replace
      // these with the ones immediately after those which were really
      // allocated. In the prologue copies will be inserted from the argument
      // to these reserved registers.
      Info.setScratchRSrcReg(ReservedBufferReg);
      Info.setScratchWaveOffsetReg(ReservedOffsetReg);
    }
  } else {
    unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);

    // Without HSA, relocations are used for the scratch pointer and the
    // buffer resource setup is always inserted in the prologue. Scratch wave
    // offset is still in an input SGPR.
    Info.setScratchRSrcReg(ReservedBufferReg);

    // The preloaded wave offset input is used directly only when there are
    // stack objects and no calls; otherwise the reserved register is used.
    if (HasStackObjects && !MFI.hasCalls()) {
      unsigned ScratchWaveOffsetReg = Info.getPreloadedReg(
        AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
      Info.setScratchWaveOffsetReg(ScratchWaveOffsetReg);
    } else {
      unsigned ReservedOffsetReg
        = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
      Info.setScratchWaveOffsetReg(ReservedOffsetReg);
    }
  }
}
1395 
1396 bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const {
1397   const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
1398   return !Info->isEntryFunction();
1399 }
1400 
1401 void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
1402 
1403 }
1404 
1405 void SITargetLowering::insertCopiesSplitCSR(
1406   MachineBasicBlock *Entry,
1407   const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
1408   const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
1409 
1410   const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
1411   if (!IStart)
1412     return;
1413 
1414   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1415   MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
1416   MachineBasicBlock::iterator MBBI = Entry->begin();
1417   for (const MCPhysReg *I = IStart; *I; ++I) {
1418     const TargetRegisterClass *RC = nullptr;
1419     if (AMDGPU::SReg_64RegClass.contains(*I))
1420       RC = &AMDGPU::SGPR_64RegClass;
1421     else if (AMDGPU::SReg_32RegClass.contains(*I))
1422       RC = &AMDGPU::SGPR_32RegClass;
1423     else
1424       llvm_unreachable("Unexpected register class in CSRsViaCopy!");
1425 
1426     unsigned NewVR = MRI->createVirtualRegister(RC);
1427     // Create copy from CSR to a virtual register.
1428     Entry->addLiveIn(*I);
1429     BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
1430       .addReg(*I);
1431 
1432     // Insert the copy-back instructions right before the terminator.
1433     for (auto *Exit : Exits)
1434       BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
1435               TII->get(TargetOpcode::COPY), *I)
1436         .addReg(NewVR);
1437   }
1438 }
1439 
1440 SDValue SITargetLowering::LowerFormalArguments(
1441     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
1442     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
1443     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
1444   const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
1445 
1446   MachineFunction &MF = DAG.getMachineFunction();
1447   FunctionType *FType = MF.getFunction()->getFunctionType();
1448   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1449   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
1450 
1451   if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
1452     const Function *Fn = MF.getFunction();
1453     DiagnosticInfoUnsupported NoGraphicsHSA(
1454         *Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
1455     DAG.getContext()->diagnose(NoGraphicsHSA);
1456     return DAG.getEntryNode();
1457   }
1458 
1459   // Create stack objects that are used for emitting debugger prologue if
1460   // "amdgpu-debugger-emit-prologue" attribute was specified.
1461   if (ST.debuggerEmitPrologue())
1462     createDebuggerPrologueStackObjects(MF);
1463 
1464   SmallVector<ISD::InputArg, 16> Splits;
1465   SmallVector<CCValAssign, 16> ArgLocs;
1466   BitVector Skipped(Ins.size());
1467   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
1468                  *DAG.getContext());
1469 
1470   bool IsShader = AMDGPU::isShader(CallConv);
1471   bool IsKernel = AMDGPU::isKernel(CallConv);
1472   bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
1473 
1474   if (!IsEntryFunc) {
1475     // 4 bytes are reserved at offset 0 for the emergency stack slot. Skip over
1476     // this when allocating argument fixed offsets.
1477     CCInfo.AllocateStack(4, 4);
1478   }
1479 
1480   if (IsShader) {
1481     processShaderInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
1482 
1483     // At least one interpolation mode must be enabled or else the GPU will
1484     // hang.
1485     //
1486     // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
1487     // set PSInputAddr, the user wants to enable some bits after the compilation
1488     // based on run-time states. Since we can't know what the final PSInputEna
1489     // will look like, so we shouldn't do anything here and the user should take
1490     // responsibility for the correct programming.
1491     //
1492     // Otherwise, the following restrictions apply:
1493     // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
1494     // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
1495     //   enabled too.
1496     if (CallConv == CallingConv::AMDGPU_PS &&
1497         ((Info->getPSInputAddr() & 0x7F) == 0 ||
1498          ((Info->getPSInputAddr() & 0xF) == 0 &&
1499           Info->isPSInputAllocated(11)))) {
1500       CCInfo.AllocateReg(AMDGPU::VGPR0);
1501       CCInfo.AllocateReg(AMDGPU::VGPR1);
1502       Info->markPSInputAllocated(0);
1503       Info->markPSInputEnabled(0);
1504     }
1505 
1506     assert(!Info->hasDispatchPtr() &&
1507            !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&
1508            !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
1509            !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
1510            !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
1511            !Info->hasWorkItemIDZ());
1512   } else if (IsKernel) {
1513     assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
1514   } else {
1515     Splits.append(Ins.begin(), Ins.end());
1516   }
1517 
1518   if (IsEntryFunc) {
1519     allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
1520     allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
1521   }
1522 
1523   if (IsKernel) {
1524     analyzeFormalArgumentsCompute(CCInfo, Ins);
1525   } else {
1526     CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
1527     CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
1528   }
1529 
1530   SmallVector<SDValue, 16> Chains;
1531 
1532   for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
1533     const ISD::InputArg &Arg = Ins[i];
1534     if (Skipped[i]) {
1535       InVals.push_back(DAG.getUNDEF(Arg.VT));
1536       continue;
1537     }
1538 
1539     CCValAssign &VA = ArgLocs[ArgIdx++];
1540     MVT VT = VA.getLocVT();
1541 
1542     if (IsEntryFunc && VA.isMemLoc()) {
1543       VT = Ins[i].VT;
1544       EVT MemVT = VA.getLocVT();
1545 
1546       const uint64_t Offset = Subtarget->getExplicitKernelArgOffset(MF) +
1547         VA.getLocMemOffset();
1548       Info->setABIArgOffset(Offset + MemVT.getStoreSize());
1549 
1550       // The first 36 bytes of the input buffer contains information about
1551       // thread group and global sizes.
1552       SDValue Arg = lowerKernargMemParameter(
1553         DAG, VT, MemVT, DL, Chain, Offset, Ins[i].Flags.isSExt(), &Ins[i]);
1554       Chains.push_back(Arg.getValue(1));
1555 
1556       auto *ParamTy =
1557         dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
1558       if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS &&
1559           ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
1560         // On SI local pointers are just offsets into LDS, so they are always
1561         // less than 16-bits.  On CI and newer they could potentially be
1562         // real pointers, so we can't guarantee their size.
1563         Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
1564                           DAG.getValueType(MVT::i16));
1565       }
1566 
1567       InVals.push_back(Arg);
1568       continue;
1569     } else if (!IsEntryFunc && VA.isMemLoc()) {
1570       SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
1571       InVals.push_back(Val);
1572       if (!Arg.Flags.isByVal())
1573         Chains.push_back(Val.getValue(1));
1574       continue;
1575     }
1576 
1577     assert(VA.isRegLoc() && "Parameter must be in a register!");
1578 
1579     unsigned Reg = VA.getLocReg();
1580     const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
1581     EVT ValVT = VA.getValVT();
1582 
1583     Reg = MF.addLiveIn(Reg, RC);
1584     SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
1585 
1586     // If this is an 8 or 16-bit value, it is really passed promoted
1587     // to 32 bits. Insert an assert[sz]ext to capture this, then
1588     // truncate to the right size.
1589     switch (VA.getLocInfo()) {
1590     case CCValAssign::Full:
1591       break;
1592     case CCValAssign::BCvt:
1593       Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
1594       break;
1595     case CCValAssign::SExt:
1596       Val = DAG.getNode(ISD::AssertSext, DL, VT, Val,
1597                         DAG.getValueType(ValVT));
1598       Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
1599       break;
1600     case CCValAssign::ZExt:
1601       Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
1602                         DAG.getValueType(ValVT));
1603       Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
1604       break;
1605     case CCValAssign::AExt:
1606       Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
1607       break;
1608     default:
1609       llvm_unreachable("Unknown loc info!");
1610     }
1611 
1612     if (IsShader && Arg.VT.isVector()) {
1613       // Build a vector from the registers
1614       Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
1615       unsigned NumElements = ParamType->getVectorNumElements();
1616 
1617       SmallVector<SDValue, 4> Regs;
1618       Regs.push_back(Val);
1619       for (unsigned j = 1; j != NumElements; ++j) {
1620         Reg = ArgLocs[ArgIdx++].getLocReg();
1621         Reg = MF.addLiveIn(Reg, RC);
1622 
1623         SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
1624         Regs.push_back(Copy);
1625       }
1626 
1627       // Fill up the missing vector elements
1628       NumElements = Arg.VT.getVectorNumElements() - NumElements;
1629       Regs.append(NumElements, DAG.getUNDEF(VT));
1630 
1631       InVals.push_back(DAG.getBuildVector(Arg.VT, DL, Regs));
1632       continue;
1633     }
1634 
1635     InVals.push_back(Val);
1636   }
1637 
1638   if (!IsEntryFunc) {
1639     // Special inputs come after user arguments.
1640     allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
1641   }
1642 
1643   // Start adding system SGPRs.
1644   if (IsEntryFunc) {
1645     allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader);
1646   } else {
1647     CCInfo.AllocateReg(Info->getScratchRSrcReg());
1648     CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
1649     CCInfo.AllocateReg(Info->getFrameOffsetReg());
1650     allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
1651   }
1652 
1653   auto &ArgUsageInfo =
1654     DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
1655   ArgUsageInfo.setFuncArgInfo(*MF.getFunction(), Info->getArgInfo());
1656 
1657   unsigned StackArgSize = CCInfo.getNextStackOffset();
1658   Info->setBytesInStackArgArea(StackArgSize);
1659 
1660   return Chains.empty() ? Chain :
1661     DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
1662 }
1663 
1664 // TODO: If return values can't fit in registers, we should return as many as
1665 // possible in registers before passing on stack.
1666 bool SITargetLowering::CanLowerReturn(
1667   CallingConv::ID CallConv,
1668   MachineFunction &MF, bool IsVarArg,
1669   const SmallVectorImpl<ISD::OutputArg> &Outs,
1670   LLVMContext &Context) const {
1671   // Replacing returns with sret/stack usage doesn't make sense for shaders.
1672   // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
1673   // for shaders. Vector types should be explicitly handled by CC.
1674   if (AMDGPU::isEntryFunctionCC(CallConv))
1675     return true;
1676 
1677   SmallVector<CCValAssign, 16> RVLocs;
1678   CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
1679   return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg));
1680 }
1681 
1682 SDValue
1683 SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
1684                               bool isVarArg,
1685                               const SmallVectorImpl<ISD::OutputArg> &Outs,
1686                               const SmallVectorImpl<SDValue> &OutVals,
1687                               const SDLoc &DL, SelectionDAG &DAG) const {
1688   MachineFunction &MF = DAG.getMachineFunction();
1689   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1690 
1691   if (AMDGPU::isKernel(CallConv)) {
1692     return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
1693                                              OutVals, DL, DAG);
1694   }
1695 
1696   bool IsShader = AMDGPU::isShader(CallConv);
1697 
1698   Info->setIfReturnsVoid(Outs.size() == 0);
1699   bool IsWaveEnd = Info->returnsVoid() && IsShader;
1700 
1701   SmallVector<ISD::OutputArg, 48> Splits;
1702   SmallVector<SDValue, 48> SplitVals;
1703 
1704   // Split vectors into their elements.
1705   for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
1706     const ISD::OutputArg &Out = Outs[i];
1707 
1708     if (IsShader && Out.VT.isVector()) {
1709       MVT VT = Out.VT.getVectorElementType();
1710       ISD::OutputArg NewOut = Out;
1711       NewOut.Flags.setSplit();
1712       NewOut.VT = VT;
1713 
1714       // We want the original number of vector elements here, e.g.
1715       // three or five, not four or eight.
1716       unsigned NumElements = Out.ArgVT.getVectorNumElements();
1717 
1718       for (unsigned j = 0; j != NumElements; ++j) {
1719         SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, OutVals[i],
1720                                    DAG.getConstant(j, DL, MVT::i32));
1721         SplitVals.push_back(Elem);
1722         Splits.push_back(NewOut);
1723         NewOut.PartOffset += NewOut.VT.getStoreSize();
1724       }
1725     } else {
1726       SplitVals.push_back(OutVals[i]);
1727       Splits.push_back(Out);
1728     }
1729   }
1730 
1731   // CCValAssign - represent the assignment of the return value to a location.
1732   SmallVector<CCValAssign, 48> RVLocs;
1733 
1734   // CCState - Info about the registers and stack slots.
1735   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
1736                  *DAG.getContext());
1737 
1738   // Analyze outgoing return values.
1739   CCInfo.AnalyzeReturn(Splits, CCAssignFnForReturn(CallConv, isVarArg));
1740 
1741   SDValue Flag;
1742   SmallVector<SDValue, 48> RetOps;
1743   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
1744 
1745   // Add return address for callable functions.
1746   if (!Info->isEntryFunction()) {
1747     const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
1748     SDValue ReturnAddrReg = CreateLiveInRegister(
1749       DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
1750 
1751     // FIXME: Should be able to use a vreg here, but need a way to prevent it
1752     // from being allcoated to a CSR.
1753 
1754     SDValue PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
1755                                                 MVT::i64);
1756 
1757     Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, Flag);
1758     Flag = Chain.getValue(1);
1759 
1760     RetOps.push_back(PhysReturnAddrReg);
1761   }
1762 
1763   // Copy the result values into the output registers.
1764   for (unsigned i = 0, realRVLocIdx = 0;
1765        i != RVLocs.size();
1766        ++i, ++realRVLocIdx) {
1767     CCValAssign &VA = RVLocs[i];
1768     assert(VA.isRegLoc() && "Can only return in registers!");
1769     // TODO: Partially return in registers if return values don't fit.
1770 
1771     SDValue Arg = SplitVals[realRVLocIdx];
1772 
1773     // Copied from other backends.
1774     switch (VA.getLocInfo()) {
1775     case CCValAssign::Full:
1776       break;
1777     case CCValAssign::BCvt:
1778       Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
1779       break;
1780     case CCValAssign::SExt:
1781       Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
1782       break;
1783     case CCValAssign::ZExt:
1784       Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
1785       break;
1786     case CCValAssign::AExt:
1787       Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
1788       break;
1789     default:
1790       llvm_unreachable("Unknown loc info!");
1791     }
1792 
1793     Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
1794     Flag = Chain.getValue(1);
1795     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
1796   }
1797 
1798   // FIXME: Does sret work properly?
1799   if (!Info->isEntryFunction()) {
1800     const SIRegisterInfo *TRI
1801       = static_cast<const SISubtarget *>(Subtarget)->getRegisterInfo();
1802     const MCPhysReg *I =
1803       TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
1804     if (I) {
1805       for (; *I; ++I) {
1806         if (AMDGPU::SReg_64RegClass.contains(*I))
1807           RetOps.push_back(DAG.getRegister(*I, MVT::i64));
1808         else if (AMDGPU::SReg_32RegClass.contains(*I))
1809           RetOps.push_back(DAG.getRegister(*I, MVT::i32));
1810         else
1811           llvm_unreachable("Unexpected register class in CSRsViaCopy!");
1812       }
1813     }
1814   }
1815 
1816   // Update chain and glue.
1817   RetOps[0] = Chain;
1818   if (Flag.getNode())
1819     RetOps.push_back(Flag);
1820 
1821   unsigned Opc = AMDGPUISD::ENDPGM;
1822   if (!IsWaveEnd)
1823     Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_FLAG;
1824   return DAG.getNode(Opc, DL, MVT::Other, RetOps);
1825 }
1826 
1827 SDValue SITargetLowering::LowerCallResult(
1828     SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg,
1829     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
1830     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
1831     SDValue ThisVal) const {
1832   CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
1833 
1834   // Assign locations to each value returned by this call.
1835   SmallVector<CCValAssign, 16> RVLocs;
1836   CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
1837                  *DAG.getContext());
1838   CCInfo.AnalyzeCallResult(Ins, RetCC);
1839 
1840   // Copy all of the result registers out of their specified physreg.
1841   for (unsigned i = 0; i != RVLocs.size(); ++i) {
1842     CCValAssign VA = RVLocs[i];
1843     SDValue Val;
1844 
1845     if (VA.isRegLoc()) {
1846       Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
1847       Chain = Val.getValue(1);
1848       InFlag = Val.getValue(2);
1849     } else if (VA.isMemLoc()) {
1850       report_fatal_error("TODO: return values in memory");
1851     } else
1852       llvm_unreachable("unknown argument location type");
1853 
1854     switch (VA.getLocInfo()) {
1855     case CCValAssign::Full:
1856       break;
1857     case CCValAssign::BCvt:
1858       Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
1859       break;
1860     case CCValAssign::ZExt:
1861       Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
1862                         DAG.getValueType(VA.getValVT()));
1863       Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
1864       break;
1865     case CCValAssign::SExt:
1866       Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
1867                         DAG.getValueType(VA.getValVT()));
1868       Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
1869       break;
1870     case CCValAssign::AExt:
1871       Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
1872       break;
1873     default:
1874       llvm_unreachable("Unknown loc info!");
1875     }
1876 
1877     InVals.push_back(Val);
1878   }
1879 
1880   return Chain;
1881 }
1882 
1883 // Add code to pass special inputs required depending on used features separate
1884 // from the explicit user arguments present in the IR.
1885 void SITargetLowering::passSpecialInputs(
1886     CallLoweringInfo &CLI,
1887     const SIMachineFunctionInfo &Info,
1888     SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
1889     SmallVectorImpl<SDValue> &MemOpChains,
1890     SDValue Chain,
1891     SDValue StackPtr) const {
1892   // If we don't have a call site, this was a call inserted by
1893   // legalization. These can never use special inputs.
1894   if (!CLI.CS)
1895     return;
1896 
1897   const Function *CalleeFunc = CLI.CS.getCalledFunction();
1898   assert(CalleeFunc);
1899 
1900   SelectionDAG &DAG = CLI.DAG;
1901   const SDLoc &DL = CLI.DL;
1902 
1903   const SISubtarget *ST = getSubtarget();
1904   const SIRegisterInfo *TRI = ST->getRegisterInfo();
1905 
1906   auto &ArgUsageInfo =
1907     DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
1908   const AMDGPUFunctionArgInfo &CalleeArgInfo
1909     = ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
1910 
1911   const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
1912 
1913   // TODO: Unify with private memory register handling. This is complicated by
1914   // the fact that at least in kernels, the input argument is not necessarily
1915   // in the same location as the input.
1916   AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = {
1917     AMDGPUFunctionArgInfo::DISPATCH_PTR,
1918     AMDGPUFunctionArgInfo::QUEUE_PTR,
1919     AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR,
1920     AMDGPUFunctionArgInfo::DISPATCH_ID,
1921     AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
1922     AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
1923     AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
1924     AMDGPUFunctionArgInfo::WORKITEM_ID_X,
1925     AMDGPUFunctionArgInfo::WORKITEM_ID_Y,
1926     AMDGPUFunctionArgInfo::WORKITEM_ID_Z,
1927     AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR
1928   };
1929 
1930   for (auto InputID : InputRegs) {
1931     const ArgDescriptor *OutgoingArg;
1932     const TargetRegisterClass *ArgRC;
1933 
1934     std::tie(OutgoingArg, ArgRC) = CalleeArgInfo.getPreloadedValue(InputID);
1935     if (!OutgoingArg)
1936       continue;
1937 
1938     const ArgDescriptor *IncomingArg;
1939     const TargetRegisterClass *IncomingArgRC;
1940     std::tie(IncomingArg, IncomingArgRC)
1941       = CallerArgInfo.getPreloadedValue(InputID);
1942     assert(IncomingArgRC == ArgRC);
1943 
1944     // All special arguments are ints for now.
1945     EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
1946     SDValue InputReg;
1947 
1948     if (IncomingArg) {
1949       InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
1950     } else {
1951       // The implicit arg ptr is special because it doesn't have a corresponding
1952       // input for kernels, and is computed from the kernarg segment pointer.
1953       assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
1954       InputReg = getImplicitArgPtr(DAG, DL);
1955     }
1956 
1957     if (OutgoingArg->isRegister()) {
1958       RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
1959     } else {
1960       SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, StackPtr,
1961                                               InputReg,
1962                                               OutgoingArg->getStackOffset());
1963       MemOpChains.push_back(ArgStore);
1964     }
1965   }
1966 }
1967 
1968 static bool canGuaranteeTCO(CallingConv::ID CC) {
1969   return CC == CallingConv::Fast;
1970 }
1971 
1972 /// Return true if we might ever do TCO for calls with this calling convention.
1973 static bool mayTailCallThisCC(CallingConv::ID CC) {
1974   switch (CC) {
1975   case CallingConv::C:
1976     return true;
1977   default:
1978     return canGuaranteeTCO(CC);
1979   }
1980 }
1981 
1982 bool SITargetLowering::isEligibleForTailCallOptimization(
1983     SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
1984     const SmallVectorImpl<ISD::OutputArg> &Outs,
1985     const SmallVectorImpl<SDValue> &OutVals,
1986     const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
1987   if (!mayTailCallThisCC(CalleeCC))
1988     return false;
1989 
1990   MachineFunction &MF = DAG.getMachineFunction();
1991   const Function *CallerF = MF.getFunction();
1992   CallingConv::ID CallerCC = CallerF->getCallingConv();
1993   const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
1994   const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
1995 
1996   // Kernels aren't callable, and don't have a live in return address so it
1997   // doesn't make sense to do a tail call with entry functions.
1998   if (!CallerPreserved)
1999     return false;
2000 
2001   bool CCMatch = CallerCC == CalleeCC;
2002 
2003   if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
2004     if (canGuaranteeTCO(CalleeCC) && CCMatch)
2005       return true;
2006     return false;
2007   }
2008 
2009   // TODO: Can we handle var args?
2010   if (IsVarArg)
2011     return false;
2012 
2013   for (const Argument &Arg : CallerF->args()) {
2014     if (Arg.hasByValAttr())
2015       return false;
2016   }
2017 
2018   LLVMContext &Ctx = *DAG.getContext();
2019 
2020   // Check that the call results are passed in the same way.
2021   if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
2022                                   CCAssignFnForCall(CalleeCC, IsVarArg),
2023                                   CCAssignFnForCall(CallerCC, IsVarArg)))
2024     return false;
2025 
2026   // The callee has to preserve all registers the caller needs to preserve.
2027   if (!CCMatch) {
2028     const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
2029     if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
2030       return false;
2031   }
2032 
2033   // Nothing more to check if the callee is taking no arguments.
2034   if (Outs.empty())
2035     return true;
2036 
2037   SmallVector<CCValAssign, 16> ArgLocs;
2038   CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
2039 
2040   CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
2041 
2042   const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
2043   // If the stack arguments for this call do not fit into our own save area then
2044   // the call cannot be made tail.
2045   // TODO: Is this really necessary?
2046   if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
2047     return false;
2048 
2049   const MachineRegisterInfo &MRI = MF.getRegInfo();
2050   return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
2051 }
2052 
2053 bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
2054   if (!CI->isTailCall())
2055     return false;
2056 
2057   const Function *ParentFn = CI->getParent()->getParent();
2058   if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
2059     return false;
2060 
2061   auto Attr = ParentFn->getFnAttribute("disable-tail-calls");
2062   return (Attr.getValueAsString() != "true");
2063 }
2064 
2065 // The wave scratch offset register is used as the global base pointer.
2066 SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
2067                                     SmallVectorImpl<SDValue> &InVals) const {
2068   SelectionDAG &DAG = CLI.DAG;
2069   const SDLoc &DL = CLI.DL;
2070   SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
2071   SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
2072   SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
2073   SDValue Chain = CLI.Chain;
2074   SDValue Callee = CLI.Callee;
2075   bool &IsTailCall = CLI.IsTailCall;
2076   CallingConv::ID CallConv = CLI.CallConv;
2077   bool IsVarArg = CLI.IsVarArg;
2078   bool IsSibCall = false;
2079   bool IsThisReturn = false;
2080   MachineFunction &MF = DAG.getMachineFunction();
2081 
2082   if (IsVarArg) {
2083     return lowerUnhandledCall(CLI, InVals,
2084                               "unsupported call to variadic function ");
2085   }
2086 
2087   if (!CLI.CS.getCalledFunction()) {
2088     return lowerUnhandledCall(CLI, InVals,
2089                               "unsupported indirect call to function ");
2090   }
2091 
2092   if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
2093     return lowerUnhandledCall(CLI, InVals,
2094                               "unsupported required tail call to function ");
2095   }
2096 
2097   // The first 4 bytes are reserved for the callee's emergency stack slot.
2098   const unsigned CalleeUsableStackOffset = 4;
2099 
2100   if (IsTailCall) {
2101     IsTailCall = isEligibleForTailCallOptimization(
2102       Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
2103     if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall()) {
2104       report_fatal_error("failed to perform tail call elimination on a call "
2105                          "site marked musttail");
2106     }
2107 
2108     bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
2109 
2110     // A sibling call is one where we're under the usual C ABI and not planning
2111     // to change that but can still do a tail call:
2112     if (!TailCallOpt && IsTailCall)
2113       IsSibCall = true;
2114 
2115     if (IsTailCall)
2116       ++NumTailCalls;
2117   }
2118 
2119   if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Callee)) {
2120     // FIXME: Remove this hack for function pointer types.
2121     const GlobalValue *GV = GA->getGlobal();
2122     assert(Callee.getValueType() == MVT::i32);
2123     Callee = DAG.getGlobalAddress(GV, DL, MVT::i64, GA->getOffset(),
2124                                   false, GA->getTargetFlags());
2125   }
2126 
2127   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2128 
2129   // Analyze operands of the call, assigning locations to each operand.
2130   SmallVector<CCValAssign, 16> ArgLocs;
2131   CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
2132   CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
2133   CCInfo.AnalyzeCallOperands(Outs, AssignFn);
2134 
2135   // Get a count of how many bytes are to be pushed on the stack.
2136   unsigned NumBytes = CCInfo.getNextStackOffset();
2137 
2138   if (IsSibCall) {
2139     // Since we're not changing the ABI to make this a tail call, the memory
2140     // operands are already available in the caller's incoming argument space.
2141     NumBytes = 0;
2142   }
2143 
2144   // FPDiff is the byte offset of the call's argument area from the callee's.
2145   // Stores to callee stack arguments will be placed in FixedStackSlots offset
2146   // by this amount for a tail call. In a sibling call it must be 0 because the
2147   // caller will deallocate the entire stack and the callee still expects its
2148   // arguments to begin at SP+0. Completely unused for non-tail calls.
2149   int32_t FPDiff = 0;
2150   MachineFrameInfo &MFI = MF.getFrameInfo();
2151   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2152 
2153   SDValue CallerSavedFP;
2154 
2155   // Adjust the stack pointer for the new arguments...
2156   // These operations are automatically eliminated by the prolog/epilog pass
2157   if (!IsSibCall) {
2158     Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
2159 
2160     unsigned OffsetReg = Info->getScratchWaveOffsetReg();
2161 
2162     // In the HSA case, this should be an identity copy.
2163     SDValue ScratchRSrcReg
2164       = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
2165     RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
2166 
2167     // TODO: Don't hardcode these registers and get from the callee function.
2168     SDValue ScratchWaveOffsetReg
2169       = DAG.getCopyFromReg(Chain, DL, OffsetReg, MVT::i32);
2170     RegsToPass.emplace_back(AMDGPU::SGPR4, ScratchWaveOffsetReg);
2171 
2172     if (!Info->isEntryFunction()) {
2173       // Avoid clobbering this function's FP value. In the current convention
2174       // callee will overwrite this, so do save/restore around the call site.
2175       CallerSavedFP = DAG.getCopyFromReg(Chain, DL,
2176                                          Info->getFrameOffsetReg(), MVT::i32);
2177     }
2178   }
2179 
2180   // Stack pointer relative accesses are done by changing the offset SGPR. This
2181   // is just the VGPR offset component.
2182   SDValue StackPtr = DAG.getConstant(CalleeUsableStackOffset, DL, MVT::i32);
2183 
2184   SmallVector<SDValue, 8> MemOpChains;
2185   MVT PtrVT = MVT::i32;
2186 
2187   // Walk the register/memloc assignments, inserting copies/loads.
2188   for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
2189        ++i, ++realArgIdx) {
2190     CCValAssign &VA = ArgLocs[i];
2191     SDValue Arg = OutVals[realArgIdx];
2192 
2193     // Promote the value if needed.
2194     switch (VA.getLocInfo()) {
2195     case CCValAssign::Full:
2196       break;
2197     case CCValAssign::BCvt:
2198       Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
2199       break;
2200     case CCValAssign::ZExt:
2201       Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
2202       break;
2203     case CCValAssign::SExt:
2204       Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
2205       break;
2206     case CCValAssign::AExt:
2207       Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
2208       break;
2209     case CCValAssign::FPExt:
2210       Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
2211       break;
2212     default:
2213       llvm_unreachable("Unknown loc info!");
2214     }
2215 
2216     if (VA.isRegLoc()) {
2217       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2218     } else {
2219       assert(VA.isMemLoc());
2220 
2221       SDValue DstAddr;
2222       MachinePointerInfo DstInfo;
2223 
2224       unsigned LocMemOffset = VA.getLocMemOffset();
2225       int32_t Offset = LocMemOffset;
2226       SDValue PtrOff = DAG.getConstant(Offset, DL, MVT::i32);
2227       PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
2228 
2229       if (IsTailCall) {
2230         ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
2231         unsigned OpSize = Flags.isByVal() ?
2232           Flags.getByValSize() : VA.getValVT().getStoreSize();
2233 
2234         Offset = Offset + FPDiff;
2235         int FI = MFI.CreateFixedObject(OpSize, Offset, true);
2236 
2237         DstAddr = DAG.getFrameIndex(FI, PtrVT);
2238         DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, DstAddr, StackPtr);
2239         DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
2240 
2241         // Make sure any stack arguments overlapping with where we're storing
2242         // are loaded before this eventual operation. Otherwise they'll be
2243         // clobbered.
2244 
2245         // FIXME: Why is this really necessary? This seems to just result in a
2246         // lot of code to copy the stack and write them back to the same
2247         // locations, which are supposed to be immutable?
2248         Chain = addTokenForArgument(Chain, DAG, MFI, FI);
2249       } else {
2250         DstAddr = PtrOff;
2251         DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
2252       }
2253 
2254       if (Outs[i].Flags.isByVal()) {
2255         SDValue SizeNode =
2256             DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
2257         SDValue Cpy = DAG.getMemcpy(
2258             Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
2259             /*isVol = */ false, /*AlwaysInline = */ true,
2260             /*isTailCall = */ false,
2261             DstInfo, MachinePointerInfo());
2262 
2263         MemOpChains.push_back(Cpy);
2264       } else {
2265         SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
2266         MemOpChains.push_back(Store);
2267       }
2268     }
2269   }
2270 
2271   // Copy special input registers after user input arguments.
2272   passSpecialInputs(CLI, *Info, RegsToPass, MemOpChains, Chain, StackPtr);
2273 
2274   if (!MemOpChains.empty())
2275     Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
2276 
2277   // Build a sequence of copy-to-reg nodes chained together with token chain
2278   // and flag operands which copy the outgoing args into the appropriate regs.
2279   SDValue InFlag;
2280   for (auto &RegToPass : RegsToPass) {
2281     Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
2282                              RegToPass.second, InFlag);
2283     InFlag = Chain.getValue(1);
2284   }
2285 
2286 
2287   SDValue PhysReturnAddrReg;
2288   if (IsTailCall) {
2289     // Since the return is being combined with the call, we need to pass on the
2290     // return address.
2291 
2292     const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2293     SDValue ReturnAddrReg = CreateLiveInRegister(
2294       DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
2295 
2296     PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
2297                                         MVT::i64);
2298     Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, InFlag);
2299     InFlag = Chain.getValue(1);
2300   }
2301 
2302   // We don't usually want to end the call-sequence here because we would tidy
2303   // the frame up *after* the call, however in the ABI-changing tail-call case
2304   // we've carefully laid out the parameters so that when sp is reset they'll be
2305   // in the correct location.
2306   if (IsTailCall && !IsSibCall) {
2307     Chain = DAG.getCALLSEQ_END(Chain,
2308                                DAG.getTargetConstant(NumBytes, DL, MVT::i32),
2309                                DAG.getTargetConstant(0, DL, MVT::i32),
2310                                InFlag, DL);
2311     InFlag = Chain.getValue(1);
2312   }
2313 
2314   std::vector<SDValue> Ops;
2315   Ops.push_back(Chain);
2316   Ops.push_back(Callee);
2317 
2318   if (IsTailCall) {
2319     // Each tail call may have to adjust the stack by a different amount, so
2320     // this information must travel along with the operation for eventual
2321     // consumption by emitEpilogue.
2322     Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
2323 
2324     Ops.push_back(PhysReturnAddrReg);
2325   }
2326 
2327   // Add argument registers to the end of the list so that they are known live
2328   // into the call.
2329   for (auto &RegToPass : RegsToPass) {
2330     Ops.push_back(DAG.getRegister(RegToPass.first,
2331                                   RegToPass.second.getValueType()));
2332   }
2333 
2334   // Add a register mask operand representing the call-preserved registers.
2335 
2336   const AMDGPURegisterInfo *TRI = Subtarget->getRegisterInfo();
2337   const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
2338   assert(Mask && "Missing call preserved mask for calling convention");
2339   Ops.push_back(DAG.getRegisterMask(Mask));
2340 
2341   if (InFlag.getNode())
2342     Ops.push_back(InFlag);
2343 
2344   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
2345 
  // If we're doing a tail call, use a TC_RETURN here rather than an
2347   // actual call instruction.
2348   if (IsTailCall) {
2349     MFI.setHasTailCall();
2350     return DAG.getNode(AMDGPUISD::TC_RETURN, DL, NodeTys, Ops);
2351   }
2352 
2353   // Returns a chain and a flag for retval copy to use.
2354   SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
2355   Chain = Call.getValue(0);
2356   InFlag = Call.getValue(1);
2357 
2358   if (CallerSavedFP) {
2359     SDValue FPReg = DAG.getRegister(Info->getFrameOffsetReg(), MVT::i32);
2360     Chain = DAG.getCopyToReg(Chain, DL, FPReg, CallerSavedFP, InFlag);
2361     InFlag = Chain.getValue(1);
2362   }
2363 
2364   uint64_t CalleePopBytes = NumBytes;
2365   Chain = DAG.getCALLSEQ_END(Chain, DAG.getTargetConstant(0, DL, MVT::i32),
2366                              DAG.getTargetConstant(CalleePopBytes, DL, MVT::i32),
2367                              InFlag, DL);
2368   if (!Ins.empty())
2369     InFlag = Chain.getValue(1);
2370 
2371   // Handle result values, copying them out of physregs into vregs that we
2372   // return.
2373   return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
2374                          InVals, IsThisReturn,
2375                          IsThisReturn ? OutVals[0] : SDValue());
2376 }
2377 
2378 unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
2379                                              SelectionDAG &DAG) const {
2380   unsigned Reg = StringSwitch<unsigned>(RegName)
2381     .Case("m0", AMDGPU::M0)
2382     .Case("exec", AMDGPU::EXEC)
2383     .Case("exec_lo", AMDGPU::EXEC_LO)
2384     .Case("exec_hi", AMDGPU::EXEC_HI)
2385     .Case("flat_scratch", AMDGPU::FLAT_SCR)
2386     .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
2387     .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
2388     .Default(AMDGPU::NoRegister);
2389 
2390   if (Reg == AMDGPU::NoRegister) {
2391     report_fatal_error(Twine("invalid register name \""
2392                              + StringRef(RegName)  + "\"."));
2393 
2394   }
2395 
2396   if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS &&
2397       Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
2398     report_fatal_error(Twine("invalid register \""
2399                              + StringRef(RegName)  + "\" for subtarget."));
2400   }
2401 
2402   switch (Reg) {
2403   case AMDGPU::M0:
2404   case AMDGPU::EXEC_LO:
2405   case AMDGPU::EXEC_HI:
2406   case AMDGPU::FLAT_SCR_LO:
2407   case AMDGPU::FLAT_SCR_HI:
2408     if (VT.getSizeInBits() == 32)
2409       return Reg;
2410     break;
2411   case AMDGPU::EXEC:
2412   case AMDGPU::FLAT_SCR:
2413     if (VT.getSizeInBits() == 64)
2414       return Reg;
2415     break;
2416   default:
2417     llvm_unreachable("missing register type checking");
2418   }
2419 
2420   report_fatal_error(Twine("invalid type for register \""
2421                            + StringRef(RegName) + "\"."));
2422 }
2423 
2424 // If kill is not the last instruction, split the block so kill is always a
2425 // proper terminator.
2426 MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI,
2427                                                     MachineBasicBlock *BB) const {
2428   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
2429 
2430   MachineBasicBlock::iterator SplitPoint(&MI);
2431   ++SplitPoint;
2432 
2433   if (SplitPoint == BB->end()) {
2434     // Don't bother with a new block.
2435     MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR));
2436     return BB;
2437   }
2438 
2439   MachineFunction *MF = BB->getParent();
2440   MachineBasicBlock *SplitBB
2441     = MF->CreateMachineBasicBlock(BB->getBasicBlock());
2442 
2443   MF->insert(++MachineFunction::iterator(BB), SplitBB);
2444   SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end());
2445 
2446   SplitBB->transferSuccessorsAndUpdatePHIs(BB);
2447   BB->addSuccessor(SplitBB);
2448 
2449   MI.setDesc(TII->get(AMDGPU::SI_KILL_TERMINATOR));
2450   return SplitBB;
2451 }
2452 
// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
// wavefront. If the value is uniform and just happens to be in a VGPR, this
// will only do one iteration. In the worst case, this will loop 64 times.
//
// This emits the loop skeleton into \p LoopBB (which branches back to itself):
//   - a PHI defining \p PhiReg from \p InitReg (incoming from \p OrigBB) and
//     \p ResultReg (incoming from \p LoopBB),
//   - a PHI over the saved exec values (\p InitSaveExecReg from \p OrigBB),
//   - V_READFIRSTLANE_B32 / V_CMP_EQ_U32_e64 on the index register,
//   - either a write to M0 or, when \p UseGPRIdxMode is set, an
//     S_SET_GPR_IDX_IDX; \p Offset is added to the index when it is nonzero,
//   - S_AND_SAVEEXEC_B64 and S_XOR_B64 on EXEC, followed by the
//     S_CBRANCH_EXECNZ back edge.
//
// Returns an iterator to the S_XOR_B64 instruction. The indexed move itself is
// not emitted here; the callers in this file insert it before the returned
// iterator.
//
// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
  const SIInstrInfo *TII,
  MachineRegisterInfo &MRI,
  MachineBasicBlock &OrigBB,
  MachineBasicBlock &LoopBB,
  const DebugLoc &DL,
  const MachineOperand &IdxReg,
  unsigned InitReg,
  unsigned ResultReg,
  unsigned PhiReg,
  unsigned InitSaveExecReg,
  int Offset,
  bool UseGPRIdxMode) {
  // Everything below is inserted in front of this position.
  MachineBasicBlock::iterator I = LoopBB.begin();

  // PhiExec is only defined by the PHI below; it is not referenced again in
  // this function.
  unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
  unsigned NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
  unsigned CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  unsigned CondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);

  // Loop-carried value: InitReg on entry from OrigBB, ResultReg on the back
  // edge.
  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
    .addReg(InitReg)
    .addMBB(&OrigBB)
    .addReg(ResultReg)
    .addMBB(&LoopBB);

  // Loop-carried exec value: InitSaveExecReg on entry, NewExec on the back
  // edge.
  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
    .addReg(InitSaveExecReg)
    .addMBB(&OrigBB)
    .addReg(NewExec)
    .addMBB(&LoopBB);

  // Read the next variant <- also loop target.
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
    .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));

  // Compare the just read index value to all possible Idx values.
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
    .addReg(CurrentIdxReg)
    .addReg(IdxReg.getReg(), 0, IdxReg.getSubReg());

  if (UseGPRIdxMode) {
    // Note: this local shadows the IdxReg operand parameter for the rest of
    // this scope.
    unsigned IdxReg;
    if (Offset == 0) {
      IdxReg = CurrentIdxReg;
    } else {
      // Fold the constant offset into a fresh SGPR.
      IdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), IdxReg)
        .addReg(CurrentIdxReg, RegState::Kill)
        .addImm(Offset);
    }

    MachineInstr *SetIdx =
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_IDX))
      .addReg(IdxReg, RegState::Kill);
    // NOTE(review): operand 2 is presumably an implicit use coming from the
    // instruction description -- confirm against the instruction definition.
    SetIdx->getOperand(2).setIsUndef();
  } else {
    // Move the index (plus Offset, if any) from CurrentIdxReg into M0.
    if (Offset == 0) {
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
        .addReg(CurrentIdxReg, RegState::Kill);
    } else {
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
        .addReg(CurrentIdxReg, RegState::Kill)
        .addImm(Offset);
    }
  }

  // Update EXEC from CondReg; the result of S_AND_SAVEEXEC_B64 is written to
  // NewExec.
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec)
    .addReg(CondReg, RegState::Kill);

  // Hint the register allocator to assign NewExec and CondReg the same
  // register.
  MRI.setSimpleHint(NewExec, CondReg);

  // Update EXEC, switch all done bits to 0 and all todo bits to 1.
  // This instruction is also the insertion point handed back to the caller.
  MachineInstr *InsertPt =
    BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
    .addReg(AMDGPU::EXEC)
    .addReg(NewExec);

  // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
  // s_cbranch_scc0?

  // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addMBB(&LoopBB);

  return InsertPt->getIterator();
}
2547 
// Split \p MBB at \p MI and build a loop block between the two halves that
// iterates over the index values held in the VGPR `idx` operand of \p MI.
//
// Before the split, EXEC is copied into a fresh register; it is restored at the
// start of the remainder block. \p MI and everything after it are moved into
// the remainder block. The loop body itself is produced by
// emitLoadM0FromVGPRLoop, using \p InitResultReg as the initial PHI input,
// operand 0 of \p MI as the loop-carried result and \p PhiReg as the PHI
// destination.
//
// Returns the insertion point inside the loop block at which the caller should
// place the indexed move.
//
// This has slightly sub-optimal regalloc when the source vector is killed by
// the read. The register allocator does not understand that the kill is
// per-workitem, so is kept alive for the whole loop so we end up not re-using a
// subregister from it, using 1 more VGPR than necessary. This was saved when
// this was expanded after register allocation.
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
                                                  MachineBasicBlock &MBB,
                                                  MachineInstr &MI,
                                                  unsigned InitResultReg,
                                                  unsigned PhiReg,
                                                  int Offset,
                                                  bool UseGPRIdxMode) {
  MachineFunction *MF = MBB.getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock::iterator I(&MI);

  unsigned DstReg = MI.getOperand(0).getReg();
  unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
  unsigned TmpExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);

  // TmpExec only serves as the entry value of the exec PHI in the loop, so an
  // undefined value is sufficient.
  BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);

  // Save the EXEC mask
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), SaveExec)
    .addReg(AMDGPU::EXEC);

  // To insert the loop we need to split the block. Move everything after this
  // point to a new block, and insert a new empty block between the two.
  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;

  // Resulting layout: MBB, LoopBB, RemainderBB.
  MF->insert(MBBI, LoopBB);
  MF->insert(MBBI, RemainderBB);

  // The loop either repeats or falls out into the remainder.
  LoopBB->addSuccessor(LoopBB);
  LoopBB->addSuccessor(RemainderBB);

  // Move the rest of the block into a new block.
  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
  RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());

  MBB.addSuccessor(LoopBB);

  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);

  auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
                                      InitResultReg, DstReg, PhiReg, TmpExec,
                                      Offset, UseGPRIdxMode);

  // Restore the EXEC mask saved above as the first instruction of the
  // remainder block.
  MachineBasicBlock::iterator First = RemainderBB->begin();
  BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
    .addReg(SaveExec);

  return InsPt;
}
2606 
2607 // Returns subreg index, offset
2608 static std::pair<unsigned, int>
2609 computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
2610                             const TargetRegisterClass *SuperRC,
2611                             unsigned VecReg,
2612                             int Offset) {
2613   int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
2614 
2615   // Skip out of bounds offsets, or else we would end up using an undefined
2616   // register.
2617   if (Offset >= NumElts || Offset < 0)
2618     return std::make_pair(AMDGPU::sub0, Offset);
2619 
2620   return std::make_pair(AMDGPU::sub0 + Offset, 0);
2621 }
2622 
2623 // Return true if the index is an SGPR and was set.
2624 static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII,
2625                                  MachineRegisterInfo &MRI,
2626                                  MachineInstr &MI,
2627                                  int Offset,
2628                                  bool UseGPRIdxMode,
2629                                  bool IsIndirectSrc) {
2630   MachineBasicBlock *MBB = MI.getParent();
2631   const DebugLoc &DL = MI.getDebugLoc();
2632   MachineBasicBlock::iterator I(&MI);
2633 
2634   const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
2635   const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
2636 
2637   assert(Idx->getReg() != AMDGPU::NoRegister);
2638 
2639   if (!TII->getRegisterInfo().isSGPRClass(IdxRC))
2640     return false;
2641 
2642   if (UseGPRIdxMode) {
2643     unsigned IdxMode = IsIndirectSrc ?
2644       VGPRIndexMode::SRC0_ENABLE : VGPRIndexMode::DST_ENABLE;
2645     if (Offset == 0) {
2646       MachineInstr *SetOn =
2647           BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
2648               .add(*Idx)
2649               .addImm(IdxMode);
2650 
2651       SetOn->getOperand(3).setIsUndef();
2652     } else {
2653       unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
2654       BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
2655           .add(*Idx)
2656           .addImm(Offset);
2657       MachineInstr *SetOn =
2658         BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
2659         .addReg(Tmp, RegState::Kill)
2660         .addImm(IdxMode);
2661 
2662       SetOn->getOperand(3).setIsUndef();
2663     }
2664 
2665     return true;
2666   }
2667 
2668   if (Offset == 0) {
2669     BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2670       .add(*Idx);
2671   } else {
2672     BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
2673       .add(*Idx)
2674       .addImm(Offset);
2675   }
2676 
2677   return true;
2678 }
2679 
// Expand an SI_INDIRECT_SRC_* pseudo \p MI located in \p MBB: read one 32-bit
// element of the `src` vector operand, selected by the `idx` operand plus the
// constant `offset` operand, into operand 0 of \p MI.
//
// When the index is an SGPR the expansion stays within \p MBB, which is
// returned. Control flow needs to be inserted if indexing with a VGPR; in that
// case the block is split and the newly created loop block is returned.
// \p MI is erased in both cases.
static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
                                          MachineBasicBlock &MBB,
                                          const SISubtarget &ST) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  MachineFunction *MF = MBB.getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();

  const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);

  // Fold an in-range constant offset into the subregister index; Offset is
  // updated to whatever remains.
  unsigned SubReg;
  std::tie(SubReg, Offset)
    = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);

  bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);

  // SGPR index: no loop required.
  if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, true)) {
    MachineBasicBlock::iterator I(&MI);
    const DebugLoc &DL = MI.getDebugLoc();

    if (UseGPRIdxMode) {
      // TODO: Look at the uses to avoid the copy. This may require rescheduling
      // to avoid interfering with other uses, so probably requires a new
      // optimization pass.
      BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
        .addReg(SrcReg, RegState::Undef, SubReg)
        .addReg(SrcReg, RegState::Implicit)
        .addReg(AMDGPU::M0, RegState::Implicit);
      // Index mode was switched on by setM0ToIndexFromSGPR; switch it off
      // again right after the move.
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
    } else {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
        .addReg(SrcReg, RegState::Undef, SubReg)
        .addReg(SrcReg, RegState::Implicit);
    }

    MI.eraseFromParent();

    return &MBB;
  }

  // VGPR index: build a loop around the indexed move.
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock::iterator I(&MI);

  unsigned PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  unsigned InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  // Undefined initial value for the loop-carried result PHI.
  BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);

  if (UseGPRIdxMode) {
    MachineInstr *SetOn = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
      .addImm(0) // Reset inside loop.
      .addImm(VGPRIndexMode::SRC0_ENABLE);
    // NOTE(review): operand 3 is presumably an implicit use coming from the
    // instruction description -- confirm against the instruction definition.
    SetOn->getOperand(3).setIsUndef();

    // Disable again after the loop.
    // This is placed after MI, so it moves into the remainder block together
    // with MI when the block is split below.
    BuildMI(MBB, std::next(I), DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
  }

  auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset, UseGPRIdxMode);
  MachineBasicBlock *LoopBB = InsPt->getParent();

  // Emit the indexed read inside the loop at the returned insertion point.
  if (UseGPRIdxMode) {
    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
      .addReg(SrcReg, RegState::Undef, SubReg)
      .addReg(SrcReg, RegState::Implicit)
      .addReg(AMDGPU::M0, RegState::Implicit);
  } else {
    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
      .addReg(SrcReg, RegState::Undef, SubReg)
      .addReg(SrcReg, RegState::Implicit);
  }

  MI.eraseFromParent();

  return LoopBB;
}
2761 
2762 static unsigned getMOVRELDPseudo(const SIRegisterInfo &TRI,
2763                                  const TargetRegisterClass *VecRC) {
2764   switch (TRI.getRegSizeInBits(*VecRC)) {
2765   case 32: // 4 bytes
2766     return AMDGPU::V_MOVRELD_B32_V1;
2767   case 64: // 8 bytes
2768     return AMDGPU::V_MOVRELD_B32_V2;
2769   case 128: // 16 bytes
2770     return AMDGPU::V_MOVRELD_B32_V4;
2771   case 256: // 32 bytes
2772     return AMDGPU::V_MOVRELD_B32_V8;
2773   case 512: // 64 bytes
2774     return AMDGPU::V_MOVRELD_B32_V16;
2775   default:
2776     llvm_unreachable("unsupported size for MOVRELD pseudos");
2777   }
2778 }
2779 
// Expand an SI_INDIRECT_DST_* pseudo \p MI located in \p MBB: produce operand 0
// of \p MI as the `src` vector with the `val` operand written into the element
// selected by the `idx` operand plus the constant `offset` operand.
//
// Three cases are handled:
//   - no index register: a plain INSERT_SUBREG is emitted in \p MBB;
//   - SGPR index: the indexed write is emitted in \p MBB;
//   - otherwise: the block is split and the write is emitted inside a loop.
// Returns \p MBB for the first two cases and the loop block for the last one.
// \p MI is erased in all cases.
static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
                                          MachineBasicBlock &MBB,
                                          const SISubtarget &ST) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  MachineFunction *MF = MBB.getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  unsigned Dst = MI.getOperand(0).getReg();
  const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
  const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
  const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
  const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());

  // This can be an immediate, but will be folded later.
  assert(Val->getReg());

  // Fold an in-range constant offset into the subregister index; Offset is
  // updated to whatever remains.
  unsigned SubReg;
  std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
                                                         SrcVec->getReg(),
                                                         Offset);
  bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);

  // No index register: the element is fully described by SubReg, so a static
  // subregister insert is enough.
  if (Idx->getReg() == AMDGPU::NoRegister) {
    MachineBasicBlock::iterator I(&MI);
    const DebugLoc &DL = MI.getDebugLoc();

    assert(Offset == 0);

    BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
        .add(*SrcVec)
        .add(*Val)
        .addImm(SubReg);

    MI.eraseFromParent();
    return &MBB;
  }

  // SGPR index: no loop required.
  if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, false)) {
    MachineBasicBlock::iterator I(&MI);
    const DebugLoc &DL = MI.getDebugLoc();

    if (UseGPRIdxMode) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
          .addReg(SrcVec->getReg(), RegState::Undef, SubReg) // vdst
          .add(*Val)
          .addReg(Dst, RegState::ImplicitDefine)
          .addReg(SrcVec->getReg(), RegState::Implicit)
          .addReg(AMDGPU::M0, RegState::Implicit);

      // Index mode was switched on by setM0ToIndexFromSGPR; switch it off
      // again right after the move.
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
    } else {
      const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));

      // The last operand is the element number relative to sub0.
      BuildMI(MBB, I, DL, MovRelDesc)
          .addReg(Dst, RegState::Define)
          .addReg(SrcVec->getReg())
          .add(*Val)
          .addImm(SubReg - AMDGPU::sub0);
    }

    MI.eraseFromParent();
    return &MBB;
  }

  // VGPR index: the value is used inside a loop below, so drop the kill flags
  // on its register.
  if (Val->isReg())
    MRI.clearKillFlags(Val->getReg());

  const DebugLoc &DL = MI.getDebugLoc();

  if (UseGPRIdxMode) {
    MachineBasicBlock::iterator I(&MI);

    MachineInstr *SetOn = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
      .addImm(0) // Reset inside loop.
      .addImm(VGPRIndexMode::DST_ENABLE);
    // NOTE(review): operand 3 is presumably an implicit use coming from the
    // instruction description -- confirm against the instruction definition.
    SetOn->getOperand(3).setIsUndef();

    // Disable again after the loop.
    // This is placed after MI, so it moves into the remainder block together
    // with MI when the block is split below.
    BuildMI(MBB, std::next(I), DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
  }

  // Loop-carried copy of the vector being updated.
  unsigned PhiReg = MRI.createVirtualRegister(VecRC);

  auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg,
                              Offset, UseGPRIdxMode);
  MachineBasicBlock *LoopBB = InsPt->getParent();

  // Emit the indexed write inside the loop at the returned insertion point.
  if (UseGPRIdxMode) {
    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
        .addReg(PhiReg, RegState::Undef, SubReg) // vdst
        .add(*Val)                               // src0
        .addReg(Dst, RegState::ImplicitDefine)
        .addReg(PhiReg, RegState::Implicit)
        .addReg(AMDGPU::M0, RegState::Implicit);
  } else {
    const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));

    // The last operand is the element number relative to sub0.
    BuildMI(*LoopBB, InsPt, DL, MovRelDesc)
        .addReg(Dst, RegState::Define)
        .addReg(PhiReg)
        .add(*Val)
        .addImm(SubReg - AMDGPU::sub0);
  }

  MI.eraseFromParent();

  return LoopBB;
}
2890 
// Expand the pseudo instruction \p MI, which lives in \p BB.
//
// MIMG instructions without a memory operand get one attached and are
// otherwise left alone. The remaining opcodes handled here are replaced by (or
// augmented with) real machine instructions; anything not listed is forwarded
// to AMDGPUTargetLowering. Returns \p BB for the cases expanded inline, and
// whatever block the helper returns for the indirect-access and kill cases
// (those helpers may split \p BB).
MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
  MachineInstr &MI, MachineBasicBlock *BB) const {

  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
  MachineFunction *MF = BB->getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();

  if (TII->isMIMG(MI)) {
      // Nothing to do if a memory operand is already attached.
      if (!MI.memoperands_empty())
        return BB;
    // Add a memoperand for mimg instructions so that they aren't assumed to
    // be ordered memory instructions.

    MachinePointerInfo PtrInfo(MFI->getImagePSV());
    MachineMemOperand::Flags Flags = MachineMemOperand::MODereferenceable;
    if (MI.mayStore())
      Flags |= MachineMemOperand::MOStore;

    if (MI.mayLoad())
      Flags |= MachineMemOperand::MOLoad;

    auto MMO = MF->getMachineMemOperand(PtrInfo, Flags, 0, 0);
    MI.addMemOperand(*MF, MMO);
    return BB;
  }

  switch (MI.getOpcode()) {
  case AMDGPU::SI_INIT_M0:
    // Replace the pseudo with a move of its operand into M0.
    BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
            TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
        .add(MI.getOperand(0));
    MI.eraseFromParent();
    return BB;

  case AMDGPU::SI_INIT_EXEC:
    // This should be before all vector instructions.
    // The move is therefore inserted at the very beginning of the block rather
    // than at the position of MI.
    BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
            AMDGPU::EXEC)
        .addImm(MI.getOperand(0).getImm());
    MI.eraseFromParent();
    return BB;

  case AMDGPU::SI_INIT_EXEC_FROM_INPUT: {
    // Extract the thread count from an SGPR input and set EXEC accordingly.
    // Since BFM can't shift by 64, handle that case with CMP + CMOV.
    //
    // S_BFE_U32 count, input, {shift, 7}
    // S_BFM_B64 exec, count, 0
    // S_CMP_EQ_U32 count, 64
    // S_CMOV_B64 exec, -1
    MachineInstr *FirstMI = &*BB->begin();
    MachineRegisterInfo &MRI = MF->getRegInfo();
    unsigned InputReg = MI.getOperand(0).getReg();
    unsigned CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
    bool Found = false;

    // Move the COPY of the input reg to the beginning, so that we can use it.
    for (auto I = BB->begin(); I != &MI; I++) {
      if (I->getOpcode() != TargetOpcode::COPY ||
          I->getOperand(0).getReg() != InputReg)
        continue;

      if (I == FirstMI) {
        // The COPY is already first; insert the new code right after it.
        FirstMI = &*++BB->begin();
      } else {
        // Hoist the COPY in front of the current first instruction. The loop
        // is left immediately afterwards, so I is not advanced again.
        I->removeFromParent();
        BB->insert(FirstMI, &*I);
      }
      Found = true;
      break;
    }
    assert(Found);
    (void)Found;

    // This should be before all vector instructions.
    // The S_BFE_U32 immediate combines the low 7 bits of operand 1 with the
    // constant 0x70000 (the "{shift, 7}" of the sequence sketched above).
    BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg)
        .addReg(InputReg)
        .addImm((MI.getOperand(1).getImm() & 0x7f) | 0x70000);
    BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFM_B64),
            AMDGPU::EXEC)
        .addReg(CountReg)
        .addImm(0);
    BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMP_EQ_U32))
        .addReg(CountReg, RegState::Kill)
        .addImm(64);
    BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMOV_B64),
            AMDGPU::EXEC)
        .addImm(-1);
    MI.eraseFromParent();
    return BB;
  }

  case AMDGPU::GET_GROUPSTATICSIZE: {
    // Materialize the LDS size recorded in the function info as an immediate
    // move into the pseudo's operand 0.
    DebugLoc DL = MI.getDebugLoc();
    BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
        .add(MI.getOperand(0))
        .addImm(MFI->getLDSSize());
    MI.eraseFromParent();
    return BB;
  }
  case AMDGPU::SI_INDIRECT_SRC_V1:
  case AMDGPU::SI_INDIRECT_SRC_V2:
  case AMDGPU::SI_INDIRECT_SRC_V4:
  case AMDGPU::SI_INDIRECT_SRC_V8:
  case AMDGPU::SI_INDIRECT_SRC_V16:
    return emitIndirectSrc(MI, *BB, *getSubtarget());
  case AMDGPU::SI_INDIRECT_DST_V1:
  case AMDGPU::SI_INDIRECT_DST_V2:
  case AMDGPU::SI_INDIRECT_DST_V4:
  case AMDGPU::SI_INDIRECT_DST_V8:
  case AMDGPU::SI_INDIRECT_DST_V16:
    return emitIndirectDst(MI, *BB, *getSubtarget());
  case AMDGPU::SI_KILL:
    return splitKillBlock(MI, BB);
  case AMDGPU::V_CNDMASK_B64_PSEUDO: {
    // Split the 64-bit select into two 32-bit V_CNDMASK_B32_e64 on the sub0
    // and sub1 halves and recombine the results with REG_SEQUENCE.
    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();

    unsigned Dst = MI.getOperand(0).getReg();
    unsigned Src0 = MI.getOperand(1).getReg();
    unsigned Src1 = MI.getOperand(2).getReg();
    const DebugLoc &DL = MI.getDebugLoc();
    unsigned SrcCond = MI.getOperand(3).getReg();

    unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    unsigned SrcCondCopy = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);

    // Both halves read the condition through a copy in the SReg_64_XEXEC
    // class.
    BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy)
      .addReg(SrcCond);
    BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
      .addReg(Src0, 0, AMDGPU::sub0)
      .addReg(Src1, 0, AMDGPU::sub0)
      .addReg(SrcCondCopy);
    BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
      .addReg(Src0, 0, AMDGPU::sub1)
      .addReg(Src1, 0, AMDGPU::sub1)
      .addReg(SrcCondCopy);

    BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
      .addReg(DstLo)
      .addImm(AMDGPU::sub0)
      .addReg(DstHi)
      .addImm(AMDGPU::sub1);
    MI.eraseFromParent();
    return BB;
  }
  case AMDGPU::SI_BR_UNDEF: {
    // Replace the pseudo with an S_CBRANCH_SCC1 to the same target whose SCC
    // read is marked undef.
    const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
    const DebugLoc &DL = MI.getDebugLoc();
    MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
                           .add(MI.getOperand(0));
    Br->getOperand(1).setIsUndef(true); // read undef SCC
    MI.eraseFromParent();
    return BB;
  }
  case AMDGPU::ADJCALLSTACKUP:
  case AMDGPU::ADJCALLSTACKDOWN: {
    // The pseudo is kept; it only gains an implicit def and an implicit use
    // of the stack pointer offset register.
    const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
    MachineInstrBuilder MIB(*MF, &MI);
    MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
        .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
    return BB;
  }
  case AMDGPU::SI_CALL_ISEL:
  case AMDGPU::SI_TCRETURN_ISEL: {
    // Rewrite the selection-time call pseudo into SI_CALL / SI_TCRETURN,
    // adding the callee's GlobalValue as an extra operand.
    const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
    const DebugLoc &DL = MI.getDebugLoc();
    unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);

    // The callee address is expected to be defined by an
    // SI_PC_ADD_REL_OFFSET, from which the global is recovered.
    MachineRegisterInfo &MRI = MF->getRegInfo();
    unsigned GlobalAddrReg = MI.getOperand(0).getReg();
    MachineInstr *PCRel = MRI.getVRegDef(GlobalAddrReg);
    assert(PCRel->getOpcode() == AMDGPU::SI_PC_ADD_REL_OFFSET);

    const GlobalValue *G = PCRel->getOperand(1).getGlobal();

    MachineInstrBuilder MIB;
    if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
      // A real call defines the return address register.
      MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg)
        .add(MI.getOperand(0))
        .addGlobalAddress(G);
    } else {
      MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_TCRETURN))
        .add(MI.getOperand(0))
        .addGlobalAddress(G);

      // There is an additional imm operand for tcreturn, but it should be in the
      // right place already.
    }

    // Carry over the remaining operands of the original pseudo.
    for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
      MIB.add(MI.getOperand(I));

    MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
    MI.eraseFromParent();
    return BB;
  }
  default:
    return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
  }
}
3092 
// Report whether aggressive FMA fusion should be performed for \p VT. The
// answer is unconditionally true; \p VT is not inspected.
bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
  // This currently forces unfolding various combinations of fsub into fma with
  // free fneg'd operands. As long as we have fast FMA (controlled by
  // isFMAFasterThanFMulAndFAdd), we should perform these.

  // When fma is quarter rate, for f64 where add / sub are at best half rate,
  // most of these combines appear to be cycle neutral but save on instruction
  // count / code size.
  return true;
}
3103 
3104 EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
3105                                          EVT VT) const {
3106   if (!VT.isVector()) {
3107     return MVT::i1;
3108   }
3109   return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
3110 }
3111 
3112 MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
3113   // TODO: Should i16 be used always if legal? For now it would force VALU
3114   // shifts.
3115   return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
3116 }
3117 
3118 // Answering this is somewhat tricky and depends on the specific device which
3119 // have different rates for fma or all f64 operations.
3120 //
3121 // v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
3122 // regardless of which device (although the number of cycles differs between
3123 // devices), so it is always profitable for f64.
3124 //
3125 // v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
3126 // only on full rate devices. Normally, we should prefer selecting v_mad_f32
3127 // which we can always do even without fused FP ops since it returns the same
3128 // result as the separate operations and since it is always full
3129 // rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
3130 // however does not support denormals, so we do report fma as faster if we have
3131 // a fast fma device and require denormals.
3132 //
3133 bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
3134   VT = VT.getScalarType();
3135 
3136   switch (VT.getSimpleVT().SimpleTy) {
3137   case MVT::f32:
3138     // This is as fast on some subtargets. However, we always have full rate f32
3139     // mad available which returns the same result as the separate operations
3140     // which we should prefer over fma. We can't use this if we want to support
3141     // denormals, so only report this in these cases.
3142     return Subtarget->hasFP32Denormals() && Subtarget->hasFastFMAF32();
3143   case MVT::f64:
3144     return true;
3145   case MVT::f16:
3146     return Subtarget->has16BitInsts() && Subtarget->hasFP16Denormals();
3147   default:
3148     break;
3149   }
3150 
3151   return false;
3152 }
3153 
3154 //===----------------------------------------------------------------------===//
3155 // Custom DAG Lowering Operations
3156 //===----------------------------------------------------------------------===//
3157 
3158 SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
3159   switch (Op.getOpcode()) {
3160   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
3161   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
3162   case ISD::LOAD: {
3163     SDValue Result = LowerLOAD(Op, DAG);
3164     assert((!Result.getNode() ||
3165             Result.getNode()->getNumValues() == 2) &&
3166            "Load should return a value and a chain");
3167     return Result;
3168   }
3169 
3170   case ISD::FSIN:
3171   case ISD::FCOS:
3172     return LowerTrig(Op, DAG);
3173   case ISD::SELECT: return LowerSELECT(Op, DAG);
3174   case ISD::FDIV: return LowerFDIV(Op, DAG);
3175   case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
3176   case ISD::STORE: return LowerSTORE(Op, DAG);
3177   case ISD::GlobalAddress: {
3178     MachineFunction &MF = DAG.getMachineFunction();
3179     SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
3180     return LowerGlobalAddress(MFI, Op, DAG);
3181   }
3182   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
3183   case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
3184   case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
3185   case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
3186   case ISD::INSERT_VECTOR_ELT:
3187     return lowerINSERT_VECTOR_ELT(Op, DAG);
3188   case ISD::EXTRACT_VECTOR_ELT:
3189     return lowerEXTRACT_VECTOR_ELT(Op, DAG);
3190   case ISD::FP_ROUND:
3191     return lowerFP_ROUND(Op, DAG);
3192 
3193   case ISD::TRAP:
3194   case ISD::DEBUGTRAP:
3195     return lowerTRAP(Op, DAG);
3196   }
3197   return SDValue();
3198 }
3199 
3200 void SITargetLowering::ReplaceNodeResults(SDNode *N,
3201                                           SmallVectorImpl<SDValue> &Results,
3202                                           SelectionDAG &DAG) const {
3203   switch (N->getOpcode()) {
3204   case ISD::INSERT_VECTOR_ELT: {
3205     if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
3206       Results.push_back(Res);
3207     return;
3208   }
3209   case ISD::EXTRACT_VECTOR_ELT: {
3210     if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
3211       Results.push_back(Res);
3212     return;
3213   }
3214   case ISD::INTRINSIC_WO_CHAIN: {
3215     unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
3216     if (IID == Intrinsic::amdgcn_cvt_pkrtz) {
3217       SDValue Src0 = N->getOperand(1);
3218       SDValue Src1 = N->getOperand(2);
3219       SDLoc SL(N);
3220       SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32,
3221                                 Src0, Src1);
3222       Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
3223       return;
3224     }
3225     break;
3226   }
3227   case ISD::SELECT: {
3228     SDLoc SL(N);
3229     EVT VT = N->getValueType(0);
3230     EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3231     SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
3232     SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
3233 
3234     EVT SelectVT = NewVT;
3235     if (NewVT.bitsLT(MVT::i32)) {
3236       LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
3237       RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
3238       SelectVT = MVT::i32;
3239     }
3240 
3241     SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, SelectVT,
3242                                     N->getOperand(0), LHS, RHS);
3243 
3244     if (NewVT != SelectVT)
3245       NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
3246     Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
3247     return;
3248   }
3249   default:
3250     break;
3251   }
3252 }
3253 
3254 /// \brief Helper function for LowerBRCOND
3255 static SDNode *findUser(SDValue Value, unsigned Opcode) {
3256 
3257   SDNode *Parent = Value.getNode();
3258   for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
3259        I != E; ++I) {
3260 
3261     if (I.getUse().get() != Value)
3262       continue;
3263 
3264     if (I->getOpcode() == Opcode)
3265       return *I;
3266   }
3267   return nullptr;
3268 }
3269 
3270 unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
3271   if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
3272     switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
3273     case Intrinsic::amdgcn_if:
3274       return AMDGPUISD::IF;
3275     case Intrinsic::amdgcn_else:
3276       return AMDGPUISD::ELSE;
3277     case Intrinsic::amdgcn_loop:
3278       return AMDGPUISD::LOOP;
3279     case Intrinsic::amdgcn_end_cf:
3280       llvm_unreachable("should not occur");
3281     default:
3282       return 0;
3283     }
3284   }
3285 
3286   // break, if_break, else_break are all only used as inputs to loop, not
3287   // directly as branch conditions.
3288   return 0;
3289 }
3290 
3291 void SITargetLowering::createDebuggerPrologueStackObjects(
3292     MachineFunction &MF) const {
3293   // Create stack objects that are used for emitting debugger prologue.
3294   //
3295   // Debugger prologue writes work group IDs and work item IDs to scratch memory
3296   // at fixed location in the following format:
3297   //   offset 0:  work group ID x
3298   //   offset 4:  work group ID y
3299   //   offset 8:  work group ID z
3300   //   offset 16: work item ID x
3301   //   offset 20: work item ID y
3302   //   offset 24: work item ID z
3303   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3304   int ObjectIdx = 0;
3305 
3306   // For each dimension:
3307   for (unsigned i = 0; i < 3; ++i) {
3308     // Create fixed stack object for work group ID.
3309     ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4, true);
3310     Info->setDebuggerWorkGroupIDStackObjectIndex(i, ObjectIdx);
3311     // Create fixed stack object for work item ID.
3312     ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4 + 16, true);
3313     Info->setDebuggerWorkItemIDStackObjectIndex(i, ObjectIdx);
3314   }
3315 }
3316 
3317 bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
3318   const Triple &TT = getTargetMachine().getTargetTriple();
3319   return GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS &&
3320          AMDGPU::shouldEmitConstantsToTextSection(TT);
3321 }
3322 
3323 bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
3324   return (GV->getType()->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS ||
3325               GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS) &&
3326          !shouldEmitFixup(GV) &&
3327          !getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
3328 }
3329 
3330 bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
3331   return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
3332 }
3333 
3334 /// This transforms the control flow intrinsics to get the branch destination as
3335 /// last parameter, also switches branch target with BR if the need arise
3336 SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
3337                                       SelectionDAG &DAG) const {
3338   SDLoc DL(BRCOND);
3339 
3340   SDNode *Intr = BRCOND.getOperand(1).getNode();
3341   SDValue Target = BRCOND.getOperand(2);
3342   SDNode *BR = nullptr;
3343   SDNode *SetCC = nullptr;
3344 
3345   if (Intr->getOpcode() == ISD::SETCC) {
3346     // As long as we negate the condition everything is fine
3347     SetCC = Intr;
3348     Intr = SetCC->getOperand(0).getNode();
3349 
3350   } else {
3351     // Get the target from BR if we don't negate the condition
3352     BR = findUser(BRCOND, ISD::BR);
3353     Target = BR->getOperand(1);
3354   }
3355 
3356   // FIXME: This changes the types of the intrinsics instead of introducing new
3357   // nodes with the correct types.
3358   // e.g. llvm.amdgcn.loop
3359 
3360   // eg: i1,ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3
3361   // =>     t9: ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3, BasicBlock:ch<bb1 0x7fee5286d088>
3362 
3363   unsigned CFNode = isCFIntrinsic(Intr);
3364   if (CFNode == 0) {
3365     // This is a uniform branch so we don't need to legalize.
3366     return BRCOND;
3367   }
3368 
3369   bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
3370                    Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
3371 
3372   assert(!SetCC ||
3373         (SetCC->getConstantOperandVal(1) == 1 &&
3374          cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
3375                                                              ISD::SETNE));
3376 
3377   // operands of the new intrinsic call
3378   SmallVector<SDValue, 4> Ops;
3379   if (HaveChain)
3380     Ops.push_back(BRCOND.getOperand(0));
3381 
3382   Ops.append(Intr->op_begin() + (HaveChain ?  2 : 1), Intr->op_end());
3383   Ops.push_back(Target);
3384 
3385   ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
3386 
3387   // build the new intrinsic call
3388   SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
3389 
3390   if (!HaveChain) {
3391     SDValue Ops[] =  {
3392       SDValue(Result, 0),
3393       BRCOND.getOperand(0)
3394     };
3395 
3396     Result = DAG.getMergeValues(Ops, DL).getNode();
3397   }
3398 
3399   if (BR) {
3400     // Give the branch instruction our target
3401     SDValue Ops[] = {
3402       BR->getOperand(0),
3403       BRCOND.getOperand(2)
3404     };
3405     SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
3406     DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
3407     BR = NewBR.getNode();
3408   }
3409 
3410   SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
3411 
3412   // Copy the intrinsic results to registers
3413   for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
3414     SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
3415     if (!CopyToReg)
3416       continue;
3417 
3418     Chain = DAG.getCopyToReg(
3419       Chain, DL,
3420       CopyToReg->getOperand(1),
3421       SDValue(Result, i - 1),
3422       SDValue());
3423 
3424     DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
3425   }
3426 
3427   // Remove the old intrinsic from the chain
3428   DAG.ReplaceAllUsesOfValueWith(
3429     SDValue(Intr, Intr->getNumValues() - 1),
3430     Intr->getOperand(0));
3431 
3432   return Chain;
3433 }
3434 
3435 SDValue SITargetLowering::getFPExtOrFPTrunc(SelectionDAG &DAG,
3436                                             SDValue Op,
3437                                             const SDLoc &DL,
3438                                             EVT VT) const {
3439   return Op.getValueType().bitsLE(VT) ?
3440       DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) :
3441       DAG.getNode(ISD::FTRUNC, DL, VT, Op);
3442 }
3443 
3444 SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
3445   assert(Op.getValueType() == MVT::f16 &&
3446          "Do not know how to custom lower FP_ROUND for non-f16 type");
3447 
3448   SDValue Src = Op.getOperand(0);
3449   EVT SrcVT = Src.getValueType();
3450   if (SrcVT != MVT::f64)
3451     return Op;
3452 
3453   SDLoc DL(Op);
3454 
3455   SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
3456   SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
3457   return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
3458 }
3459 
3460 SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
3461   SDLoc SL(Op);
3462   MachineFunction &MF = DAG.getMachineFunction();
3463   SDValue Chain = Op.getOperand(0);
3464 
3465   unsigned TrapID = Op.getOpcode() == ISD::DEBUGTRAP ?
3466     SISubtarget::TrapIDLLVMDebugTrap : SISubtarget::TrapIDLLVMTrap;
3467 
3468   if (Subtarget->getTrapHandlerAbi() == SISubtarget::TrapHandlerAbiHsa &&
3469       Subtarget->isTrapHandlerEnabled()) {
3470     SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3471     unsigned UserSGPR = Info->getQueuePtrUserSGPR();
3472     assert(UserSGPR != AMDGPU::NoRegister);
3473 
3474     SDValue QueuePtr = CreateLiveInRegister(
3475       DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
3476 
3477     SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
3478 
3479     SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01,
3480                                      QueuePtr, SDValue());
3481 
3482     SDValue Ops[] = {
3483       ToReg,
3484       DAG.getTargetConstant(TrapID, SL, MVT::i16),
3485       SGPR01,
3486       ToReg.getValue(1)
3487     };
3488 
3489     return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
3490   }
3491 
3492   switch (TrapID) {
3493   case SISubtarget::TrapIDLLVMTrap:
3494     return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain);
3495   case SISubtarget::TrapIDLLVMDebugTrap: {
3496     DiagnosticInfoUnsupported NoTrap(*MF.getFunction(),
3497                                      "debugtrap handler not supported",
3498                                      Op.getDebugLoc(),
3499                                      DS_Warning);
3500     LLVMContext &Ctx = MF.getFunction()->getContext();
3501     Ctx.diagnose(NoTrap);
3502     return Chain;
3503   }
3504   default:
3505     llvm_unreachable("unsupported trap handler type!");
3506   }
3507 
3508   return Chain;
3509 }
3510 
3511 SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
3512                                              SelectionDAG &DAG) const {
3513   // FIXME: Use inline constants (src_{shared, private}_base) instead.
3514   if (Subtarget->hasApertureRegs()) {
3515     unsigned Offset = AS == AMDGPUASI.LOCAL_ADDRESS ?
3516         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
3517         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
3518     unsigned WidthM1 = AS == AMDGPUASI.LOCAL_ADDRESS ?
3519         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
3520         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
3521     unsigned Encoding =
3522         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
3523         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
3524         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
3525 
3526     SDValue EncodingImm = DAG.getTargetConstant(Encoding, DL, MVT::i16);
3527     SDValue ApertureReg = SDValue(
3528         DAG.getMachineNode(AMDGPU::S_GETREG_B32, DL, MVT::i32, EncodingImm), 0);
3529     SDValue ShiftAmount = DAG.getTargetConstant(WidthM1 + 1, DL, MVT::i32);
3530     return DAG.getNode(ISD::SHL, DL, MVT::i32, ApertureReg, ShiftAmount);
3531   }
3532 
3533   MachineFunction &MF = DAG.getMachineFunction();
3534   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3535   unsigned UserSGPR = Info->getQueuePtrUserSGPR();
3536   assert(UserSGPR != AMDGPU::NoRegister);
3537 
3538   SDValue QueuePtr = CreateLiveInRegister(
3539     DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
3540 
3541   // Offset into amd_queue_t for group_segment_aperture_base_hi /
3542   // private_segment_aperture_base_hi.
3543   uint32_t StructOffset = (AS == AMDGPUASI.LOCAL_ADDRESS) ? 0x40 : 0x44;
3544 
3545   SDValue Ptr = DAG.getNode(ISD::ADD, DL, MVT::i64, QueuePtr,
3546                             DAG.getConstant(StructOffset, DL, MVT::i64));
3547 
3548   // TODO: Use custom target PseudoSourceValue.
3549   // TODO: We should use the value from the IR intrinsic call, but it might not
3550   // be available and how do we get it?
3551   Value *V = UndefValue::get(PointerType::get(Type::getInt8Ty(*DAG.getContext()),
3552                                               AMDGPUASI.CONSTANT_ADDRESS));
3553 
3554   MachinePointerInfo PtrInfo(V, StructOffset);
3555   return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
3556                      MinAlign(64, StructOffset),
3557                      MachineMemOperand::MODereferenceable |
3558                          MachineMemOperand::MOInvariant);
3559 }
3560 
3561 SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
3562                                              SelectionDAG &DAG) const {
3563   SDLoc SL(Op);
3564   const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op);
3565 
3566   SDValue Src = ASC->getOperand(0);
3567   SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
3568 
3569   const AMDGPUTargetMachine &TM =
3570     static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
3571 
3572   // flat -> local/private
3573   if (ASC->getSrcAddressSpace() == AMDGPUASI.FLAT_ADDRESS) {
3574     unsigned DestAS = ASC->getDestAddressSpace();
3575 
3576     if (DestAS == AMDGPUASI.LOCAL_ADDRESS ||
3577         DestAS == AMDGPUASI.PRIVATE_ADDRESS) {
3578       unsigned NullVal = TM.getNullPointerValue(DestAS);
3579       SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
3580       SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
3581       SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
3582 
3583       return DAG.getNode(ISD::SELECT, SL, MVT::i32,
3584                          NonNull, Ptr, SegmentNullPtr);
3585     }
3586   }
3587 
3588   // local/private -> flat
3589   if (ASC->getDestAddressSpace() == AMDGPUASI.FLAT_ADDRESS) {
3590     unsigned SrcAS = ASC->getSrcAddressSpace();
3591 
3592     if (SrcAS == AMDGPUASI.LOCAL_ADDRESS ||
3593         SrcAS == AMDGPUASI.PRIVATE_ADDRESS) {
3594       unsigned NullVal = TM.getNullPointerValue(SrcAS);
3595       SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
3596 
3597       SDValue NonNull
3598         = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
3599 
3600       SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), SL, DAG);
3601       SDValue CvtPtr
3602         = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
3603 
3604       return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull,
3605                          DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr),
3606                          FlatNullPtr);
3607     }
3608   }
3609 
3610   // global <-> flat are no-ops and never emitted.
3611 
3612   const MachineFunction &MF = DAG.getMachineFunction();
3613   DiagnosticInfoUnsupported InvalidAddrSpaceCast(
3614     *MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
3615   DAG.getContext()->diagnose(InvalidAddrSpaceCast);
3616 
3617   return DAG.getUNDEF(ASC->getValueType(0));
3618 }
3619 
3620 SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
3621                                                  SelectionDAG &DAG) const {
3622   SDValue Idx = Op.getOperand(2);
3623   if (isa<ConstantSDNode>(Idx))
3624     return SDValue();
3625 
3626   // Avoid stack access for dynamic indexing.
3627   SDLoc SL(Op);
3628   SDValue Vec = Op.getOperand(0);
3629   SDValue Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Op.getOperand(1));
3630 
3631   // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
3632   SDValue ExtVal = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Val);
3633 
3634   // Convert vector index to bit-index.
3635   SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx,
3636                                   DAG.getConstant(16, SL, MVT::i32));
3637 
3638   SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
3639 
3640   SDValue BFM = DAG.getNode(ISD::SHL, SL, MVT::i32,
3641                             DAG.getConstant(0xffff, SL, MVT::i32),
3642                             ScaledIdx);
3643 
3644   SDValue LHS = DAG.getNode(ISD::AND, SL, MVT::i32, BFM, ExtVal);
3645   SDValue RHS = DAG.getNode(ISD::AND, SL, MVT::i32,
3646                             DAG.getNOT(SL, BFM, MVT::i32), BCVec);
3647 
3648   SDValue BFI = DAG.getNode(ISD::OR, SL, MVT::i32, LHS, RHS);
3649   return DAG.getNode(ISD::BITCAST, SL, Op.getValueType(), BFI);
3650 }
3651 
3652 SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
3653                                                   SelectionDAG &DAG) const {
3654   SDLoc SL(Op);
3655 
3656   EVT ResultVT = Op.getValueType();
3657   SDValue Vec = Op.getOperand(0);
3658   SDValue Idx = Op.getOperand(1);
3659 
3660   DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
3661 
3662   // Make sure we we do any optimizations that will make it easier to fold
3663   // source modifiers before obscuring it with bit operations.
3664 
3665   // XXX - Why doesn't this get called when vector_shuffle is expanded?
3666   if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
3667     return Combined;
3668 
3669   if (const ConstantSDNode *CIdx = dyn_cast<ConstantSDNode>(Idx)) {
3670     SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
3671 
3672     if (CIdx->getZExtValue() == 1) {
3673       Result = DAG.getNode(ISD::SRL, SL, MVT::i32, Result,
3674                            DAG.getConstant(16, SL, MVT::i32));
3675     } else {
3676       assert(CIdx->getZExtValue() == 0);
3677     }
3678 
3679     if (ResultVT.bitsLT(MVT::i32))
3680       Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Result);
3681     return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
3682   }
3683 
3684   SDValue Sixteen = DAG.getConstant(16, SL, MVT::i32);
3685 
3686   // Convert vector index to bit-index.
3687   SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, Sixteen);
3688 
3689   SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
3690   SDValue Elt = DAG.getNode(ISD::SRL, SL, MVT::i32, BC, ScaledIdx);
3691 
3692   SDValue Result = Elt;
3693   if (ResultVT.bitsLT(MVT::i32))
3694     Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Result);
3695 
3696   return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
3697 }
3698 
3699 bool
3700 SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
3701   // We can fold offsets for anything that doesn't require a GOT relocation.
3702   return (GA->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS ||
3703               GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS) &&
3704          !shouldEmitGOTReloc(GA->getGlobal());
3705 }
3706 
3707 static SDValue
3708 buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
3709                         const SDLoc &DL, unsigned Offset, EVT PtrVT,
3710                         unsigned GAFlags = SIInstrInfo::MO_NONE) {
3711   // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
3712   // lowered to the following code sequence:
3713   //
3714   // For constant address space:
3715   //   s_getpc_b64 s[0:1]
3716   //   s_add_u32 s0, s0, $symbol
3717   //   s_addc_u32 s1, s1, 0
3718   //
3719   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
3720   //   a fixup or relocation is emitted to replace $symbol with a literal
3721   //   constant, which is a pc-relative offset from the encoding of the $symbol
3722   //   operand to the global variable.
3723   //
3724   // For global address space:
3725   //   s_getpc_b64 s[0:1]
3726   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
3727   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
3728   //
3729   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
3730   //   fixups or relocations are emitted to replace $symbol@*@lo and
3731   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
3732   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
3733   //   operand to the global variable.
3734   //
3735   // What we want here is an offset from the value returned by s_getpc
3736   // (which is the address of the s_add_u32 instruction) to the global
3737   // variable, but since the encoding of $symbol starts 4 bytes after the start
3738   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
3739   // small. This requires us to add 4 to the global variable offset in order to
3740   // compute the correct address.
3741   SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
3742                                              GAFlags);
3743   SDValue PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
3744                                              GAFlags == SIInstrInfo::MO_NONE ?
3745                                              GAFlags : GAFlags + 1);
3746   return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
3747 }
3748 
3749 SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
3750                                              SDValue Op,
3751                                              SelectionDAG &DAG) const {
3752   GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
3753   const GlobalValue *GV = GSD->getGlobal();
3754 
3755   if (GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS &&
3756       GSD->getAddressSpace() != AMDGPUASI.GLOBAL_ADDRESS &&
3757       // FIXME: It isn't correct to rely on the type of the pointer. This should
3758       // be removed when address space 0 is 64-bit.
3759       !GV->getType()->getElementType()->isFunctionTy())
3760     return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
3761 
3762   SDLoc DL(GSD);
3763   EVT PtrVT = Op.getValueType();
3764 
3765   if (shouldEmitFixup(GV))
3766     return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
3767   else if (shouldEmitPCReloc(GV))
3768     return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
3769                                    SIInstrInfo::MO_REL32);
3770 
3771   SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
3772                                             SIInstrInfo::MO_GOTPCREL32);
3773 
3774   Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
3775   PointerType *PtrTy = PointerType::get(Ty, AMDGPUASI.CONSTANT_ADDRESS);
3776   const DataLayout &DataLayout = DAG.getDataLayout();
3777   unsigned Align = DataLayout.getABITypeAlignment(PtrTy);
3778   // FIXME: Use a PseudoSourceValue once those can be assigned an address space.
3779   MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
3780 
3781   return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Align,
3782                      MachineMemOperand::MODereferenceable |
3783                          MachineMemOperand::MOInvariant);
3784 }
3785 
3786 SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
3787                                    const SDLoc &DL, SDValue V) const {
3788   // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
3789   // the destination register.
3790   //
3791   // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
3792   // so we will end up with redundant moves to m0.
3793   //
3794   // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
3795 
3796   // A Null SDValue creates a glue result.
3797   SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
3798                                   V, Chain);
3799   return SDValue(M0, 0);
3800 }
3801 
3802 SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
3803                                                  SDValue Op,
3804                                                  MVT VT,
3805                                                  unsigned Offset) const {
3806   SDLoc SL(Op);
3807   SDValue Param = lowerKernargMemParameter(DAG, MVT::i32, MVT::i32, SL,
3808                                            DAG.getEntryNode(), Offset, false);
3809   // The local size values will have the hi 16-bits as zero.
3810   return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
3811                      DAG.getValueType(VT));
3812 }
3813 
3814 static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
3815                                         EVT VT) {
3816   DiagnosticInfoUnsupported BadIntrin(*DAG.getMachineFunction().getFunction(),
3817                                       "non-hsa intrinsic with hsa target",
3818                                       DL.getDebugLoc());
3819   DAG.getContext()->diagnose(BadIntrin);
3820   return DAG.getUNDEF(VT);
3821 }
3822 
3823 static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
3824                                          EVT VT) {
3825   DiagnosticInfoUnsupported BadIntrin(*DAG.getMachineFunction().getFunction(),
3826                                       "intrinsic not supported on subtarget",
3827                                       DL.getDebugLoc());
3828   DAG.getContext()->diagnose(BadIntrin);
3829   return DAG.getUNDEF(VT);
3830 }
3831 
3832 SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
3833                                                   SelectionDAG &DAG) const {
3834   MachineFunction &MF = DAG.getMachineFunction();
3835   auto MFI = MF.getInfo<SIMachineFunctionInfo>();
3836 
3837   EVT VT = Op.getValueType();
3838   SDLoc DL(Op);
3839   unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
3840 
3841   // TODO: Should this propagate fast-math-flags?
3842 
3843   switch (IntrinsicID) {
3844   case Intrinsic::amdgcn_implicit_buffer_ptr: {
3845     if (getSubtarget()->isAmdCodeObjectV2(MF))
3846       return emitNonHSAIntrinsicError(DAG, DL, VT);
3847     return getPreloadedValue(DAG, *MFI, VT,
3848                              AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
3849   }
3850   case Intrinsic::amdgcn_dispatch_ptr:
3851   case Intrinsic::amdgcn_queue_ptr: {
3852     if (!Subtarget->isAmdCodeObjectV2(MF)) {
3853       DiagnosticInfoUnsupported BadIntrin(
3854           *MF.getFunction(), "unsupported hsa intrinsic without hsa target",
3855           DL.getDebugLoc());
3856       DAG.getContext()->diagnose(BadIntrin);
3857       return DAG.getUNDEF(VT);
3858     }
3859 
3860     auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
3861       AMDGPUFunctionArgInfo::DISPATCH_PTR : AMDGPUFunctionArgInfo::QUEUE_PTR;
3862     return getPreloadedValue(DAG, *MFI, VT, RegID);
3863   }
3864   case Intrinsic::amdgcn_implicitarg_ptr: {
3865     if (MFI->isEntryFunction())
3866       return getImplicitArgPtr(DAG, DL);
3867     return getPreloadedValue(DAG, *MFI, VT,
3868                              AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
3869   }
3870   case Intrinsic::amdgcn_kernarg_segment_ptr: {
3871     return getPreloadedValue(DAG, *MFI, VT,
3872                              AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
3873   }
3874   case Intrinsic::amdgcn_dispatch_id: {
3875     return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
3876   }
3877   case Intrinsic::amdgcn_rcp:
3878     return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
3879   case Intrinsic::amdgcn_rsq:
3880     return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
3881   case Intrinsic::amdgcn_rsq_legacy:
3882     if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
3883       return emitRemovedIntrinsicError(DAG, DL, VT);
3884 
3885     return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
3886   case Intrinsic::amdgcn_rcp_legacy:
3887     if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
3888       return emitRemovedIntrinsicError(DAG, DL, VT);
3889     return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
3890   case Intrinsic::amdgcn_rsq_clamp: {
3891     if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
3892       return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
3893 
3894     Type *Type = VT.getTypeForEVT(*DAG.getContext());
3895     APFloat Max = APFloat::getLargest(Type->getFltSemantics());
3896     APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
3897 
3898     SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
3899     SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq,
3900                               DAG.getConstantFP(Max, DL, VT));
3901     return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
3902                        DAG.getConstantFP(Min, DL, VT));
3903   }
3904   case Intrinsic::r600_read_ngroups_x:
3905     if (Subtarget->isAmdHsaOS())
3906       return emitNonHSAIntrinsicError(DAG, DL, VT);
3907 
3908     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
3909                                     SI::KernelInputOffsets::NGROUPS_X, false);
3910   case Intrinsic::r600_read_ngroups_y:
3911     if (Subtarget->isAmdHsaOS())
3912       return emitNonHSAIntrinsicError(DAG, DL, VT);
3913 
3914     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
3915                                     SI::KernelInputOffsets::NGROUPS_Y, false);
3916   case Intrinsic::r600_read_ngroups_z:
3917     if (Subtarget->isAmdHsaOS())
3918       return emitNonHSAIntrinsicError(DAG, DL, VT);
3919 
3920     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
3921                                     SI::KernelInputOffsets::NGROUPS_Z, false);
3922   case Intrinsic::r600_read_global_size_x:
3923     if (Subtarget->isAmdHsaOS())
3924       return emitNonHSAIntrinsicError(DAG, DL, VT);
3925 
3926     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
3927                                     SI::KernelInputOffsets::GLOBAL_SIZE_X, false);
3928   case Intrinsic::r600_read_global_size_y:
3929     if (Subtarget->isAmdHsaOS())
3930       return emitNonHSAIntrinsicError(DAG, DL, VT);
3931 
3932     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
3933                                     SI::KernelInputOffsets::GLOBAL_SIZE_Y, false);
3934   case Intrinsic::r600_read_global_size_z:
3935     if (Subtarget->isAmdHsaOS())
3936       return emitNonHSAIntrinsicError(DAG, DL, VT);
3937 
3938     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
3939                                     SI::KernelInputOffsets::GLOBAL_SIZE_Z, false);
3940   case Intrinsic::r600_read_local_size_x:
3941     if (Subtarget->isAmdHsaOS())
3942       return emitNonHSAIntrinsicError(DAG, DL, VT);
3943 
3944     return lowerImplicitZextParam(DAG, Op, MVT::i16,
3945                                   SI::KernelInputOffsets::LOCAL_SIZE_X);
3946   case Intrinsic::r600_read_local_size_y:
3947     if (Subtarget->isAmdHsaOS())
3948       return emitNonHSAIntrinsicError(DAG, DL, VT);
3949 
3950     return lowerImplicitZextParam(DAG, Op, MVT::i16,
3951                                   SI::KernelInputOffsets::LOCAL_SIZE_Y);
3952   case Intrinsic::r600_read_local_size_z:
3953     if (Subtarget->isAmdHsaOS())
3954       return emitNonHSAIntrinsicError(DAG, DL, VT);
3955 
3956     return lowerImplicitZextParam(DAG, Op, MVT::i16,
3957                                   SI::KernelInputOffsets::LOCAL_SIZE_Z);
3958   case Intrinsic::amdgcn_workgroup_id_x:
3959   case Intrinsic::r600_read_tgid_x:
3960     return getPreloadedValue(DAG, *MFI, VT,
3961                              AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
3962   case Intrinsic::amdgcn_workgroup_id_y:
3963   case Intrinsic::r600_read_tgid_y:
3964     return getPreloadedValue(DAG, *MFI, VT,
3965                              AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
3966   case Intrinsic::amdgcn_workgroup_id_z:
3967   case Intrinsic::r600_read_tgid_z:
3968     return getPreloadedValue(DAG, *MFI, VT,
3969                              AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
3970   case Intrinsic::amdgcn_workitem_id_x: {
3971   case Intrinsic::r600_read_tidig_x:
3972     return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
3973                           SDLoc(DAG.getEntryNode()),
3974                           MFI->getArgInfo().WorkItemIDX);
3975   }
3976   case Intrinsic::amdgcn_workitem_id_y:
3977   case Intrinsic::r600_read_tidig_y:
3978     return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
3979                           SDLoc(DAG.getEntryNode()),
3980                           MFI->getArgInfo().WorkItemIDY);
3981   case Intrinsic::amdgcn_workitem_id_z:
3982   case Intrinsic::r600_read_tidig_z:
3983     return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
3984                           SDLoc(DAG.getEntryNode()),
3985                           MFI->getArgInfo().WorkItemIDZ);
3986   case AMDGPUIntrinsic::SI_load_const: {
3987     SDValue Ops[] = {
3988       Op.getOperand(1),
3989       Op.getOperand(2)
3990     };
3991 
3992     MachineMemOperand *MMO = MF.getMachineMemOperand(
3993         MachinePointerInfo(),
3994         MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
3995             MachineMemOperand::MOInvariant,
3996         VT.getStoreSize(), 4);
3997     return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
3998                                    Op->getVTList(), Ops, VT, MMO);
3999   }
4000   case Intrinsic::amdgcn_fdiv_fast:
4001     return lowerFDIV_FAST(Op, DAG);
4002   case Intrinsic::amdgcn_interp_mov: {
4003     SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
4004     SDValue Glue = M0.getValue(1);
4005     return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, Op.getOperand(1),
4006                        Op.getOperand(2), Op.getOperand(3), Glue);
4007   }
4008   case Intrinsic::amdgcn_interp_p1: {
4009     SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
4010     SDValue Glue = M0.getValue(1);
4011     return DAG.getNode(AMDGPUISD::INTERP_P1, DL, MVT::f32, Op.getOperand(1),
4012                        Op.getOperand(2), Op.getOperand(3), Glue);
4013   }
4014   case Intrinsic::amdgcn_interp_p2: {
4015     SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5));
4016     SDValue Glue = SDValue(M0.getNode(), 1);
4017     return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, Op.getOperand(1),
4018                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
4019                        Glue);
4020   }
4021   case Intrinsic::amdgcn_sin:
4022     return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
4023 
4024   case Intrinsic::amdgcn_cos:
4025     return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
4026 
4027   case Intrinsic::amdgcn_log_clamp: {
4028     if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS)
4029       return SDValue();
4030 
4031     DiagnosticInfoUnsupported BadIntrin(
4032       *MF.getFunction(), "intrinsic not supported on subtarget",
4033       DL.getDebugLoc());
4034       DAG.getContext()->diagnose(BadIntrin);
4035       return DAG.getUNDEF(VT);
4036   }
4037   case Intrinsic::amdgcn_ldexp:
4038     return DAG.getNode(AMDGPUISD::LDEXP, DL, VT,
4039                        Op.getOperand(1), Op.getOperand(2));
4040 
4041   case Intrinsic::amdgcn_fract:
4042     return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
4043 
4044   case Intrinsic::amdgcn_class:
4045     return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
4046                        Op.getOperand(1), Op.getOperand(2));
4047   case Intrinsic::amdgcn_div_fmas:
4048     return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
4049                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
4050                        Op.getOperand(4));
4051 
4052   case Intrinsic::amdgcn_div_fixup:
4053     return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
4054                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4055 
4056   case Intrinsic::amdgcn_trig_preop:
4057     return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT,
4058                        Op.getOperand(1), Op.getOperand(2));
4059   case Intrinsic::amdgcn_div_scale: {
4060     // 3rd parameter required to be a constant.
4061     const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3));
4062     if (!Param)
4063       return DAG.getMergeValues({ DAG.getUNDEF(VT), DAG.getUNDEF(MVT::i1) }, DL);
4064 
4065     // Translate to the operands expected by the machine instruction. The
4066     // first parameter must be the same as the first instruction.
4067     SDValue Numerator = Op.getOperand(1);
4068     SDValue Denominator = Op.getOperand(2);
4069 
4070     // Note this order is opposite of the machine instruction's operations,
4071     // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
4072     // intrinsic has the numerator as the first operand to match a normal
4073     // division operation.
4074 
4075     SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator;
4076 
4077     return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
4078                        Denominator, Numerator);
4079   }
4080   case Intrinsic::amdgcn_icmp: {
4081     const auto *CD = dyn_cast<ConstantSDNode>(Op.getOperand(3));
4082     if (!CD)
4083       return DAG.getUNDEF(VT);
4084 
4085     int CondCode = CD->getSExtValue();
4086     if (CondCode < ICmpInst::Predicate::FIRST_ICMP_PREDICATE ||
4087         CondCode > ICmpInst::Predicate::LAST_ICMP_PREDICATE)
4088       return DAG.getUNDEF(VT);
4089 
4090     ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
4091     ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
4092     return DAG.getNode(AMDGPUISD::SETCC, DL, VT, Op.getOperand(1),
4093                        Op.getOperand(2), DAG.getCondCode(CCOpcode));
4094   }
4095   case Intrinsic::amdgcn_fcmp: {
4096     const auto *CD = dyn_cast<ConstantSDNode>(Op.getOperand(3));
4097     if (!CD)
4098       return DAG.getUNDEF(VT);
4099 
4100     int CondCode = CD->getSExtValue();
4101     if (CondCode < FCmpInst::Predicate::FIRST_FCMP_PREDICATE ||
4102         CondCode > FCmpInst::Predicate::LAST_FCMP_PREDICATE)
4103       return DAG.getUNDEF(VT);
4104 
4105     FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
4106     ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
4107     return DAG.getNode(AMDGPUISD::SETCC, DL, VT, Op.getOperand(1),
4108                        Op.getOperand(2), DAG.getCondCode(CCOpcode));
4109   }
4110   case Intrinsic::amdgcn_fmed3:
4111     return DAG.getNode(AMDGPUISD::FMED3, DL, VT,
4112                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4113   case Intrinsic::amdgcn_fmul_legacy:
4114     return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
4115                        Op.getOperand(1), Op.getOperand(2));
4116   case Intrinsic::amdgcn_sffbh:
4117     return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
4118   case Intrinsic::amdgcn_sbfe:
4119     return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
4120                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4121   case Intrinsic::amdgcn_ubfe:
4122     return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
4123                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4124   case Intrinsic::amdgcn_cvt_pkrtz: {
4125     // FIXME: Stop adding cast if v2f16 legal.
4126     EVT VT = Op.getValueType();
4127     SDValue Node = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, DL, MVT::i32,
4128                                Op.getOperand(1), Op.getOperand(2));
4129     return DAG.getNode(ISD::BITCAST, DL, VT, Node);
4130   }
4131   case Intrinsic::amdgcn_wqm: {
4132     SDValue Src = Op.getOperand(1);
4133     return SDValue(DAG.getMachineNode(AMDGPU::WQM, DL, Src.getValueType(), Src),
4134                    0);
4135   }
4136   case Intrinsic::amdgcn_wwm: {
4137     SDValue Src = Op.getOperand(1);
4138     return SDValue(DAG.getMachineNode(AMDGPU::WWM, DL, Src.getValueType(), Src),
4139                    0);
4140   }
4141   default:
4142     return Op;
4143   }
4144 }
4145 
4146 SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
4147                                                  SelectionDAG &DAG) const {
4148   unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
4149   SDLoc DL(Op);
4150   MachineFunction &MF = DAG.getMachineFunction();
4151 
4152   switch (IntrID) {
4153   case Intrinsic::amdgcn_atomic_inc:
4154   case Intrinsic::amdgcn_atomic_dec: {
4155     MemSDNode *M = cast<MemSDNode>(Op);
4156     unsigned Opc = (IntrID == Intrinsic::amdgcn_atomic_inc) ?
4157       AMDGPUISD::ATOMIC_INC : AMDGPUISD::ATOMIC_DEC;
4158     SDValue Ops[] = {
4159       M->getOperand(0), // Chain
4160       M->getOperand(2), // Ptr
4161       M->getOperand(3)  // Value
4162     };
4163 
4164     return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops,
4165                                    M->getMemoryVT(), M->getMemOperand());
4166   }
4167   case Intrinsic::amdgcn_buffer_load:
4168   case Intrinsic::amdgcn_buffer_load_format: {
4169     SDValue Ops[] = {
4170       Op.getOperand(0), // Chain
4171       Op.getOperand(2), // rsrc
4172       Op.getOperand(3), // vindex
4173       Op.getOperand(4), // offset
4174       Op.getOperand(5), // glc
4175       Op.getOperand(6)  // slc
4176     };
4177     SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
4178 
4179     unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
4180         AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
4181     EVT VT = Op.getValueType();
4182     EVT IntVT = VT.changeTypeToInteger();
4183 
4184     MachineMemOperand *MMO = MF.getMachineMemOperand(
4185       MachinePointerInfo(MFI->getBufferPSV()),
4186       MachineMemOperand::MOLoad,
4187       VT.getStoreSize(), VT.getStoreSize());
4188 
4189     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, MMO);
4190   }
4191   case Intrinsic::amdgcn_tbuffer_load: {
4192     SDValue Ops[] = {
4193       Op.getOperand(0),  // Chain
4194       Op.getOperand(2),  // rsrc
4195       Op.getOperand(3),  // vindex
4196       Op.getOperand(4),  // voffset
4197       Op.getOperand(5),  // soffset
4198       Op.getOperand(6),  // offset
4199       Op.getOperand(7),  // dfmt
4200       Op.getOperand(8),  // nfmt
4201       Op.getOperand(9),  // glc
4202       Op.getOperand(10)   // slc
4203     };
4204 
4205     EVT VT = Op.getOperand(2).getValueType();
4206 
4207     MachineMemOperand *MMO = MF.getMachineMemOperand(
4208       MachinePointerInfo(),
4209       MachineMemOperand::MOLoad,
4210       VT.getStoreSize(), VT.getStoreSize());
4211     return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
4212                                    Op->getVTList(), Ops, VT, MMO);
4213   }
4214   // Basic sample.
4215   case Intrinsic::amdgcn_image_sample:
4216   case Intrinsic::amdgcn_image_sample_cl:
4217   case Intrinsic::amdgcn_image_sample_d:
4218   case Intrinsic::amdgcn_image_sample_d_cl:
4219   case Intrinsic::amdgcn_image_sample_l:
4220   case Intrinsic::amdgcn_image_sample_b:
4221   case Intrinsic::amdgcn_image_sample_b_cl:
4222   case Intrinsic::amdgcn_image_sample_lz:
4223   case Intrinsic::amdgcn_image_sample_cd:
4224   case Intrinsic::amdgcn_image_sample_cd_cl:
4225 
4226   // Sample with comparison.
4227   case Intrinsic::amdgcn_image_sample_c:
4228   case Intrinsic::amdgcn_image_sample_c_cl:
4229   case Intrinsic::amdgcn_image_sample_c_d:
4230   case Intrinsic::amdgcn_image_sample_c_d_cl:
4231   case Intrinsic::amdgcn_image_sample_c_l:
4232   case Intrinsic::amdgcn_image_sample_c_b:
4233   case Intrinsic::amdgcn_image_sample_c_b_cl:
4234   case Intrinsic::amdgcn_image_sample_c_lz:
4235   case Intrinsic::amdgcn_image_sample_c_cd:
4236   case Intrinsic::amdgcn_image_sample_c_cd_cl:
4237 
4238   // Sample with offsets.
4239   case Intrinsic::amdgcn_image_sample_o:
4240   case Intrinsic::amdgcn_image_sample_cl_o:
4241   case Intrinsic::amdgcn_image_sample_d_o:
4242   case Intrinsic::amdgcn_image_sample_d_cl_o:
4243   case Intrinsic::amdgcn_image_sample_l_o:
4244   case Intrinsic::amdgcn_image_sample_b_o:
4245   case Intrinsic::amdgcn_image_sample_b_cl_o:
4246   case Intrinsic::amdgcn_image_sample_lz_o:
4247   case Intrinsic::amdgcn_image_sample_cd_o:
4248   case Intrinsic::amdgcn_image_sample_cd_cl_o:
4249 
4250   // Sample with comparison and offsets.
4251   case Intrinsic::amdgcn_image_sample_c_o:
4252   case Intrinsic::amdgcn_image_sample_c_cl_o:
4253   case Intrinsic::amdgcn_image_sample_c_d_o:
4254   case Intrinsic::amdgcn_image_sample_c_d_cl_o:
4255   case Intrinsic::amdgcn_image_sample_c_l_o:
4256   case Intrinsic::amdgcn_image_sample_c_b_o:
4257   case Intrinsic::amdgcn_image_sample_c_b_cl_o:
4258   case Intrinsic::amdgcn_image_sample_c_lz_o:
4259   case Intrinsic::amdgcn_image_sample_c_cd_o:
4260   case Intrinsic::amdgcn_image_sample_c_cd_cl_o:
4261 
4262   case Intrinsic::amdgcn_image_getlod: {
4263     // Replace dmask with everything disabled with undef.
4264     const ConstantSDNode *DMask = dyn_cast<ConstantSDNode>(Op.getOperand(5));
4265     if (!DMask || DMask->isNullValue()) {
4266       SDValue Undef = DAG.getUNDEF(Op.getValueType());
4267       return DAG.getMergeValues({ Undef, Op.getOperand(0) }, SDLoc(Op));
4268     }
4269 
4270     return SDValue();
4271   }
4272   default:
4273     return SDValue();
4274   }
4275 }
4276 
4277 SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
4278                                               SelectionDAG &DAG) const {
4279   SDLoc DL(Op);
4280   SDValue Chain = Op.getOperand(0);
4281   unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
4282   MachineFunction &MF = DAG.getMachineFunction();
4283 
4284   switch (IntrinsicID) {
4285   case Intrinsic::amdgcn_exp: {
4286     const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2));
4287     const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3));
4288     const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(8));
4289     const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(9));
4290 
4291     const SDValue Ops[] = {
4292       Chain,
4293       DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt
4294       DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8),  // en
4295       Op.getOperand(4), // src0
4296       Op.getOperand(5), // src1
4297       Op.getOperand(6), // src2
4298       Op.getOperand(7), // src3
4299       DAG.getTargetConstant(0, DL, MVT::i1), // compr
4300       DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1)
4301     };
4302 
4303     unsigned Opc = Done->isNullValue() ?
4304       AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
4305     return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
4306   }
4307   case Intrinsic::amdgcn_exp_compr: {
4308     const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2));
4309     const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3));
4310     SDValue Src0 = Op.getOperand(4);
4311     SDValue Src1 = Op.getOperand(5);
4312     const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
4313     const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(7));
4314 
4315     SDValue Undef = DAG.getUNDEF(MVT::f32);
4316     const SDValue Ops[] = {
4317       Chain,
4318       DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt
4319       DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8),  // en
4320       DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0),
4321       DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1),
4322       Undef, // src2
4323       Undef, // src3
4324       DAG.getTargetConstant(1, DL, MVT::i1), // compr
4325       DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1)
4326     };
4327 
4328     unsigned Opc = Done->isNullValue() ?
4329       AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
4330     return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
4331   }
4332   case Intrinsic::amdgcn_s_sendmsg:
4333   case Intrinsic::amdgcn_s_sendmsghalt: {
4334     unsigned NodeOp = (IntrinsicID == Intrinsic::amdgcn_s_sendmsg) ?
4335       AMDGPUISD::SENDMSG : AMDGPUISD::SENDMSGHALT;
4336     Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3));
4337     SDValue Glue = Chain.getValue(1);
4338     return DAG.getNode(NodeOp, DL, MVT::Other, Chain,
4339                        Op.getOperand(2), Glue);
4340   }
4341   case Intrinsic::amdgcn_init_exec: {
4342     return DAG.getNode(AMDGPUISD::INIT_EXEC, DL, MVT::Other, Chain,
4343                        Op.getOperand(2));
4344   }
4345   case Intrinsic::amdgcn_init_exec_from_input: {
4346     return DAG.getNode(AMDGPUISD::INIT_EXEC_FROM_INPUT, DL, MVT::Other, Chain,
4347                        Op.getOperand(2), Op.getOperand(3));
4348   }
4349   case AMDGPUIntrinsic::AMDGPU_kill: {
4350     SDValue Src = Op.getOperand(2);
4351     if (const ConstantFPSDNode *K = dyn_cast<ConstantFPSDNode>(Src)) {
4352       if (!K->isNegative())
4353         return Chain;
4354 
4355       SDValue NegOne = DAG.getTargetConstant(FloatToBits(-1.0f), DL, MVT::i32);
4356       return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, NegOne);
4357     }
4358 
4359     SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Src);
4360     return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, Cast);
4361   }
4362   case Intrinsic::amdgcn_s_barrier: {
4363     if (getTargetMachine().getOptLevel() > CodeGenOpt::None) {
4364       const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
4365       unsigned WGSize = ST.getFlatWorkGroupSizes(*MF.getFunction()).second;
4366       if (WGSize <= ST.getWavefrontSize())
4367         return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
4368                                           Op.getOperand(0)), 0);
4369     }
4370     return SDValue();
4371   };
4372   case AMDGPUIntrinsic::SI_tbuffer_store: {
4373 
4374     // Extract vindex and voffset from vaddr as appropriate
4375     const ConstantSDNode *OffEn = cast<ConstantSDNode>(Op.getOperand(10));
4376     const ConstantSDNode *IdxEn = cast<ConstantSDNode>(Op.getOperand(11));
4377     SDValue VAddr = Op.getOperand(5);
4378 
4379     SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32);
4380 
4381     assert(!(OffEn->isOne() && IdxEn->isOne()) &&
4382            "Legacy intrinsic doesn't support both offset and index - use new version");
4383 
4384     SDValue VIndex = IdxEn->isOne() ? VAddr : Zero;
4385     SDValue VOffset = OffEn->isOne() ? VAddr : Zero;
4386 
4387     // Deal with the vec-3 case
4388     const ConstantSDNode *NumChannels = cast<ConstantSDNode>(Op.getOperand(4));
4389     auto Opcode = NumChannels->getZExtValue() == 3 ?
4390       AMDGPUISD::TBUFFER_STORE_FORMAT_X3 : AMDGPUISD::TBUFFER_STORE_FORMAT;
4391 
4392     SDValue Ops[] = {
4393      Chain,
4394      Op.getOperand(3),  // vdata
4395      Op.getOperand(2),  // rsrc
4396      VIndex,
4397      VOffset,
4398      Op.getOperand(6),  // soffset
4399      Op.getOperand(7),  // inst_offset
4400      Op.getOperand(8),  // dfmt
4401      Op.getOperand(9),  // nfmt
4402      Op.getOperand(12), // glc
4403      Op.getOperand(13), // slc
4404     };
4405 
4406     assert((cast<ConstantSDNode>(Op.getOperand(14)))->getZExtValue() == 0 &&
4407            "Value of tfe other than zero is unsupported");
4408 
4409     EVT VT = Op.getOperand(3).getValueType();
4410     MachineMemOperand *MMO = MF.getMachineMemOperand(
4411       MachinePointerInfo(),
4412       MachineMemOperand::MOStore,
4413       VT.getStoreSize(), 4);
4414     return DAG.getMemIntrinsicNode(Opcode, DL,
4415                                    Op->getVTList(), Ops, VT, MMO);
4416   }
4417 
4418   case Intrinsic::amdgcn_tbuffer_store: {
4419     SDValue Ops[] = {
4420       Chain,
4421       Op.getOperand(2),  // vdata
4422       Op.getOperand(3),  // rsrc
4423       Op.getOperand(4),  // vindex
4424       Op.getOperand(5),  // voffset
4425       Op.getOperand(6),  // soffset
4426       Op.getOperand(7),  // offset
4427       Op.getOperand(8),  // dfmt
4428       Op.getOperand(9),  // nfmt
4429       Op.getOperand(10), // glc
4430       Op.getOperand(11)  // slc
4431     };
4432     EVT VT = Op.getOperand(3).getValueType();
4433     MachineMemOperand *MMO = MF.getMachineMemOperand(
4434       MachinePointerInfo(),
4435       MachineMemOperand::MOStore,
4436       VT.getStoreSize(), 4);
4437     return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL,
4438                                    Op->getVTList(), Ops, VT, MMO);
4439   }
4440 
4441   default:
4442     return Op;
4443   }
4444 }
4445 
4446 SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
4447   SDLoc DL(Op);
4448   LoadSDNode *Load = cast<LoadSDNode>(Op);
4449   ISD::LoadExtType ExtType = Load->getExtensionType();
4450   EVT MemVT = Load->getMemoryVT();
4451 
4452   if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
4453     if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
4454       return SDValue();
4455 
4456     // FIXME: Copied from PPC
4457     // First, load into 32 bits, then truncate to 1 bit.
4458 
4459     SDValue Chain = Load->getChain();
4460     SDValue BasePtr = Load->getBasePtr();
4461     MachineMemOperand *MMO = Load->getMemOperand();
4462 
4463     EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
4464 
4465     SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
4466                                    BasePtr, RealMemVT, MMO);
4467 
4468     SDValue Ops[] = {
4469       DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
4470       NewLD.getValue(1)
4471     };
4472 
4473     return DAG.getMergeValues(Ops, DL);
4474   }
4475 
4476   if (!MemVT.isVector())
4477     return SDValue();
4478 
4479   assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
4480          "Custom lowering for non-i32 vectors hasn't been implemented.");
4481 
4482   unsigned AS = Load->getAddressSpace();
4483   if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
4484                           AS, Load->getAlignment())) {
4485     SDValue Ops[2];
4486     std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
4487     return DAG.getMergeValues(Ops, DL);
4488   }
4489 
4490   MachineFunction &MF = DAG.getMachineFunction();
4491   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
4492   // If there is a possibilty that flat instruction access scratch memory
4493   // then we need to use the same legalization rules we use for private.
4494   if (AS == AMDGPUASI.FLAT_ADDRESS)
4495     AS = MFI->hasFlatScratchInit() ?
4496          AMDGPUASI.PRIVATE_ADDRESS : AMDGPUASI.GLOBAL_ADDRESS;
4497 
4498   unsigned NumElements = MemVT.getVectorNumElements();
4499   if (AS == AMDGPUASI.CONSTANT_ADDRESS) {
4500     if (isMemOpUniform(Load))
4501       return SDValue();
4502     // Non-uniform loads will be selected to MUBUF instructions, so they
4503     // have the same legalization requirements as global and private
4504     // loads.
4505     //
4506   }
4507   if (AS == AMDGPUASI.CONSTANT_ADDRESS || AS == AMDGPUASI.GLOBAL_ADDRESS) {
4508     if (Subtarget->getScalarizeGlobalBehavior() && isMemOpUniform(Load) &&
4509         !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load))
4510       return SDValue();
4511     // Non-uniform loads will be selected to MUBUF instructions, so they
4512     // have the same legalization requirements as global and private
4513     // loads.
4514     //
4515   }
4516   if (AS == AMDGPUASI.CONSTANT_ADDRESS || AS == AMDGPUASI.GLOBAL_ADDRESS ||
4517       AS == AMDGPUASI.FLAT_ADDRESS) {
4518     if (NumElements > 4)
4519       return SplitVectorLoad(Op, DAG);
4520     // v4 loads are supported for private and global memory.
4521     return SDValue();
4522   }
4523   if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
4524     // Depending on the setting of the private_element_size field in the
4525     // resource descriptor, we can only make private accesses up to a certain
4526     // size.
4527     switch (Subtarget->getMaxPrivateElementSize()) {
4528     case 4:
4529       return scalarizeVectorLoad(Load, DAG);
4530     case 8:
4531       if (NumElements > 2)
4532         return SplitVectorLoad(Op, DAG);
4533       return SDValue();
4534     case 16:
4535       // Same as global/flat
4536       if (NumElements > 4)
4537         return SplitVectorLoad(Op, DAG);
4538       return SDValue();
4539     default:
4540       llvm_unreachable("unsupported private_element_size");
4541     }
4542   } else if (AS == AMDGPUASI.LOCAL_ADDRESS) {
4543     if (NumElements > 2)
4544       return SplitVectorLoad(Op, DAG);
4545 
4546     if (NumElements == 2)
4547       return SDValue();
4548 
4549     // If properly aligned, if we split we might be able to use ds_read_b64.
4550     return SplitVectorLoad(Op, DAG);
4551   }
4552   return SDValue();
4553 }
4554 
4555 SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
4556   if (Op.getValueType() != MVT::i64)
4557     return SDValue();
4558 
4559   SDLoc DL(Op);
4560   SDValue Cond = Op.getOperand(0);
4561 
4562   SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
4563   SDValue One = DAG.getConstant(1, DL, MVT::i32);
4564 
4565   SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
4566   SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
4567 
4568   SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
4569   SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
4570 
4571   SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
4572 
4573   SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
4574   SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
4575 
4576   SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
4577 
4578   SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
4579   return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res);
4580 }
4581 
4582 // Catch division cases where we can use shortcuts with rcp and rsq
4583 // instructions.
4584 SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
4585                                               SelectionDAG &DAG) const {
4586   SDLoc SL(Op);
4587   SDValue LHS = Op.getOperand(0);
4588   SDValue RHS = Op.getOperand(1);
4589   EVT VT = Op.getValueType();
4590   const SDNodeFlags Flags = Op->getFlags();
4591   bool Unsafe = DAG.getTarget().Options.UnsafeFPMath ||
4592                 Flags.hasUnsafeAlgebra() || Flags.hasAllowReciprocal();
4593 
4594   if (!Unsafe && VT == MVT::f32 && Subtarget->hasFP32Denormals())
4595     return SDValue();
4596 
4597   if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
4598     if (Unsafe || VT == MVT::f32 || VT == MVT::f16) {
4599       if (CLHS->isExactlyValue(1.0)) {
4600         // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
4601         // the CI documentation has a worst case error of 1 ulp.
4602         // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
4603         // use it as long as we aren't trying to use denormals.
4604         //
4605         // v_rcp_f16 and v_rsq_f16 DO support denormals.
4606 
4607         // 1.0 / sqrt(x) -> rsq(x)
4608 
4609         // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
4610         // error seems really high at 2^29 ULP.
4611         if (RHS.getOpcode() == ISD::FSQRT)
4612           return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0));
4613 
4614         // 1.0 / x -> rcp(x)
4615         return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
4616       }
4617 
4618       // Same as for 1.0, but expand the sign out of the constant.
4619       if (CLHS->isExactlyValue(-1.0)) {
4620         // -1.0 / x -> rcp (fneg x)
4621         SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4622         return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
4623       }
4624     }
4625   }
4626 
4627   if (Unsafe) {
4628     // Turn into multiply by the reciprocal.
4629     // x / y -> x * (1.0 / y)
4630     SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
4631     return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
4632   }
4633 
4634   return SDValue();
4635 }
4636 
4637 static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
4638                           EVT VT, SDValue A, SDValue B, SDValue GlueChain) {
4639   if (GlueChain->getNumValues() <= 1) {
4640     return DAG.getNode(Opcode, SL, VT, A, B);
4641   }
4642 
4643   assert(GlueChain->getNumValues() == 3);
4644 
4645   SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
4646   switch (Opcode) {
4647   default: llvm_unreachable("no chain equivalent for opcode");
4648   case ISD::FMUL:
4649     Opcode = AMDGPUISD::FMUL_W_CHAIN;
4650     break;
4651   }
4652 
4653   return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B,
4654                      GlueChain.getValue(2));
4655 }
4656 
4657 static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
4658                            EVT VT, SDValue A, SDValue B, SDValue C,
4659                            SDValue GlueChain) {
4660   if (GlueChain->getNumValues() <= 1) {
4661     return DAG.getNode(Opcode, SL, VT, A, B, C);
4662   }
4663 
4664   assert(GlueChain->getNumValues() == 3);
4665 
4666   SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
4667   switch (Opcode) {
4668   default: llvm_unreachable("no chain equivalent for opcode");
4669   case ISD::FMA:
4670     Opcode = AMDGPUISD::FMA_W_CHAIN;
4671     break;
4672   }
4673 
4674   return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B, C,
4675                      GlueChain.getValue(2));
4676 }
4677 
4678 SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
4679   if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
4680     return FastLowered;
4681 
4682   SDLoc SL(Op);
4683   SDValue Src0 = Op.getOperand(0);
4684   SDValue Src1 = Op.getOperand(1);
4685 
4686   SDValue CvtSrc0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
4687   SDValue CvtSrc1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
4688 
4689   SDValue RcpSrc1 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, CvtSrc1);
4690   SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1);
4691 
4692   SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32);
4693   SDValue BestQuot = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag);
4694 
4695   return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0);
4696 }
4697 
4698 // Faster 2.5 ULP division that does not support denormals.
4699 SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
4700   SDLoc SL(Op);
4701   SDValue LHS = Op.getOperand(1);
4702   SDValue RHS = Op.getOperand(2);
4703 
4704   SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);
4705 
4706   const APFloat K0Val(BitsToFloat(0x6f800000));
4707   const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
4708 
4709   const APFloat K1Val(BitsToFloat(0x2f800000));
4710   const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
4711 
4712   const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
4713 
4714   EVT SetCCVT =
4715     getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
4716 
4717   SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
4718 
4719   SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
4720 
4721   // TODO: Should this propagate fast-math-flags?
4722   r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
4723 
4724   // rcp does not support denormals.
4725   SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
4726 
4727   SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);
4728 
4729   return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
4730 }
4731 
4732 SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
4733   if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
4734     return FastLowered;
4735 
4736   SDLoc SL(Op);
4737   SDValue LHS = Op.getOperand(0);
4738   SDValue RHS = Op.getOperand(1);
4739 
4740   const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
4741 
4742   SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
4743 
4744   SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
4745                                           RHS, RHS, LHS);
4746   SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
4747                                         LHS, RHS, LHS);
4748 
4749   // Denominator is scaled to not be denormal, so using rcp is ok.
4750   SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32,
4751                                   DenominatorScaled);
4752   SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32,
4753                                      DenominatorScaled);
4754 
4755   const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE |
4756                                (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
4757                                (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
4758 
4759   const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i16);
4760 
4761   if (!Subtarget->hasFP32Denormals()) {
4762     SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
4763     const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
4764                                                       SL, MVT::i32);
4765     SDValue EnableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, BindParamVTs,
4766                                        DAG.getEntryNode(),
4767                                        EnableDenormValue, BitField);
4768     SDValue Ops[3] = {
4769       NegDivScale0,
4770       EnableDenorm.getValue(0),
4771       EnableDenorm.getValue(1)
4772     };
4773 
4774     NegDivScale0 = DAG.getMergeValues(Ops, SL);
4775   }
4776 
4777   SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
4778                              ApproxRcp, One, NegDivScale0);
4779 
4780   SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
4781                              ApproxRcp, Fma0);
4782 
4783   SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled,
4784                            Fma1, Fma1);
4785 
4786   SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
4787                              NumeratorScaled, Mul);
4788 
4789   SDValue Fma3 = getFPTernOp(DAG, ISD::FMA,SL, MVT::f32, Fma2, Fma1, Mul, Fma2);
4790 
4791   SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
4792                              NumeratorScaled, Fma3);
4793 
4794   if (!Subtarget->hasFP32Denormals()) {
4795     const SDValue DisableDenormValue =
4796         DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
4797     SDValue DisableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other,
4798                                         Fma4.getValue(1),
4799                                         DisableDenormValue,
4800                                         BitField,
4801                                         Fma4.getValue(2));
4802 
4803     SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
4804                                       DisableDenorm, DAG.getRoot());
4805     DAG.setRoot(OutputChain);
4806   }
4807 
4808   SDValue Scale = NumeratorScaled.getValue(1);
4809   SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
4810                              Fma4, Fma1, Fma3, Scale);
4811 
4812   return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS);
4813 }
4814 
4815 SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
4816   if (DAG.getTarget().Options.UnsafeFPMath)
4817     return lowerFastUnsafeFDIV(Op, DAG);
4818 
4819   SDLoc SL(Op);
4820   SDValue X = Op.getOperand(0);
4821   SDValue Y = Op.getOperand(1);
4822 
4823   const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
4824 
4825   SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
4826 
4827   SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
4828 
4829   SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
4830 
4831   SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
4832 
4833   SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
4834 
4835   SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
4836 
4837   SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
4838 
4839   SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
4840 
4841   SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
4842   SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
4843 
4844   SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64,
4845                              NegDivScale0, Mul, DivScale1);
4846 
4847   SDValue Scale;
4848 
4849   if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS) {
4850     // Workaround a hardware bug on SI where the condition output from div_scale
4851     // is not usable.
4852 
4853     const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
4854 
4855     // Figure out if the scale to use for div_fmas.
4856     SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
4857     SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
4858     SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
4859     SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
4860 
4861     SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
4862     SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
4863 
4864     SDValue Scale0Hi
4865       = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
4866     SDValue Scale1Hi
4867       = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
4868 
4869     SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
4870     SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
4871     Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
4872   } else {
4873     Scale = DivScale1.getValue(1);
4874   }
4875 
4876   SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64,
4877                              Fma4, Fma3, Mul, Scale);
4878 
4879   return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
4880 }
4881 
4882 SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
4883   EVT VT = Op.getValueType();
4884 
4885   if (VT == MVT::f32)
4886     return LowerFDIV32(Op, DAG);
4887 
4888   if (VT == MVT::f64)
4889     return LowerFDIV64(Op, DAG);
4890 
4891   if (VT == MVT::f16)
4892     return LowerFDIV16(Op, DAG);
4893 
4894   llvm_unreachable("Unexpected type for fdiv");
4895 }
4896 
4897 SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
4898   SDLoc DL(Op);
4899   StoreSDNode *Store = cast<StoreSDNode>(Op);
4900   EVT VT = Store->getMemoryVT();
4901 
4902   if (VT == MVT::i1) {
4903     return DAG.getTruncStore(Store->getChain(), DL,
4904        DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
4905        Store->getBasePtr(), MVT::i1, Store->getMemOperand());
4906   }
4907 
4908   assert(VT.isVector() &&
4909          Store->getValue().getValueType().getScalarType() == MVT::i32);
4910 
4911   unsigned AS = Store->getAddressSpace();
4912   if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
4913                           AS, Store->getAlignment())) {
4914     return expandUnalignedStore(Store, DAG);
4915   }
4916 
4917   MachineFunction &MF = DAG.getMachineFunction();
4918   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
4919   // If there is a possibilty that flat instruction access scratch memory
4920   // then we need to use the same legalization rules we use for private.
4921   if (AS == AMDGPUASI.FLAT_ADDRESS)
4922     AS = MFI->hasFlatScratchInit() ?
4923          AMDGPUASI.PRIVATE_ADDRESS : AMDGPUASI.GLOBAL_ADDRESS;
4924 
4925   unsigned NumElements = VT.getVectorNumElements();
4926   if (AS == AMDGPUASI.GLOBAL_ADDRESS ||
4927       AS == AMDGPUASI.FLAT_ADDRESS) {
4928     if (NumElements > 4)
4929       return SplitVectorStore(Op, DAG);
4930     return SDValue();
4931   } else if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
4932     switch (Subtarget->getMaxPrivateElementSize()) {
4933     case 4:
4934       return scalarizeVectorStore(Store, DAG);
4935     case 8:
4936       if (NumElements > 2)
4937         return SplitVectorStore(Op, DAG);
4938       return SDValue();
4939     case 16:
4940       if (NumElements > 4)
4941         return SplitVectorStore(Op, DAG);
4942       return SDValue();
4943     default:
4944       llvm_unreachable("unsupported private_element_size");
4945     }
4946   } else if (AS == AMDGPUASI.LOCAL_ADDRESS) {
4947     if (NumElements > 2)
4948       return SplitVectorStore(Op, DAG);
4949 
4950     if (NumElements == 2)
4951       return Op;
4952 
4953     // If properly aligned, if we split we might be able to use ds_write_b64.
4954     return SplitVectorStore(Op, DAG);
4955   } else {
4956     llvm_unreachable("unhandled address space");
4957   }
4958 }
4959 
4960 SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
4961   SDLoc DL(Op);
4962   EVT VT = Op.getValueType();
4963   SDValue Arg = Op.getOperand(0);
4964   // TODO: Should this propagate fast-math-flags?
4965   SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, DL, VT,
4966                                   DAG.getNode(ISD::FMUL, DL, VT, Arg,
4967                                               DAG.getConstantFP(0.5/M_PI, DL,
4968                                                                 VT)));
4969 
4970   switch (Op.getOpcode()) {
4971   case ISD::FCOS:
4972     return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, FractPart);
4973   case ISD::FSIN:
4974     return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, FractPart);
4975   default:
4976     llvm_unreachable("Wrong trig opcode");
4977   }
4978 }
4979 
4980 SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
4981   AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
4982   assert(AtomicNode->isCompareAndSwap());
4983   unsigned AS = AtomicNode->getAddressSpace();
4984 
4985   // No custom lowering required for local address space
4986   if (!isFlatGlobalAddrSpace(AS, AMDGPUASI))
4987     return Op;
4988 
4989   // Non-local address space requires custom lowering for atomic compare
4990   // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
4991   SDLoc DL(Op);
4992   SDValue ChainIn = Op.getOperand(0);
4993   SDValue Addr = Op.getOperand(1);
4994   SDValue Old = Op.getOperand(2);
4995   SDValue New = Op.getOperand(3);
4996   EVT VT = Op.getValueType();
4997   MVT SimpleVT = VT.getSimpleVT();
4998   MVT VecType = MVT::getVectorVT(SimpleVT, 2);
4999 
5000   SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
5001   SDValue Ops[] = { ChainIn, Addr, NewOld };
5002 
5003   return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, Op->getVTList(),
5004                                  Ops, VT, AtomicNode->getMemOperand());
5005 }
5006 
5007 //===----------------------------------------------------------------------===//
5008 // Custom DAG optimizations
5009 //===----------------------------------------------------------------------===//
5010 
5011 SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
5012                                                      DAGCombinerInfo &DCI) const {
5013   EVT VT = N->getValueType(0);
5014   EVT ScalarVT = VT.getScalarType();
5015   if (ScalarVT != MVT::f32)
5016     return SDValue();
5017 
5018   SelectionDAG &DAG = DCI.DAG;
5019   SDLoc DL(N);
5020 
5021   SDValue Src = N->getOperand(0);
5022   EVT SrcVT = Src.getValueType();
5023 
5024   // TODO: We could try to match extracting the higher bytes, which would be
5025   // easier if i8 vectors weren't promoted to i32 vectors, particularly after
5026   // types are legalized. v4i8 -> v4f32 is probably the only case to worry
5027   // about in practice.
5028   if (DCI.isAfterLegalizeVectorOps() && SrcVT == MVT::i32) {
5029     if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
5030       SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src);
5031       DCI.AddToWorklist(Cvt.getNode());
5032       return Cvt;
5033     }
5034   }
5035 
5036   return SDValue();
5037 }
5038 
5039 /// \brief Return true if the given offset Size in bytes can be folded into
5040 /// the immediate offsets of a memory instruction for the given address space.
5041 static bool canFoldOffset(unsigned OffsetSize, unsigned AS,
5042                           const SISubtarget &STI) {
5043   auto AMDGPUASI = STI.getAMDGPUAS();
5044   if (AS == AMDGPUASI.GLOBAL_ADDRESS) {
5045     // MUBUF instructions a 12-bit offset in bytes.
5046     return isUInt<12>(OffsetSize);
5047   }
5048   if (AS == AMDGPUASI.CONSTANT_ADDRESS) {
5049     // SMRD instructions have an 8-bit offset in dwords on SI and
5050     // a 20-bit offset in bytes on VI.
5051     if (STI.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
5052       return isUInt<20>(OffsetSize);
5053     else
5054       return (OffsetSize % 4 == 0) && isUInt<8>(OffsetSize / 4);
5055   }
5056   if (AS == AMDGPUASI.LOCAL_ADDRESS ||
5057       AS == AMDGPUASI.REGION_ADDRESS) {
5058     // The single offset versions have a 16-bit offset in bytes.
5059     return isUInt<16>(OffsetSize);
5060   }
5061   // Indirect register addressing does not use any offsets.
5062   return false;
5063 }
5064 
5065 // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
5066 
5067 // This is a variant of
5068 // (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
5069 //
5070 // The normal DAG combiner will do this, but only if the add has one use since
5071 // that would increase the number of instructions.
5072 //
5073 // This prevents us from seeing a constant offset that can be folded into a
5074 // memory instruction's addressing mode. If we know the resulting add offset of
5075 // a pointer can be folded into an addressing offset, we can replace the pointer
5076 // operand with the add of new constant offset. This eliminates one of the uses,
5077 // and may allow the remaining use to also be simplified.
5078 //
5079 SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
5080                                                unsigned AddrSpace,
5081                                                DAGCombinerInfo &DCI) const {
5082   SDValue N0 = N->getOperand(0);
5083   SDValue N1 = N->getOperand(1);
5084 
5085   if (N0.getOpcode() != ISD::ADD)
5086     return SDValue();
5087 
5088   const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
5089   if (!CN1)
5090     return SDValue();
5091 
5092   const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
5093   if (!CAdd)
5094     return SDValue();
5095 
5096   // If the resulting offset is too large, we can't fold it into the addressing
5097   // mode offset.
5098   APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
5099   if (!canFoldOffset(Offset.getZExtValue(), AddrSpace, *getSubtarget()))
5100     return SDValue();
5101 
5102   SelectionDAG &DAG = DCI.DAG;
5103   SDLoc SL(N);
5104   EVT VT = N->getValueType(0);
5105 
5106   SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
5107   SDValue COffset = DAG.getConstant(Offset, SL, MVT::i32);
5108 
5109   return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset);
5110 }
5111 
5112 SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
5113                                                   DAGCombinerInfo &DCI) const {
5114   SDValue Ptr = N->getBasePtr();
5115   SelectionDAG &DAG = DCI.DAG;
5116   SDLoc SL(N);
5117 
5118   // TODO: We could also do this for multiplies.
5119   unsigned AS = N->getAddressSpace();
5120   if (Ptr.getOpcode() == ISD::SHL && AS != AMDGPUASI.PRIVATE_ADDRESS) {
5121     SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), AS, DCI);
5122     if (NewPtr) {
5123       SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());
5124 
5125       NewOps[N->getOpcode() == ISD::STORE ? 2 : 1] = NewPtr;
5126       return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
5127     }
5128   }
5129 
5130   return SDValue();
5131 }
5132 
5133 static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
5134   return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
5135          (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
5136          (Opc == ISD::XOR && Val == 0);
5137 }
5138 
5139 // Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
5140 // will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
5141 // integer combine opportunities since most 64-bit operations are decomposed
5142 // this way.  TODO: We won't want this for SALU especially if it is an inline
5143 // immediate.
5144 SDValue SITargetLowering::splitBinaryBitConstantOp(
5145   DAGCombinerInfo &DCI,
5146   const SDLoc &SL,
5147   unsigned Opc, SDValue LHS,
5148   const ConstantSDNode *CRHS) const {
5149   uint64_t Val = CRHS->getZExtValue();
5150   uint32_t ValLo = Lo_32(Val);
5151   uint32_t ValHi = Hi_32(Val);
5152   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
5153 
5154     if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
5155          bitOpWithConstantIsReducible(Opc, ValHi)) ||
5156         (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
5157     // If we need to materialize a 64-bit immediate, it will be split up later
5158     // anyway. Avoid creating the harder to understand 64-bit immediate
5159     // materialization.
5160     return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
5161   }
5162 
5163   return SDValue();
5164 }
5165 
5166 // Returns true if argument is a boolean value which is not serialized into
5167 // memory or argument and does not require v_cmdmask_b32 to be deserialized.
5168 static bool isBoolSGPR(SDValue V) {
5169   if (V.getValueType() != MVT::i1)
5170     return false;
5171   switch (V.getOpcode()) {
5172   default: break;
5173   case ISD::SETCC:
5174   case ISD::AND:
5175   case ISD::OR:
5176   case ISD::XOR:
5177   case AMDGPUISD::FP_CLASS:
5178     return true;
5179   }
5180   return false;
5181 }
5182 
5183 SDValue SITargetLowering::performAndCombine(SDNode *N,
5184                                             DAGCombinerInfo &DCI) const {
5185   if (DCI.isBeforeLegalize())
5186     return SDValue();
5187 
5188   SelectionDAG &DAG = DCI.DAG;
5189   EVT VT = N->getValueType(0);
5190   SDValue LHS = N->getOperand(0);
5191   SDValue RHS = N->getOperand(1);
5192 
5193 
5194   const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
5195   if (VT == MVT::i64 && CRHS) {
5196     if (SDValue Split
5197         = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
5198       return Split;
5199   }
5200 
5201   if (CRHS && VT == MVT::i32) {
5202     // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
5203     // nb = number of trailing zeroes in mask
5204     // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
5205     // given that we are selecting 8 or 16 bit fields starting at byte boundary.
5206     uint64_t Mask = CRHS->getZExtValue();
5207     unsigned Bits = countPopulation(Mask);
5208     if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
5209         (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
5210       if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
5211         unsigned Shift = CShift->getZExtValue();
5212         unsigned NB = CRHS->getAPIntValue().countTrailingZeros();
5213         unsigned Offset = NB + Shift;
5214         if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
5215           SDLoc SL(N);
5216           SDValue BFE = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
5217                                     LHS->getOperand(0),
5218                                     DAG.getConstant(Offset, SL, MVT::i32),
5219                                     DAG.getConstant(Bits, SL, MVT::i32));
5220           EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
5221           SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
5222                                     DAG.getValueType(NarrowVT));
5223           SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
5224                                     DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
5225           return Shl;
5226         }
5227       }
5228     }
5229   }
5230 
5231   // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
5232   // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
5233   if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
5234     ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
5235     ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
5236 
5237     SDValue X = LHS.getOperand(0);
5238     SDValue Y = RHS.getOperand(0);
5239     if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X)
5240       return SDValue();
5241 
5242     if (LCC == ISD::SETO) {
5243       if (X != LHS.getOperand(1))
5244         return SDValue();
5245 
5246       if (RCC == ISD::SETUNE) {
5247         const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
5248         if (!C1 || !C1->isInfinity() || C1->isNegative())
5249           return SDValue();
5250 
5251         const uint32_t Mask = SIInstrFlags::N_NORMAL |
5252                               SIInstrFlags::N_SUBNORMAL |
5253                               SIInstrFlags::N_ZERO |
5254                               SIInstrFlags::P_ZERO |
5255                               SIInstrFlags::P_SUBNORMAL |
5256                               SIInstrFlags::P_NORMAL;
5257 
5258         static_assert(((~(SIInstrFlags::S_NAN |
5259                           SIInstrFlags::Q_NAN |
5260                           SIInstrFlags::N_INFINITY |
5261                           SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask,
5262                       "mask not equal");
5263 
5264         SDLoc DL(N);
5265         return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
5266                            X, DAG.getConstant(Mask, DL, MVT::i32));
5267       }
5268     }
5269   }
5270 
5271   if (VT == MVT::i32 &&
5272       (RHS.getOpcode() == ISD::SIGN_EXTEND || LHS.getOpcode() == ISD::SIGN_EXTEND)) {
5273     // and x, (sext cc from i1) => select cc, x, 0
5274     if (RHS.getOpcode() != ISD::SIGN_EXTEND)
5275       std::swap(LHS, RHS);
5276     if (isBoolSGPR(RHS.getOperand(0)))
5277       return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0),
5278                            LHS, DAG.getConstant(0, SDLoc(N), MVT::i32));
5279   }
5280 
5281   return SDValue();
5282 }
5283 
5284 SDValue SITargetLowering::performOrCombine(SDNode *N,
5285                                            DAGCombinerInfo &DCI) const {
5286   SelectionDAG &DAG = DCI.DAG;
5287   SDValue LHS = N->getOperand(0);
5288   SDValue RHS = N->getOperand(1);
5289 
5290   EVT VT = N->getValueType(0);
5291   if (VT == MVT::i1) {
5292     // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
5293     if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
5294         RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
5295       SDValue Src = LHS.getOperand(0);
5296       if (Src != RHS.getOperand(0))
5297         return SDValue();
5298 
5299       const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
5300       const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
5301       if (!CLHS || !CRHS)
5302         return SDValue();
5303 
5304       // Only 10 bits are used.
5305       static const uint32_t MaxMask = 0x3ff;
5306 
5307       uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
5308       SDLoc DL(N);
5309       return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
5310                          Src, DAG.getConstant(NewMask, DL, MVT::i32));
5311     }
5312 
5313     return SDValue();
5314   }
5315 
5316   if (VT != MVT::i64)
5317     return SDValue();
5318 
5319   // TODO: This could be a generic combine with a predicate for extracting the
5320   // high half of an integer being free.
5321 
5322   // (or i64:x, (zero_extend i32:y)) ->
5323   //   i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
5324   if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
5325       RHS.getOpcode() != ISD::ZERO_EXTEND)
5326     std::swap(LHS, RHS);
5327 
5328   if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
5329     SDValue ExtSrc = RHS.getOperand(0);
5330     EVT SrcVT = ExtSrc.getValueType();
5331     if (SrcVT == MVT::i32) {
5332       SDLoc SL(N);
5333       SDValue LowLHS, HiBits;
5334       std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG);
5335       SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
5336 
5337       DCI.AddToWorklist(LowOr.getNode());
5338       DCI.AddToWorklist(HiBits.getNode());
5339 
5340       SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5341                                 LowOr, HiBits);
5342       return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
5343     }
5344   }
5345 
5346   const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
5347   if (CRHS) {
5348     if (SDValue Split
5349           = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR, LHS, CRHS))
5350       return Split;
5351   }
5352 
5353   return SDValue();
5354 }
5355 
5356 SDValue SITargetLowering::performXorCombine(SDNode *N,
5357                                             DAGCombinerInfo &DCI) const {
5358   EVT VT = N->getValueType(0);
5359   if (VT != MVT::i64)
5360     return SDValue();
5361 
5362   SDValue LHS = N->getOperand(0);
5363   SDValue RHS = N->getOperand(1);
5364 
5365   const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
5366   if (CRHS) {
5367     if (SDValue Split
5368           = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
5369       return Split;
5370   }
5371 
5372   return SDValue();
5373 }
5374 
5375 // Instructions that will be lowered with a final instruction that zeros the
5376 // high result bits.
5377 // XXX - probably only need to list legal operations.
5378 static bool fp16SrcZerosHighBits(unsigned Opc) {
5379   switch (Opc) {
5380   case ISD::FADD:
5381   case ISD::FSUB:
5382   case ISD::FMUL:
5383   case ISD::FDIV:
5384   case ISD::FREM:
5385   case ISD::FMA:
5386   case ISD::FMAD:
5387   case ISD::FCANONICALIZE:
5388   case ISD::FP_ROUND:
5389   case ISD::UINT_TO_FP:
5390   case ISD::SINT_TO_FP:
5391   case ISD::FABS:
5392     // Fabs is lowered to a bit operation, but it's an and which will clear the
5393     // high bits anyway.
5394   case ISD::FSQRT:
5395   case ISD::FSIN:
5396   case ISD::FCOS:
5397   case ISD::FPOWI:
5398   case ISD::FPOW:
5399   case ISD::FLOG:
5400   case ISD::FLOG2:
5401   case ISD::FLOG10:
5402   case ISD::FEXP:
5403   case ISD::FEXP2:
5404   case ISD::FCEIL:
5405   case ISD::FTRUNC:
5406   case ISD::FRINT:
5407   case ISD::FNEARBYINT:
5408   case ISD::FROUND:
5409   case ISD::FFLOOR:
5410   case ISD::FMINNUM:
5411   case ISD::FMAXNUM:
5412   case AMDGPUISD::FRACT:
5413   case AMDGPUISD::CLAMP:
5414   case AMDGPUISD::COS_HW:
5415   case AMDGPUISD::SIN_HW:
5416   case AMDGPUISD::FMIN3:
5417   case AMDGPUISD::FMAX3:
5418   case AMDGPUISD::FMED3:
5419   case AMDGPUISD::FMAD_FTZ:
5420   case AMDGPUISD::RCP:
5421   case AMDGPUISD::RSQ:
5422   case AMDGPUISD::LDEXP:
5423     return true;
5424   default:
5425     // fcopysign, select and others may be lowered to 32-bit bit operations
5426     // which don't zero the high bits.
5427     return false;
5428   }
5429 }
5430 
5431 SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
5432                                                    DAGCombinerInfo &DCI) const {
5433   if (!Subtarget->has16BitInsts() ||
5434       DCI.getDAGCombineLevel() < AfterLegalizeDAG)
5435     return SDValue();
5436 
5437   EVT VT = N->getValueType(0);
5438   if (VT != MVT::i32)
5439     return SDValue();
5440 
5441   SDValue Src = N->getOperand(0);
5442   if (Src.getValueType() != MVT::i16)
5443     return SDValue();
5444 
5445   // (i32 zext (i16 (bitcast f16:$src))) -> fp16_zext $src
5446   // FIXME: It is not universally true that the high bits are zeroed on gfx9.
5447   if (Src.getOpcode() == ISD::BITCAST) {
5448     SDValue BCSrc = Src.getOperand(0);
5449     if (BCSrc.getValueType() == MVT::f16 &&
5450         fp16SrcZerosHighBits(BCSrc.getOpcode()))
5451       return DCI.DAG.getNode(AMDGPUISD::FP16_ZEXT, SDLoc(N), VT, BCSrc);
5452   }
5453 
5454   return SDValue();
5455 }
5456 
5457 SDValue SITargetLowering::performClassCombine(SDNode *N,
5458                                               DAGCombinerInfo &DCI) const {
5459   SelectionDAG &DAG = DCI.DAG;
5460   SDValue Mask = N->getOperand(1);
5461 
5462   // fp_class x, 0 -> false
5463   if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) {
5464     if (CMask->isNullValue())
5465       return DAG.getConstant(0, SDLoc(N), MVT::i1);
5466   }
5467 
5468   if (N->getOperand(0).isUndef())
5469     return DAG.getUNDEF(MVT::i1);
5470 
5471   return SDValue();
5472 }
5473 
5474 static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {
5475   if (!DAG.getTargetLoweringInfo().hasFloatingPointExceptions())
5476     return true;
5477 
5478   return DAG.isKnownNeverNaN(Op);
5479 }
5480 
5481 static bool isCanonicalized(SelectionDAG &DAG, SDValue Op,
5482                             const SISubtarget *ST, unsigned MaxDepth=5) {
5483   // If source is a result of another standard FP operation it is already in
5484   // canonical form.
5485 
5486   switch (Op.getOpcode()) {
5487   default:
5488     break;
5489 
5490   // These will flush denorms if required.
5491   case ISD::FADD:
5492   case ISD::FSUB:
5493   case ISD::FMUL:
5494   case ISD::FSQRT:
5495   case ISD::FCEIL:
5496   case ISD::FFLOOR:
5497   case ISD::FMA:
5498   case ISD::FMAD:
5499 
5500   case ISD::FCANONICALIZE:
5501     return true;
5502 
5503   case ISD::FP_ROUND:
5504     return Op.getValueType().getScalarType() != MVT::f16 ||
5505            ST->hasFP16Denormals();
5506 
5507   case ISD::FP_EXTEND:
5508     return Op.getOperand(0).getValueType().getScalarType() != MVT::f16 ||
5509            ST->hasFP16Denormals();
5510 
5511   case ISD::FP16_TO_FP:
5512   case ISD::FP_TO_FP16:
5513     return ST->hasFP16Denormals();
5514 
5515   // It can/will be lowered or combined as a bit operation.
5516   // Need to check their input recursively to handle.
5517   case ISD::FNEG:
5518   case ISD::FABS:
5519     return (MaxDepth > 0) &&
5520            isCanonicalized(DAG, Op.getOperand(0), ST, MaxDepth - 1);
5521 
5522   case ISD::FSIN:
5523   case ISD::FCOS:
5524   case ISD::FSINCOS:
5525     return Op.getValueType().getScalarType() != MVT::f16;
5526 
5527   // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms.
5528   // For such targets need to check their input recursively.
5529   case ISD::FMINNUM:
5530   case ISD::FMAXNUM:
5531   case ISD::FMINNAN:
5532   case ISD::FMAXNAN:
5533 
5534     if (ST->supportsMinMaxDenormModes() &&
5535         DAG.isKnownNeverNaN(Op.getOperand(0)) &&
5536         DAG.isKnownNeverNaN(Op.getOperand(1)))
5537       return true;
5538 
5539     return (MaxDepth > 0) &&
5540            isCanonicalized(DAG, Op.getOperand(0), ST, MaxDepth - 1) &&
5541            isCanonicalized(DAG, Op.getOperand(1), ST, MaxDepth - 1);
5542 
5543   case ISD::ConstantFP: {
5544     auto F = cast<ConstantFPSDNode>(Op)->getValueAPF();
5545     return !F.isDenormal() && !(F.isNaN() && F.isSignaling());
5546   }
5547   }
5548   return false;
5549 }
5550 
5551 // Constant fold canonicalize.
5552 SDValue SITargetLowering::performFCanonicalizeCombine(
5553   SDNode *N,
5554   DAGCombinerInfo &DCI) const {
5555   SelectionDAG &DAG = DCI.DAG;
5556   ConstantFPSDNode *CFP = isConstOrConstSplatFP(N->getOperand(0));
5557 
5558   if (!CFP) {
5559     SDValue N0 = N->getOperand(0);
5560     EVT VT = N0.getValueType().getScalarType();
5561     auto ST = getSubtarget();
5562 
5563     if (((VT == MVT::f32 && ST->hasFP32Denormals()) ||
5564          (VT == MVT::f64 && ST->hasFP64Denormals()) ||
5565          (VT == MVT::f16 && ST->hasFP16Denormals())) &&
5566         DAG.isKnownNeverNaN(N0))
5567       return N0;
5568 
5569     bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction());
5570 
5571     if ((IsIEEEMode || isKnownNeverSNan(DAG, N0)) &&
5572         isCanonicalized(DAG, N0, ST))
5573       return N0;
5574 
5575     return SDValue();
5576   }
5577 
5578   const APFloat &C = CFP->getValueAPF();
5579 
5580   // Flush denormals to 0 if not enabled.
5581   if (C.isDenormal()) {
5582     EVT VT = N->getValueType(0);
5583     EVT SVT = VT.getScalarType();
5584     if (SVT == MVT::f32 && !Subtarget->hasFP32Denormals())
5585       return DAG.getConstantFP(0.0, SDLoc(N), VT);
5586 
5587     if (SVT == MVT::f64 && !Subtarget->hasFP64Denormals())
5588       return DAG.getConstantFP(0.0, SDLoc(N), VT);
5589 
5590     if (SVT == MVT::f16 && !Subtarget->hasFP16Denormals())
5591       return DAG.getConstantFP(0.0, SDLoc(N), VT);
5592   }
5593 
5594   if (C.isNaN()) {
5595     EVT VT = N->getValueType(0);
5596     APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
5597     if (C.isSignaling()) {
5598       // Quiet a signaling NaN.
5599       return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT);
5600     }
5601 
5602     // Make sure it is the canonical NaN bitpattern.
5603     //
5604     // TODO: Can we use -1 as the canonical NaN value since it's an inline
5605     // immediate?
5606     if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
5607       return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT);
5608   }
5609 
5610   return N->getOperand(0);
5611 }
5612 
5613 static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
5614   switch (Opc) {
5615   case ISD::FMAXNUM:
5616     return AMDGPUISD::FMAX3;
5617   case ISD::SMAX:
5618     return AMDGPUISD::SMAX3;
5619   case ISD::UMAX:
5620     return AMDGPUISD::UMAX3;
5621   case ISD::FMINNUM:
5622     return AMDGPUISD::FMIN3;
5623   case ISD::SMIN:
5624     return AMDGPUISD::SMIN3;
5625   case ISD::UMIN:
5626     return AMDGPUISD::UMIN3;
5627   default:
5628     llvm_unreachable("Not a min/max opcode");
5629   }
5630 }
5631 
5632 SDValue SITargetLowering::performIntMed3ImmCombine(
5633   SelectionDAG &DAG, const SDLoc &SL,
5634   SDValue Op0, SDValue Op1, bool Signed) const {
5635   ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1);
5636   if (!K1)
5637     return SDValue();
5638 
5639   ConstantSDNode *K0 = dyn_cast<ConstantSDNode>(Op0.getOperand(1));
5640   if (!K0)
5641     return SDValue();
5642 
5643   if (Signed) {
5644     if (K0->getAPIntValue().sge(K1->getAPIntValue()))
5645       return SDValue();
5646   } else {
5647     if (K0->getAPIntValue().uge(K1->getAPIntValue()))
5648       return SDValue();
5649   }
5650 
5651   EVT VT = K0->getValueType(0);
5652   unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
5653   if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16())) {
5654     return DAG.getNode(Med3Opc, SL, VT,
5655                        Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
5656   }
5657 
5658   // If there isn't a 16-bit med3 operation, convert to 32-bit.
5659   MVT NVT = MVT::i32;
5660   unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
5661 
5662   SDValue Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0));
5663   SDValue Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1));
5664   SDValue Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1);
5665 
5666   SDValue Med3 = DAG.getNode(Med3Opc, SL, NVT, Tmp1, Tmp2, Tmp3);
5667   return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3);
5668 }
5669 
5670 static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
5671   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
5672     return C;
5673 
5674   if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
5675     if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
5676       return C;
5677   }
5678 
5679   return nullptr;
5680 }
5681 
5682 SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
5683                                                   const SDLoc &SL,
5684                                                   SDValue Op0,
5685                                                   SDValue Op1) const {
5686   ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
5687   if (!K1)
5688     return SDValue();
5689 
5690   ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
5691   if (!K0)
5692     return SDValue();
5693 
5694   // Ordered >= (although NaN inputs should have folded away by now).
5695   APFloat::cmpResult Cmp = K0->getValueAPF().compare(K1->getValueAPF());
5696   if (Cmp == APFloat::cmpGreaterThan)
5697     return SDValue();
5698 
5699   // TODO: Check IEEE bit enabled?
5700   EVT VT = Op0.getValueType();
5701   if (Subtarget->enableDX10Clamp()) {
5702     // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
5703     // hardware fmed3 behavior converting to a min.
5704     // FIXME: Should this be allowing -0.0?
5705     if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
5706       return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
5707   }
5708 
5709   // med3 for f16 is only available on gfx9+, and not available for v2f16.
5710   if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
5711     // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
5712     // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
5713     // then give the other result, which is different from med3 with a NaN
5714     // input.
5715     SDValue Var = Op0.getOperand(0);
5716     if (!isKnownNeverSNan(DAG, Var))
5717       return SDValue();
5718 
5719     return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
5720                        Var, SDValue(K0, 0), SDValue(K1, 0));
5721   }
5722 
5723   return SDValue();
5724 }
5725 
5726 SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
5727                                                DAGCombinerInfo &DCI) const {
5728   SelectionDAG &DAG = DCI.DAG;
5729 
5730   EVT VT = N->getValueType(0);
5731   unsigned Opc = N->getOpcode();
5732   SDValue Op0 = N->getOperand(0);
5733   SDValue Op1 = N->getOperand(1);
5734 
5735   // Only do this if the inner op has one use since this will just increases
5736   // register pressure for no benefit.
5737 
5738 
5739   if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY &&
5740       VT != MVT::f64 &&
5741       ((VT != MVT::f16 && VT != MVT::i16) || Subtarget->hasMin3Max3_16())) {
5742     // max(max(a, b), c) -> max3(a, b, c)
5743     // min(min(a, b), c) -> min3(a, b, c)
5744     if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
5745       SDLoc DL(N);
5746       return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
5747                          DL,
5748                          N->getValueType(0),
5749                          Op0.getOperand(0),
5750                          Op0.getOperand(1),
5751                          Op1);
5752     }
5753 
5754     // Try commuted.
5755     // max(a, max(b, c)) -> max3(a, b, c)
5756     // min(a, min(b, c)) -> min3(a, b, c)
5757     if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
5758       SDLoc DL(N);
5759       return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
5760                          DL,
5761                          N->getValueType(0),
5762                          Op0,
5763                          Op1.getOperand(0),
5764                          Op1.getOperand(1));
5765     }
5766   }
5767 
5768   // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
5769   if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
5770     if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, true))
5771       return Med3;
5772   }
5773 
5774   if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
5775     if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, false))
5776       return Med3;
5777   }
5778 
5779   // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
5780   if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
5781        (Opc == AMDGPUISD::FMIN_LEGACY &&
5782         Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
5783       (VT == MVT::f32 || VT == MVT::f64 ||
5784        (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
5785        (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
5786       Op0.hasOneUse()) {
5787     if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
5788       return Res;
5789   }
5790 
5791   return SDValue();
5792 }
5793 
5794 static bool isClampZeroToOne(SDValue A, SDValue B) {
5795   if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
5796     if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
5797       // FIXME: Should this be allowing -0.0?
5798       return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
5799              (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
5800     }
5801   }
5802 
5803   return false;
5804 }
5805 
5806 // FIXME: Should only worry about snans for version with chain.
5807 SDValue SITargetLowering::performFMed3Combine(SDNode *N,
5808                                               DAGCombinerInfo &DCI) const {
5809   EVT VT = N->getValueType(0);
5810   // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
5811   // NaNs. With a NaN input, the order of the operands may change the result.
5812 
5813   SelectionDAG &DAG = DCI.DAG;
5814   SDLoc SL(N);
5815 
5816   SDValue Src0 = N->getOperand(0);
5817   SDValue Src1 = N->getOperand(1);
5818   SDValue Src2 = N->getOperand(2);
5819 
5820   if (isClampZeroToOne(Src0, Src1)) {
5821     // const_a, const_b, x -> clamp is safe in all cases including signaling
5822     // nans.
5823     // FIXME: Should this be allowing -0.0?
5824     return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
5825   }
5826 
5827   // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
5828   // handling no dx10-clamp?
5829   if (Subtarget->enableDX10Clamp()) {
5830     // If NaNs is clamped to 0, we are free to reorder the inputs.
5831 
5832     if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
5833       std::swap(Src0, Src1);
5834 
5835     if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
5836       std::swap(Src1, Src2);
5837 
5838     if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
5839       std::swap(Src0, Src1);
5840 
5841     if (isClampZeroToOne(Src1, Src2))
5842       return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
5843   }
5844 
5845   return SDValue();
5846 }
5847 
5848 SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
5849                                                  DAGCombinerInfo &DCI) const {
5850   SDValue Src0 = N->getOperand(0);
5851   SDValue Src1 = N->getOperand(1);
5852   if (Src0.isUndef() && Src1.isUndef())
5853     return DCI.DAG.getUNDEF(N->getValueType(0));
5854   return SDValue();
5855 }
5856 
5857 SDValue SITargetLowering::performExtractVectorEltCombine(
5858   SDNode *N, DAGCombinerInfo &DCI) const {
5859   SDValue Vec = N->getOperand(0);
5860 
5861   SelectionDAG &DAG = DCI.DAG;
5862   if (Vec.getOpcode() == ISD::FNEG && allUsesHaveSourceMods(N)) {
5863     SDLoc SL(N);
5864     EVT EltVT = N->getValueType(0);
5865     SDValue Idx = N->getOperand(1);
5866     SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
5867                               Vec.getOperand(0), Idx);
5868     return DAG.getNode(ISD::FNEG, SL, EltVT, Elt);
5869   }
5870 
5871   return SDValue();
5872 }
5873 
5874 static bool convertBuildVectorCastElt(SelectionDAG &DAG,
5875                                       SDValue &Lo, SDValue &Hi) {
5876   if (Hi.getOpcode() == ISD::BITCAST &&
5877       Hi.getOperand(0).getValueType() == MVT::f16 &&
5878       (isa<ConstantSDNode>(Lo) || Lo.isUndef())) {
5879     Lo = DAG.getNode(ISD::BITCAST, SDLoc(Lo), MVT::f16, Lo);
5880     Hi = Hi.getOperand(0);
5881     return true;
5882   }
5883 
5884   return false;
5885 }
5886 
5887 SDValue SITargetLowering::performBuildVectorCombine(
5888   SDNode *N, DAGCombinerInfo &DCI) const {
5889   SDLoc SL(N);
5890 
5891   if (!isTypeLegal(MVT::v2i16))
5892     return SDValue();
5893   SelectionDAG &DAG = DCI.DAG;
5894   EVT VT = N->getValueType(0);
5895 
5896   if (VT == MVT::v2i16) {
5897     SDValue Lo = N->getOperand(0);
5898     SDValue Hi = N->getOperand(1);
5899 
5900     // v2i16 build_vector (const|undef), (bitcast f16:$x)
5901     // -> bitcast (v2f16 build_vector const|undef, $x
5902     if (convertBuildVectorCastElt(DAG, Lo, Hi)) {
5903       SDValue NewVec = DAG.getBuildVector(MVT::v2f16, SL, { Lo, Hi  });
5904       return DAG.getNode(ISD::BITCAST, SL, VT, NewVec);
5905     }
5906 
5907     if (convertBuildVectorCastElt(DAG, Hi, Lo)) {
5908       SDValue NewVec = DAG.getBuildVector(MVT::v2f16, SL, { Hi, Lo  });
5909       return DAG.getNode(ISD::BITCAST, SL, VT, NewVec);
5910     }
5911   }
5912 
5913   return SDValue();
5914 }
5915 
5916 unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
5917                                           const SDNode *N0,
5918                                           const SDNode *N1) const {
5919   EVT VT = N0->getValueType(0);
5920 
5921   // Only do this if we are not trying to support denormals. v_mad_f32 does not
5922   // support denormals ever.
5923   if ((VT == MVT::f32 && !Subtarget->hasFP32Denormals()) ||
5924       (VT == MVT::f16 && !Subtarget->hasFP16Denormals()))
5925     return ISD::FMAD;
5926 
5927   const TargetOptions &Options = DAG.getTarget().Options;
5928   if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
5929        (N0->getFlags().hasUnsafeAlgebra() &&
5930         N1->getFlags().hasUnsafeAlgebra())) &&
5931       isFMAFasterThanFMulAndFAdd(VT)) {
5932     return ISD::FMA;
5933   }
5934 
5935   return 0;
5936 }
5937 
5938 SDValue SITargetLowering::performAddCombine(SDNode *N,
5939                                             DAGCombinerInfo &DCI) const {
5940   SelectionDAG &DAG = DCI.DAG;
5941   EVT VT = N->getValueType(0);
5942 
5943   if (VT != MVT::i32)
5944     return SDValue();
5945 
5946   SDLoc SL(N);
5947   SDValue LHS = N->getOperand(0);
5948   SDValue RHS = N->getOperand(1);
5949 
5950   // add x, zext (setcc) => addcarry x, 0, setcc
5951   // add x, sext (setcc) => subcarry x, 0, setcc
5952   unsigned Opc = LHS.getOpcode();
5953   if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
5954       Opc == ISD::ANY_EXTEND || Opc == ISD::ADDCARRY)
5955     std::swap(RHS, LHS);
5956 
5957   Opc = RHS.getOpcode();
5958   switch (Opc) {
5959   default: break;
5960   case ISD::ZERO_EXTEND:
5961   case ISD::SIGN_EXTEND:
5962   case ISD::ANY_EXTEND: {
5963     auto Cond = RHS.getOperand(0);
5964     if (!isBoolSGPR(Cond))
5965       break;
5966     SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
5967     SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
5968     Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::SUBCARRY : ISD::ADDCARRY;
5969     return DAG.getNode(Opc, SL, VTList, Args);
5970   }
5971   case ISD::ADDCARRY: {
5972     // add x, (addcarry y, 0, cc) => addcarry x, y, cc
5973     auto C = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
5974     if (!C || C->getZExtValue() != 0) break;
5975     SDValue Args[] = { LHS, RHS.getOperand(0), RHS.getOperand(2) };
5976     return DAG.getNode(ISD::ADDCARRY, SDLoc(N), RHS->getVTList(), Args);
5977   }
5978   }
5979   return SDValue();
5980 }
5981 
5982 SDValue SITargetLowering::performSubCombine(SDNode *N,
5983                                             DAGCombinerInfo &DCI) const {
5984   SelectionDAG &DAG = DCI.DAG;
5985   EVT VT = N->getValueType(0);
5986 
5987   if (VT != MVT::i32)
5988     return SDValue();
5989 
5990   SDLoc SL(N);
5991   SDValue LHS = N->getOperand(0);
5992   SDValue RHS = N->getOperand(1);
5993 
5994   unsigned Opc = LHS.getOpcode();
5995   if (Opc != ISD::SUBCARRY)
5996     std::swap(RHS, LHS);
5997 
5998   if (LHS.getOpcode() == ISD::SUBCARRY) {
5999     // sub (subcarry x, 0, cc), y => subcarry x, y, cc
6000     auto C = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
6001     if (!C || C->getZExtValue() != 0)
6002       return SDValue();
6003     SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) };
6004     return DAG.getNode(ISD::SUBCARRY, SDLoc(N), LHS->getVTList(), Args);
6005   }
6006   return SDValue();
6007 }
6008 
6009 SDValue SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
6010   DAGCombinerInfo &DCI) const {
6011 
6012   if (N->getValueType(0) != MVT::i32)
6013     return SDValue();
6014 
6015   auto C = dyn_cast<ConstantSDNode>(N->getOperand(1));
6016   if (!C || C->getZExtValue() != 0)
6017     return SDValue();
6018 
6019   SelectionDAG &DAG = DCI.DAG;
6020   SDValue LHS = N->getOperand(0);
6021 
6022   // addcarry (add x, y), 0, cc => addcarry x, y, cc
6023   // subcarry (sub x, y), 0, cc => subcarry x, y, cc
6024   unsigned LHSOpc = LHS.getOpcode();
6025   unsigned Opc = N->getOpcode();
6026   if ((LHSOpc == ISD::ADD && Opc == ISD::ADDCARRY) ||
6027       (LHSOpc == ISD::SUB && Opc == ISD::SUBCARRY)) {
6028     SDValue Args[] = { LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2) };
6029     return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
6030   }
6031   return SDValue();
6032 }
6033 
6034 SDValue SITargetLowering::performFAddCombine(SDNode *N,
6035                                              DAGCombinerInfo &DCI) const {
6036   if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
6037     return SDValue();
6038 
6039   SelectionDAG &DAG = DCI.DAG;
6040   EVT VT = N->getValueType(0);
6041 
6042   SDLoc SL(N);
6043   SDValue LHS = N->getOperand(0);
6044   SDValue RHS = N->getOperand(1);
6045 
6046   // These should really be instruction patterns, but writing patterns with
6047   // source modiifiers is a pain.
6048 
6049   // fadd (fadd (a, a), b) -> mad 2.0, a, b
6050   if (LHS.getOpcode() == ISD::FADD) {
6051     SDValue A = LHS.getOperand(0);
6052     if (A == LHS.getOperand(1)) {
6053       unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
6054       if (FusedOp != 0) {
6055         const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
6056         return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
6057       }
6058     }
6059   }
6060 
6061   // fadd (b, fadd (a, a)) -> mad 2.0, a, b
6062   if (RHS.getOpcode() == ISD::FADD) {
6063     SDValue A = RHS.getOperand(0);
6064     if (A == RHS.getOperand(1)) {
6065       unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
6066       if (FusedOp != 0) {
6067         const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
6068         return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
6069       }
6070     }
6071   }
6072 
6073   return SDValue();
6074 }
6075 
6076 SDValue SITargetLowering::performFSubCombine(SDNode *N,
6077                                              DAGCombinerInfo &DCI) const {
6078   if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
6079     return SDValue();
6080 
6081   SelectionDAG &DAG = DCI.DAG;
6082   SDLoc SL(N);
6083   EVT VT = N->getValueType(0);
6084   assert(!VT.isVector());
6085 
6086   // Try to get the fneg to fold into the source modifier. This undoes generic
6087   // DAG combines and folds them into the mad.
6088   //
6089   // Only do this if we are not trying to support denormals. v_mad_f32 does
6090   // not support denormals ever.
6091   SDValue LHS = N->getOperand(0);
6092   SDValue RHS = N->getOperand(1);
6093   if (LHS.getOpcode() == ISD::FADD) {
6094     // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
6095     SDValue A = LHS.getOperand(0);
6096     if (A == LHS.getOperand(1)) {
6097       unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
6098       if (FusedOp != 0){
6099         const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
6100         SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
6101 
6102         return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
6103       }
6104     }
6105   }
6106 
6107   if (RHS.getOpcode() == ISD::FADD) {
6108     // (fsub c, (fadd a, a)) -> mad -2.0, a, c
6109 
6110     SDValue A = RHS.getOperand(0);
6111     if (A == RHS.getOperand(1)) {
6112       unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
6113       if (FusedOp != 0){
6114         const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
6115         return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
6116       }
6117     }
6118   }
6119 
6120   return SDValue();
6121 }
6122 
6123 SDValue SITargetLowering::performSetCCCombine(SDNode *N,
6124                                               DAGCombinerInfo &DCI) const {
6125   SelectionDAG &DAG = DCI.DAG;
6126   SDLoc SL(N);
6127 
6128   SDValue LHS = N->getOperand(0);
6129   SDValue RHS = N->getOperand(1);
6130   EVT VT = LHS.getValueType();
6131   ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
6132 
6133   auto CRHS = dyn_cast<ConstantSDNode>(RHS);
6134   if (!CRHS) {
6135     CRHS = dyn_cast<ConstantSDNode>(LHS);
6136     if (CRHS) {
6137       std::swap(LHS, RHS);
6138       CC = getSetCCSwappedOperands(CC);
6139     }
6140   }
6141 
6142   if (CRHS && VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
6143       isBoolSGPR(LHS.getOperand(0))) {
6144     // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
6145     // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
6146     // setcc (sext from i1 cc),  0, eq|sge|ule) => not cc => xor cc, -1
6147     // setcc (sext from i1 cc),  0, ne|ugt|slt) => cc
6148     if ((CRHS->isAllOnesValue() &&
6149          (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
6150         (CRHS->isNullValue() &&
6151          (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
6152       return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
6153                          DAG.getConstant(-1, SL, MVT::i1));
6154     if ((CRHS->isAllOnesValue() &&
6155          (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
6156         (CRHS->isNullValue() &&
6157          (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
6158       return LHS.getOperand(0);
6159   }
6160 
6161   if (VT != MVT::f32 && VT != MVT::f64 && (Subtarget->has16BitInsts() &&
6162                                            VT != MVT::f16))
6163     return SDValue();
6164 
6165   // Match isinf pattern
6166   // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
6167   if (CC == ISD::SETOEQ && LHS.getOpcode() == ISD::FABS) {
6168     const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
6169     if (!CRHS)
6170       return SDValue();
6171 
6172     const APFloat &APF = CRHS->getValueAPF();
6173     if (APF.isInfinity() && !APF.isNegative()) {
6174       unsigned Mask = SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
6175       return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
6176                          DAG.getConstant(Mask, SL, MVT::i32));
6177     }
6178   }
6179 
6180   return SDValue();
6181 }
6182 
6183 SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
6184                                                      DAGCombinerInfo &DCI) const {
6185   SelectionDAG &DAG = DCI.DAG;
6186   SDLoc SL(N);
6187   unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
6188 
6189   SDValue Src = N->getOperand(0);
6190   SDValue Srl = N->getOperand(0);
6191   if (Srl.getOpcode() == ISD::ZERO_EXTEND)
6192     Srl = Srl.getOperand(0);
6193 
6194   // TODO: Handle (or x, (srl y, 8)) pattern when known bits are zero.
6195   if (Srl.getOpcode() == ISD::SRL) {
6196     // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
6197     // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
6198     // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
6199 
6200     if (const ConstantSDNode *C =
6201         dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
6202       Srl = DAG.getZExtOrTrunc(Srl.getOperand(0), SDLoc(Srl.getOperand(0)),
6203                                EVT(MVT::i32));
6204 
6205       unsigned SrcOffset = C->getZExtValue() + 8 * Offset;
6206       if (SrcOffset < 32 && SrcOffset % 8 == 0) {
6207         return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + SrcOffset / 8, SL,
6208                            MVT::f32, Srl);
6209       }
6210     }
6211   }
6212 
6213   APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
6214 
6215   KnownBits Known;
6216   TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
6217                                         !DCI.isBeforeLegalizeOps());
6218   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6219   if (TLI.ShrinkDemandedConstant(Src, Demanded, TLO) ||
6220       TLI.SimplifyDemandedBits(Src, Demanded, Known, TLO)) {
6221     DCI.CommitTargetLoweringOpt(TLO);
6222   }
6223 
6224   return SDValue();
6225 }
6226 
6227 SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
6228                                             DAGCombinerInfo &DCI) const {
6229   switch (N->getOpcode()) {
6230   default:
6231     return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
6232   case ISD::ADD:
6233     return performAddCombine(N, DCI);
6234   case ISD::SUB:
6235     return performSubCombine(N, DCI);
6236   case ISD::ADDCARRY:
6237   case ISD::SUBCARRY:
6238     return performAddCarrySubCarryCombine(N, DCI);
6239   case ISD::FADD:
6240     return performFAddCombine(N, DCI);
6241   case ISD::FSUB:
6242     return performFSubCombine(N, DCI);
6243   case ISD::SETCC:
6244     return performSetCCCombine(N, DCI);
6245   case ISD::FMAXNUM:
6246   case ISD::FMINNUM:
6247   case ISD::SMAX:
6248   case ISD::SMIN:
6249   case ISD::UMAX:
6250   case ISD::UMIN:
6251   case AMDGPUISD::FMIN_LEGACY:
6252   case AMDGPUISD::FMAX_LEGACY: {
6253     if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG &&
6254         getTargetMachine().getOptLevel() > CodeGenOpt::None)
6255       return performMinMaxCombine(N, DCI);
6256     break;
6257   }
6258   case ISD::LOAD:
6259   case ISD::STORE:
6260   case ISD::ATOMIC_LOAD:
6261   case ISD::ATOMIC_STORE:
6262   case ISD::ATOMIC_CMP_SWAP:
6263   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
6264   case ISD::ATOMIC_SWAP:
6265   case ISD::ATOMIC_LOAD_ADD:
6266   case ISD::ATOMIC_LOAD_SUB:
6267   case ISD::ATOMIC_LOAD_AND:
6268   case ISD::ATOMIC_LOAD_OR:
6269   case ISD::ATOMIC_LOAD_XOR:
6270   case ISD::ATOMIC_LOAD_NAND:
6271   case ISD::ATOMIC_LOAD_MIN:
6272   case ISD::ATOMIC_LOAD_MAX:
6273   case ISD::ATOMIC_LOAD_UMIN:
6274   case ISD::ATOMIC_LOAD_UMAX:
6275   case AMDGPUISD::ATOMIC_INC:
6276   case AMDGPUISD::ATOMIC_DEC: // TODO: Target mem intrinsics.
6277     if (DCI.isBeforeLegalize())
6278       break;
6279     return performMemSDNodeCombine(cast<MemSDNode>(N), DCI);
6280   case ISD::AND:
6281     return performAndCombine(N, DCI);
6282   case ISD::OR:
6283     return performOrCombine(N, DCI);
6284   case ISD::XOR:
6285     return performXorCombine(N, DCI);
6286   case ISD::ZERO_EXTEND:
6287     return performZeroExtendCombine(N, DCI);
6288   case AMDGPUISD::FP_CLASS:
6289     return performClassCombine(N, DCI);
6290   case ISD::FCANONICALIZE:
6291     return performFCanonicalizeCombine(N, DCI);
6292   case AMDGPUISD::FRACT:
6293   case AMDGPUISD::RCP:
6294   case AMDGPUISD::RSQ:
6295   case AMDGPUISD::RCP_LEGACY:
6296   case AMDGPUISD::RSQ_LEGACY:
6297   case AMDGPUISD::RSQ_CLAMP:
6298   case AMDGPUISD::LDEXP: {
6299     SDValue Src = N->getOperand(0);
6300     if (Src.isUndef())
6301       return Src;
6302     break;
6303   }
6304   case ISD::SINT_TO_FP:
6305   case ISD::UINT_TO_FP:
6306     return performUCharToFloatCombine(N, DCI);
6307   case AMDGPUISD::CVT_F32_UBYTE0:
6308   case AMDGPUISD::CVT_F32_UBYTE1:
6309   case AMDGPUISD::CVT_F32_UBYTE2:
6310   case AMDGPUISD::CVT_F32_UBYTE3:
6311     return performCvtF32UByteNCombine(N, DCI);
6312   case AMDGPUISD::FMED3:
6313     return performFMed3Combine(N, DCI);
6314   case AMDGPUISD::CVT_PKRTZ_F16_F32:
6315     return performCvtPkRTZCombine(N, DCI);
6316   case ISD::SCALAR_TO_VECTOR: {
6317     SelectionDAG &DAG = DCI.DAG;
6318     EVT VT = N->getValueType(0);
6319 
6320     // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
6321     if (VT == MVT::v2i16 || VT == MVT::v2f16) {
6322       SDLoc SL(N);
6323       SDValue Src = N->getOperand(0);
6324       EVT EltVT = Src.getValueType();
6325       if (EltVT == MVT::f16)
6326         Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
6327 
6328       SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
6329       return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
6330     }
6331 
6332     break;
6333   }
6334   case ISD::EXTRACT_VECTOR_ELT:
6335     return performExtractVectorEltCombine(N, DCI);
6336   case ISD::BUILD_VECTOR:
6337     return performBuildVectorCombine(N, DCI);
6338   }
6339   return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
6340 }
6341 
6342 /// \brief Helper function for adjustWritemask
6343 static unsigned SubIdx2Lane(unsigned Idx) {
6344   switch (Idx) {
6345   default: return 0;
6346   case AMDGPU::sub0: return 0;
6347   case AMDGPU::sub1: return 1;
6348   case AMDGPU::sub2: return 2;
6349   case AMDGPU::sub3: return 3;
6350   }
6351 }
6352 
6353 /// \brief Adjust the writemask of MIMG instructions
6354 void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
6355                                        SelectionDAG &DAG) const {
6356   SDNode *Users[4] = { };
6357   unsigned Lane = 0;
6358   unsigned DmaskIdx = (Node->getNumOperands() - Node->getNumValues() == 9) ? 2 : 3;
6359   unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
6360   unsigned NewDmask = 0;
6361 
6362   // Try to figure out the used register components
6363   for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
6364        I != E; ++I) {
6365 
6366     // Don't look at users of the chain.
6367     if (I.getUse().getResNo() != 0)
6368       continue;
6369 
6370     // Abort if we can't understand the usage
6371     if (!I->isMachineOpcode() ||
6372         I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
6373       return;
6374 
6375     // Lane means which subreg of %VGPRa_VGPRb_VGPRc_VGPRd is used.
6376     // Note that subregs are packed, i.e. Lane==0 is the first bit set
6377     // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
6378     // set, etc.
6379     Lane = SubIdx2Lane(I->getConstantOperandVal(1));
6380 
6381     // Set which texture component corresponds to the lane.
6382     unsigned Comp;
6383     for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) {
6384       assert(Dmask);
6385       Comp = countTrailingZeros(Dmask);
6386       Dmask &= ~(1 << Comp);
6387     }
6388 
6389     // Abort if we have more than one user per component
6390     if (Users[Lane])
6391       return;
6392 
6393     Users[Lane] = *I;
6394     NewDmask |= 1 << Comp;
6395   }
6396 
6397   // Abort if there's no change
6398   if (NewDmask == OldDmask)
6399     return;
6400 
6401   // Adjust the writemask in the node
6402   std::vector<SDValue> Ops;
6403   Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
6404   Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
6405   Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
6406   Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops);
6407 
6408   // If we only got one lane, replace it with a copy
6409   // (if NewDmask has only one bit set...)
6410   if (NewDmask && (NewDmask & (NewDmask-1)) == 0) {
6411     SDValue RC = DAG.getTargetConstant(AMDGPU::VGPR_32RegClassID, SDLoc(),
6412                                        MVT::i32);
6413     SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
6414                                       SDLoc(), Users[Lane]->getValueType(0),
6415                                       SDValue(Node, 0), RC);
6416     DAG.ReplaceAllUsesWith(Users[Lane], Copy);
6417     return;
6418   }
6419 
6420   // Update the users of the node with the new indices
6421   for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) {
6422     SDNode *User = Users[i];
6423     if (!User)
6424       continue;
6425 
6426     SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
6427     DAG.UpdateNodeOperands(User, User->getOperand(0), Op);
6428 
6429     switch (Idx) {
6430     default: break;
6431     case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
6432     case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
6433     case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
6434     }
6435   }
6436 }
6437 
6438 static bool isFrameIndexOp(SDValue Op) {
6439   if (Op.getOpcode() == ISD::AssertZext)
6440     Op = Op.getOperand(0);
6441 
6442   return isa<FrameIndexSDNode>(Op);
6443 }
6444 
6445 /// \brief Legalize target independent instructions (e.g. INSERT_SUBREG)
6446 /// with frame index operands.
6447 /// LLVM assumes that inputs are to these instructions are registers.
6448 SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
6449                                                         SelectionDAG &DAG) const {
6450   if (Node->getOpcode() == ISD::CopyToReg) {
6451     RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
6452     SDValue SrcVal = Node->getOperand(2);
6453 
6454     // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
6455     // to try understanding copies to physical registers.
6456     if (SrcVal.getValueType() == MVT::i1 &&
6457         TargetRegisterInfo::isPhysicalRegister(DestReg->getReg())) {
6458       SDLoc SL(Node);
6459       MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
6460       SDValue VReg = DAG.getRegister(
6461         MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
6462 
6463       SDNode *Glued = Node->getGluedNode();
6464       SDValue ToVReg
6465         = DAG.getCopyToReg(Node->getOperand(0), SL, VReg, SrcVal,
6466                          SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
6467       SDValue ToResultReg
6468         = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
6469                            VReg, ToVReg.getValue(1));
6470       DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
6471       DAG.RemoveDeadNode(Node);
6472       return ToResultReg.getNode();
6473     }
6474   }
6475 
6476   SmallVector<SDValue, 8> Ops;
6477   for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
6478     if (!isFrameIndexOp(Node->getOperand(i))) {
6479       Ops.push_back(Node->getOperand(i));
6480       continue;
6481     }
6482 
6483     SDLoc DL(Node);
6484     Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
6485                                      Node->getOperand(i).getValueType(),
6486                                      Node->getOperand(i)), 0));
6487   }
6488 
6489   DAG.UpdateNodeOperands(Node, Ops);
6490   return Node;
6491 }
6492 
6493 /// \brief Fold the instructions after selecting them.
6494 SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
6495                                           SelectionDAG &DAG) const {
6496   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
6497   unsigned Opcode = Node->getMachineOpcode();
6498 
6499   if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() &&
6500       !TII->isGather4(Opcode))
6501     adjustWritemask(Node, DAG);
6502 
6503   if (Opcode == AMDGPU::INSERT_SUBREG ||
6504       Opcode == AMDGPU::REG_SEQUENCE) {
6505     legalizeTargetIndependentNode(Node, DAG);
6506     return Node;
6507   }
6508 
6509   switch (Opcode) {
6510   case AMDGPU::V_DIV_SCALE_F32:
6511   case AMDGPU::V_DIV_SCALE_F64: {
6512     // Satisfy the operand register constraint when one of the inputs is
6513     // undefined. Ordinarily each undef value will have its own implicit_def of
6514     // a vreg, so force these to use a single register.
6515     SDValue Src0 = Node->getOperand(0);
6516     SDValue Src1 = Node->getOperand(1);
6517     SDValue Src2 = Node->getOperand(2);
6518 
6519     if ((Src0.isMachineOpcode() &&
6520          Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
6521         (Src0 == Src1 || Src0 == Src2))
6522       break;
6523 
6524     MVT VT = Src0.getValueType().getSimpleVT();
6525     const TargetRegisterClass *RC = getRegClassFor(VT);
6526 
6527     MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
6528     SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
6529 
6530     SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node),
6531                                       UndefReg, Src0, SDValue());
6532 
6533     // src0 must be the same register as src1 or src2, even if the value is
6534     // undefined, so make sure we don't violate this constraint.
6535     if (Src0.isMachineOpcode() &&
6536         Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
6537       if (Src1.isMachineOpcode() &&
6538           Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
6539         Src0 = Src1;
6540       else if (Src2.isMachineOpcode() &&
6541                Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
6542         Src0 = Src2;
6543       else {
6544         assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
6545         Src0 = UndefReg;
6546         Src1 = UndefReg;
6547       }
6548     } else
6549       break;
6550 
6551     SmallVector<SDValue, 4> Ops = { Src0, Src1, Src2 };
6552     for (unsigned I = 3, N = Node->getNumOperands(); I != N; ++I)
6553       Ops.push_back(Node->getOperand(I));
6554 
6555     Ops.push_back(ImpDef.getValue(1));
6556     return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
6557   }
6558   default:
6559     break;
6560   }
6561 
6562   return Node;
6563 }
6564 
6565 /// \brief Assign the register class depending on the number of
6566 /// bits set in the writemask
6567 void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
6568                                                      SDNode *Node) const {
6569   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
6570 
6571   MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
6572 
6573   if (TII->isVOP3(MI.getOpcode())) {
6574     // Make sure constant bus requirements are respected.
6575     TII->legalizeOperandsVOP3(MRI, MI);
6576     return;
6577   }
6578 
6579   if (TII->isMIMG(MI)) {
6580     unsigned VReg = MI.getOperand(0).getReg();
6581     const TargetRegisterClass *RC = MRI.getRegClass(VReg);
6582     // TODO: Need mapping tables to handle other cases (register classes).
6583     if (RC != &AMDGPU::VReg_128RegClass)
6584       return;
6585 
6586     unsigned DmaskIdx = MI.getNumOperands() == 12 ? 3 : 4;
6587     unsigned Writemask = MI.getOperand(DmaskIdx).getImm();
6588     unsigned BitsSet = 0;
6589     for (unsigned i = 0; i < 4; ++i)
6590       BitsSet += Writemask & (1 << i) ? 1 : 0;
6591     switch (BitsSet) {
6592     default: return;
6593     case 1:  RC = &AMDGPU::VGPR_32RegClass; break;
6594     case 2:  RC = &AMDGPU::VReg_64RegClass; break;
6595     case 3:  RC = &AMDGPU::VReg_96RegClass; break;
6596     }
6597 
6598     unsigned NewOpcode = TII->getMaskedMIMGOp(MI.getOpcode(), BitsSet);
6599     MI.setDesc(TII->get(NewOpcode));
6600     MRI.setRegClass(VReg, RC);
6601     return;
6602   }
6603 
6604   // Replace unused atomics with the no return version.
6605   int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode());
6606   if (NoRetAtomicOp != -1) {
6607     if (!Node->hasAnyUseOfValue(0)) {
6608       MI.setDesc(TII->get(NoRetAtomicOp));
6609       MI.RemoveOperand(0);
6610       return;
6611     }
6612 
6613     // For mubuf_atomic_cmpswap, we need to have tablegen use an extract_subreg
6614     // instruction, because the return type of these instructions is a vec2 of
6615     // the memory type, so it can be tied to the input operand.
6616     // This means these instructions always have a use, so we need to add a
6617     // special case to check if the atomic has only one extract_subreg use,
6618     // which itself has no uses.
6619     if ((Node->hasNUsesOfValue(1, 0) &&
6620          Node->use_begin()->isMachineOpcode() &&
6621          Node->use_begin()->getMachineOpcode() == AMDGPU::EXTRACT_SUBREG &&
6622          !Node->use_begin()->hasAnyUseOfValue(0))) {
6623       unsigned Def = MI.getOperand(0).getReg();
6624 
6625       // Change this into a noret atomic.
6626       MI.setDesc(TII->get(NoRetAtomicOp));
6627       MI.RemoveOperand(0);
6628 
6629       // If we only remove the def operand from the atomic instruction, the
6630       // extract_subreg will be left with a use of a vreg without a def.
6631       // So we need to insert an implicit_def to avoid machine verifier
6632       // errors.
6633       BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
6634               TII->get(AMDGPU::IMPLICIT_DEF), Def);
6635     }
6636     return;
6637   }
6638 }
6639 
6640 static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
6641                               uint64_t Val) {
6642   SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
6643   return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
6644 }
6645 
6646 MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
6647                                                 const SDLoc &DL,
6648                                                 SDValue Ptr) const {
6649   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
6650 
6651   // Build the half of the subregister with the constants before building the
6652   // full 128-bit register. If we are building multiple resource descriptors,
6653   // this will allow CSEing of the 2-component register.
6654   const SDValue Ops0[] = {
6655     DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
6656     buildSMovImm32(DAG, DL, 0),
6657     DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
6658     buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
6659     DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
6660   };
6661 
6662   SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
6663                                                 MVT::v2i32, Ops0), 0);
6664 
6665   // Combine the constants and the pointer.
6666   const SDValue Ops1[] = {
6667     DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
6668     Ptr,
6669     DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
6670     SubRegHi,
6671     DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
6672   };
6673 
6674   return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
6675 }
6676 
6677 /// \brief Return a resource descriptor with the 'Add TID' bit enabled
6678 ///        The TID (Thread ID) is multiplied by the stride value (bits [61:48]
6679 ///        of the resource descriptor) to create an offset, which is added to
6680 ///        the resource pointer.
6681 MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
6682                                            SDValue Ptr, uint32_t RsrcDword1,
6683                                            uint64_t RsrcDword2And3) const {
6684   SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
6685   SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
6686   if (RsrcDword1) {
6687     PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
6688                                      DAG.getConstant(RsrcDword1, DL, MVT::i32)),
6689                     0);
6690   }
6691 
6692   SDValue DataLo = buildSMovImm32(DAG, DL,
6693                                   RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
6694   SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
6695 
6696   const SDValue Ops[] = {
6697     DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
6698     PtrLo,
6699     DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
6700     PtrHi,
6701     DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
6702     DataLo,
6703     DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
6704     DataHi,
6705     DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)
6706   };
6707 
6708   return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
6709 }
6710 
6711 //===----------------------------------------------------------------------===//
6712 //                         SI Inline Assembly Support
6713 //===----------------------------------------------------------------------===//
6714 
6715 std::pair<unsigned, const TargetRegisterClass *>
6716 SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
6717                                                StringRef Constraint,
6718                                                MVT VT) const {
6719   if (!isTypeLegal(VT))
6720     return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
6721 
6722   if (Constraint.size() == 1) {
6723     switch (Constraint[0]) {
6724     case 's':
6725     case 'r':
6726       switch (VT.getSizeInBits()) {
6727       default:
6728         return std::make_pair(0U, nullptr);
6729       case 32:
6730       case 16:
6731         return std::make_pair(0U, &AMDGPU::SReg_32_XM0RegClass);
6732       case 64:
6733         return std::make_pair(0U, &AMDGPU::SGPR_64RegClass);
6734       case 128:
6735         return std::make_pair(0U, &AMDGPU::SReg_128RegClass);
6736       case 256:
6737         return std::make_pair(0U, &AMDGPU::SReg_256RegClass);
6738       case 512:
6739         return std::make_pair(0U, &AMDGPU::SReg_512RegClass);
6740       }
6741 
6742     case 'v':
6743       switch (VT.getSizeInBits()) {
6744       default:
6745         return std::make_pair(0U, nullptr);
6746       case 32:
6747       case 16:
6748         return std::make_pair(0U, &AMDGPU::VGPR_32RegClass);
6749       case 64:
6750         return std::make_pair(0U, &AMDGPU::VReg_64RegClass);
6751       case 96:
6752         return std::make_pair(0U, &AMDGPU::VReg_96RegClass);
6753       case 128:
6754         return std::make_pair(0U, &AMDGPU::VReg_128RegClass);
6755       case 256:
6756         return std::make_pair(0U, &AMDGPU::VReg_256RegClass);
6757       case 512:
6758         return std::make_pair(0U, &AMDGPU::VReg_512RegClass);
6759       }
6760     }
6761   }
6762 
6763   if (Constraint.size() > 1) {
6764     const TargetRegisterClass *RC = nullptr;
6765     if (Constraint[1] == 'v') {
6766       RC = &AMDGPU::VGPR_32RegClass;
6767     } else if (Constraint[1] == 's') {
6768       RC = &AMDGPU::SGPR_32RegClass;
6769     }
6770 
6771     if (RC) {
6772       uint32_t Idx;
6773       bool Failed = Constraint.substr(2).getAsInteger(10, Idx);
6774       if (!Failed && Idx < RC->getNumRegs())
6775         return std::make_pair(RC->getRegister(Idx), RC);
6776     }
6777   }
6778   return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
6779 }
6780 
6781 SITargetLowering::ConstraintType
6782 SITargetLowering::getConstraintType(StringRef Constraint) const {
6783   if (Constraint.size() == 1) {
6784     switch (Constraint[0]) {
6785     default: break;
6786     case 's':
6787     case 'v':
6788       return C_RegisterClass;
6789     }
6790   }
6791   return TargetLowering::getConstraintType(Constraint);
6792 }
6793 
6794 // Figure out which registers should be reserved for stack access. Only after
6795 // the function is legalized do we know all of the non-spill stack objects or if
6796 // calls are present.
6797 void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
6798   MachineRegisterInfo &MRI = MF.getRegInfo();
6799   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
6800   const MachineFrameInfo &MFI = MF.getFrameInfo();
6801   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
6802   const SIRegisterInfo *TRI = ST.getRegisterInfo();
6803 
6804   if (Info->isEntryFunction()) {
6805     // Callable functions have fixed registers used for stack access.
6806     reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
6807   }
6808 
6809   // We have to assume the SP is needed in case there are calls in the function
6810   // during lowering. Calls are only detected after the function is
6811   // lowered. We're about to reserve registers, so don't bother using it if we
6812   // aren't really going to use it.
6813   bool NeedSP = !Info->isEntryFunction() ||
6814     MFI.hasVarSizedObjects() ||
6815     MFI.hasCalls();
6816 
6817   if (NeedSP) {
6818     unsigned ReservedStackPtrOffsetReg = TRI->reservedStackPtrOffsetReg(MF);
6819     Info->setStackPtrOffsetReg(ReservedStackPtrOffsetReg);
6820 
6821     assert(Info->getStackPtrOffsetReg() != Info->getFrameOffsetReg());
6822     assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
6823                                Info->getStackPtrOffsetReg()));
6824     MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
6825   }
6826 
6827   MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
6828   MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
6829   MRI.replaceRegWith(AMDGPU::SCRATCH_WAVE_OFFSET_REG,
6830                      Info->getScratchWaveOffsetReg());
6831 
6832   TargetLoweringBase::finalizeLowering(MF);
6833 }
6834