1 //===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Custom DAG lowering for SI
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #if defined(_MSC_VER) || defined(__MINGW32__)
15 // Provide M_PI.
16 #define _USE_MATH_DEFINES
17 #endif
18 
19 #include "SIISelLowering.h"
20 #include "AMDGPU.h"
21 #include "AMDGPUSubtarget.h"
22 #include "AMDGPUTargetMachine.h"
23 #include "SIDefines.h"
24 #include "SIInstrInfo.h"
25 #include "SIMachineFunctionInfo.h"
26 #include "SIRegisterInfo.h"
27 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
28 #include "Utils/AMDGPUBaseInfo.h"
29 #include "llvm/ADT/APFloat.h"
30 #include "llvm/ADT/APInt.h"
31 #include "llvm/ADT/ArrayRef.h"
32 #include "llvm/ADT/BitVector.h"
33 #include "llvm/ADT/SmallVector.h"
34 #include "llvm/ADT/Statistic.h"
35 #include "llvm/ADT/StringRef.h"
36 #include "llvm/ADT/StringSwitch.h"
37 #include "llvm/ADT/Twine.h"
38 #include "llvm/CodeGen/Analysis.h"
39 #include "llvm/CodeGen/CallingConvLower.h"
40 #include "llvm/CodeGen/DAGCombine.h"
41 #include "llvm/CodeGen/ISDOpcodes.h"
42 #include "llvm/CodeGen/MachineBasicBlock.h"
43 #include "llvm/CodeGen/MachineFrameInfo.h"
44 #include "llvm/CodeGen/MachineFunction.h"
45 #include "llvm/CodeGen/MachineInstr.h"
46 #include "llvm/CodeGen/MachineInstrBuilder.h"
47 #include "llvm/CodeGen/MachineMemOperand.h"
48 #include "llvm/CodeGen/MachineModuleInfo.h"
49 #include "llvm/CodeGen/MachineOperand.h"
50 #include "llvm/CodeGen/MachineRegisterInfo.h"
51 #include "llvm/CodeGen/SelectionDAG.h"
52 #include "llvm/CodeGen/SelectionDAGNodes.h"
53 #include "llvm/CodeGen/TargetCallingConv.h"
54 #include "llvm/CodeGen/TargetRegisterInfo.h"
55 #include "llvm/CodeGen/ValueTypes.h"
56 #include "llvm/IR/Constants.h"
57 #include "llvm/IR/DataLayout.h"
58 #include "llvm/IR/DebugLoc.h"
59 #include "llvm/IR/DerivedTypes.h"
60 #include "llvm/IR/DiagnosticInfo.h"
61 #include "llvm/IR/Function.h"
62 #include "llvm/IR/GlobalValue.h"
63 #include "llvm/IR/InstrTypes.h"
64 #include "llvm/IR/Instruction.h"
65 #include "llvm/IR/Instructions.h"
66 #include "llvm/IR/IntrinsicInst.h"
67 #include "llvm/IR/Type.h"
68 #include "llvm/Support/Casting.h"
69 #include "llvm/Support/CodeGen.h"
70 #include "llvm/Support/CommandLine.h"
71 #include "llvm/Support/Compiler.h"
72 #include "llvm/Support/ErrorHandling.h"
73 #include "llvm/Support/KnownBits.h"
74 #include "llvm/Support/MachineValueType.h"
75 #include "llvm/Support/MathExtras.h"
76 #include "llvm/Target/TargetOptions.h"
77 #include <cassert>
78 #include <cmath>
79 #include <cstdint>
80 #include <iterator>
81 #include <tuple>
82 #include <utility>
83 #include <vector>
84 
85 using namespace llvm;
86 
87 #define DEBUG_TYPE "si-lower"
88 
89 STATISTIC(NumTailCalls, "Number of tail calls");
90 
91 static cl::opt<bool> EnableVGPRIndexMode(
92   "amdgpu-vgpr-index-mode",
93   cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
94   cl::init(false));
95 
96 static cl::opt<bool> DisableLoopAlignment(
97   "amdgpu-disable-loop-alignment",
98   cl::desc("Do not align and prefetch loops"),
99   cl::init(false));
100 
101 static unsigned findFirstFreeSGPR(CCState &CCInfo) {
102   unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
103   for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
104     if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
105       return AMDGPU::SGPR0 + Reg;
106     }
107   }
108   llvm_unreachable("Cannot allocate sgpr");
109 }
110 
111 SITargetLowering::SITargetLowering(const TargetMachine &TM,
112                                    const GCNSubtarget &STI)
113     : AMDGPUTargetLowering(TM, STI),
114       Subtarget(&STI) {
115   addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
116   addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
117 
118   addRegisterClass(MVT::i32, &AMDGPU::SReg_32_XM0RegClass);
119   addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
120 
121   addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
122   addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
123   addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);
124 
125   addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
126   addRegisterClass(MVT::v3f32, &AMDGPU::VReg_96RegClass);
127 
128   addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass);
129   addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass);
130 
131   addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
132   addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
133 
134   addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
135   addRegisterClass(MVT::v5f32, &AMDGPU::VReg_160RegClass);
136 
137   addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass);
138   addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);
139 
140   addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass);
141   addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);
142 
143   if (Subtarget->has16BitInsts()) {
144     addRegisterClass(MVT::i16, &AMDGPU::SReg_32_XM0RegClass);
145     addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass);
146 
147     // Unless there are also VOP3P operations, these ops are not really legal.
148     addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32_XM0RegClass);
149     addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32_XM0RegClass);
150     addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
151     addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
152   }
153 
154   computeRegisterProperties(Subtarget->getRegisterInfo());
155 
156   // We need to custom lower vector loads and stores from local memory.
157   setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
158   setOperationAction(ISD::LOAD, MVT::v3i32, Custom);
159   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
160   setOperationAction(ISD::LOAD, MVT::v5i32, Custom);
161   setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
162   setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
163   setOperationAction(ISD::LOAD, MVT::i1, Custom);
164   setOperationAction(ISD::LOAD, MVT::v32i32, Custom);
165 
166   setOperationAction(ISD::STORE, MVT::v2i32, Custom);
167   setOperationAction(ISD::STORE, MVT::v3i32, Custom);
168   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
169   setOperationAction(ISD::STORE, MVT::v5i32, Custom);
170   setOperationAction(ISD::STORE, MVT::v8i32, Custom);
171   setOperationAction(ISD::STORE, MVT::v16i32, Custom);
172   setOperationAction(ISD::STORE, MVT::i1, Custom);
173   setOperationAction(ISD::STORE, MVT::v32i32, Custom);
174 
175   setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
176   setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
177   setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
178   setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
179   setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
180   setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
181   setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
182   setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
183   setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
184   setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
185 
186   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
187   setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
188 
189   setOperationAction(ISD::SELECT, MVT::i1, Promote);
190   setOperationAction(ISD::SELECT, MVT::i64, Custom);
191   setOperationAction(ISD::SELECT, MVT::f64, Promote);
192   AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
193 
194   setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
195   setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
196   setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
197   setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
198   setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
199 
200   setOperationAction(ISD::SETCC, MVT::i1, Promote);
201   setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
202   setOperationAction(ISD::SETCC, MVT::v4i1, Expand);
203   AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
204 
205   setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand);
206   setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
207 
208   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
209   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);
210   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
211   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom);
212   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
213   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);
214   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);
215 
216   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
217   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
218   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
219   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);
220   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f16, Custom);
221   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom);
222   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom);
223 
224   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom);
225   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom);
226   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v8f16, Custom);
227   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
228   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i16, Custom);
229   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
230 
231   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
232   setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom);
233   setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom);
234   setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom);
235   setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom);
236   setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom);
237 
238   setOperationAction(ISD::BRCOND, MVT::Other, Custom);
239   setOperationAction(ISD::BR_CC, MVT::i1, Expand);
240   setOperationAction(ISD::BR_CC, MVT::i32, Expand);
241   setOperationAction(ISD::BR_CC, MVT::i64, Expand);
242   setOperationAction(ISD::BR_CC, MVT::f32, Expand);
243   setOperationAction(ISD::BR_CC, MVT::f64, Expand);
244 
245   setOperationAction(ISD::UADDO, MVT::i32, Legal);
246   setOperationAction(ISD::USUBO, MVT::i32, Legal);
247 
248   setOperationAction(ISD::ADDCARRY, MVT::i32, Legal);
249   setOperationAction(ISD::SUBCARRY, MVT::i32, Legal);
250 
251   setOperationAction(ISD::SHL_PARTS, MVT::i64, Expand);
252   setOperationAction(ISD::SRA_PARTS, MVT::i64, Expand);
253   setOperationAction(ISD::SRL_PARTS, MVT::i64, Expand);
254 
255 #if 0
256   setOperationAction(ISD::ADDCARRY, MVT::i64, Legal);
257   setOperationAction(ISD::SUBCARRY, MVT::i64, Legal);
258 #endif
259 
260   // We only support LOAD/STORE and vector manipulation ops for vectors
261   // with > 4 elements.
262   for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
263         MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16, MVT::v32i32 }) {
264     for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
265       switch (Op) {
266       case ISD::LOAD:
267       case ISD::STORE:
268       case ISD::BUILD_VECTOR:
269       case ISD::BITCAST:
270       case ISD::EXTRACT_VECTOR_ELT:
271       case ISD::INSERT_VECTOR_ELT:
272       case ISD::INSERT_SUBVECTOR:
273       case ISD::EXTRACT_SUBVECTOR:
274       case ISD::SCALAR_TO_VECTOR:
275         break;
276       case ISD::CONCAT_VECTORS:
277         setOperationAction(Op, VT, Custom);
278         break;
279       default:
280         setOperationAction(Op, VT, Expand);
281         break;
282       }
283     }
284   }
285 
286   setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Expand);
287 
288   // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
289   // is expanded to avoid having two separate loops in case the index is a VGPR.
290 
291   // Most operations are naturally 32-bit vector operations. We only support
292   // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
293   for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
294     setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
295     AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
296 
297     setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
298     AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
299 
300     setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
301     AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
302 
303     setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
304     AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
305   }
306 
307   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
308   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
309   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
310   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);
311 
312   setOperationAction(ISD::BUILD_VECTOR, MVT::v4f16, Custom);
313   setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);
314 
315   // Avoid stack access for these.
316   // TODO: Generalize to more vector types.
317   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Custom);
318   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Custom);
319   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
320   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom);
321 
322   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
323   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
324   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i8, Custom);
325   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom);
326   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i8, Custom);
327 
328   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i8, Custom);
329   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom);
330   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i8, Custom);
331 
332   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i16, Custom);
333   setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f16, Custom);
334   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
335   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom);
336 
337   // Deal with vec3 vector operations when widened to vec4.
338   setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v3i32, Expand);
339   setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v3f32, Expand);
340   setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i32, Expand);
341   setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4f32, Expand);
342 
343   // Deal with vec5 vector operations when widened to vec8.
344   setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5i32, Expand);
345   setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5f32, Expand);
346   setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i32, Expand);
347   setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8f32, Expand);
348 
349   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling
350   // and output demarshalling.
351   setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
352   setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
353 
354   // We can't return success/failure, only the old value;
355   // let LLVM add the comparison.
356   setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand);
357   setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand);
358 
359   if (Subtarget->hasFlatAddressSpace()) {
360     setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
361     setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
362   }
363 
364   setOperationAction(ISD::BSWAP, MVT::i32, Legal);
365   setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
366 
367   // This is s_memtime on SI and s_memrealtime on VI.
368   setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
369   setOperationAction(ISD::TRAP, MVT::Other, Custom);
370   setOperationAction(ISD::DEBUGTRAP, MVT::Other, Custom);
371 
372   if (Subtarget->has16BitInsts()) {
373     setOperationAction(ISD::FLOG, MVT::f16, Custom);
374     setOperationAction(ISD::FEXP, MVT::f16, Custom);
375     setOperationAction(ISD::FLOG10, MVT::f16, Custom);
376   }
377 
378   // v_mad_f32 does not support denormals according to some sources.
379   if (!Subtarget->hasFP32Denormals())
380     setOperationAction(ISD::FMAD, MVT::f32, Legal);
381 
382   if (!Subtarget->hasBFI()) {
383     // fcopysign can be done in a single instruction with BFI.
384     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
385     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
386   }
387 
388   if (!Subtarget->hasBCNT(32))
389     setOperationAction(ISD::CTPOP, MVT::i32, Expand);
390 
391   if (!Subtarget->hasBCNT(64))
392     setOperationAction(ISD::CTPOP, MVT::i64, Expand);
393 
394   if (Subtarget->hasFFBH())
395     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
396 
397   if (Subtarget->hasFFBL())
398     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom);
399 
400   // We only really have 32-bit BFE instructions (and 16-bit on VI).
401   //
402   // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
403   // effort to match them now. We want this to be false for i64 cases when the
404   // extraction isn't restricted to the upper or lower half. Ideally we would
405   // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
406   // span the midpoint are probably relatively rare, so don't worry about them
407   // for now.
408   if (Subtarget->hasBFE())
409     setHasExtractBitsInsn(true);
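  // Illustrative note (not from the original source): with extract-bits
  // instructions reported, a 32-bit pattern such as (and (srl x, 8), 0xff) is
  // kept intact by generic combines so it can typically be selected to
  // s_bfe_u32 / v_bfe_u32, whereas an i64 extract that crosses bit 32 would
  // still be lowered with 32-bit operations.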
410 
411   setOperationAction(ISD::FMINNUM, MVT::f32, Custom);
412   setOperationAction(ISD::FMAXNUM, MVT::f32, Custom);
413   setOperationAction(ISD::FMINNUM, MVT::f64, Custom);
414   setOperationAction(ISD::FMAXNUM, MVT::f64, Custom);
415 
416 
417   // These are really only legal for ieee_mode functions. We should be avoiding
418   // them for functions that don't have ieee_mode enabled, so just say they are
419   // legal.
420   setOperationAction(ISD::FMINNUM_IEEE, MVT::f32, Legal);
421   setOperationAction(ISD::FMAXNUM_IEEE, MVT::f32, Legal);
422   setOperationAction(ISD::FMINNUM_IEEE, MVT::f64, Legal);
423   setOperationAction(ISD::FMAXNUM_IEEE, MVT::f64, Legal);
424 
425 
426   if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
427     setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
428     setOperationAction(ISD::FCEIL, MVT::f64, Legal);
429     setOperationAction(ISD::FRINT, MVT::f64, Legal);
430   } else {
431     setOperationAction(ISD::FCEIL, MVT::f64, Custom);
432     setOperationAction(ISD::FTRUNC, MVT::f64, Custom);
433     setOperationAction(ISD::FRINT, MVT::f64, Custom);
434     setOperationAction(ISD::FFLOOR, MVT::f64, Custom);
435   }
436 
437   setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
438 
439   setOperationAction(ISD::FSIN, MVT::f32, Custom);
440   setOperationAction(ISD::FCOS, MVT::f32, Custom);
441   setOperationAction(ISD::FDIV, MVT::f32, Custom);
442   setOperationAction(ISD::FDIV, MVT::f64, Custom);
443 
444   if (Subtarget->has16BitInsts()) {
445     setOperationAction(ISD::Constant, MVT::i16, Legal);
446 
447     setOperationAction(ISD::SMIN, MVT::i16, Legal);
448     setOperationAction(ISD::SMAX, MVT::i16, Legal);
449 
450     setOperationAction(ISD::UMIN, MVT::i16, Legal);
451     setOperationAction(ISD::UMAX, MVT::i16, Legal);
452 
453     setOperationAction(ISD::SIGN_EXTEND, MVT::i16, Promote);
454     AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
455 
456     setOperationAction(ISD::ROTR, MVT::i16, Promote);
457     setOperationAction(ISD::ROTL, MVT::i16, Promote);
458 
459     setOperationAction(ISD::SDIV, MVT::i16, Promote);
460     setOperationAction(ISD::UDIV, MVT::i16, Promote);
461     setOperationAction(ISD::SREM, MVT::i16, Promote);
462     setOperationAction(ISD::UREM, MVT::i16, Promote);
463 
464     setOperationAction(ISD::BSWAP, MVT::i16, Promote);
465     setOperationAction(ISD::BITREVERSE, MVT::i16, Promote);
466 
467     setOperationAction(ISD::CTTZ, MVT::i16, Promote);
468     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Promote);
469     setOperationAction(ISD::CTLZ, MVT::i16, Promote);
470     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Promote);
471     setOperationAction(ISD::CTPOP, MVT::i16, Promote);
472 
473     setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);
474 
475     setOperationAction(ISD::BR_CC, MVT::i16, Expand);
476 
477     setOperationAction(ISD::LOAD, MVT::i16, Custom);
478 
479     setTruncStoreAction(MVT::i64, MVT::i16, Expand);
480 
481     setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
482     AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
483     setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
484     AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
485 
486     setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
487     setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
488     setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
489     setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
490 
491     // F16 - Constant Actions.
492     setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
493 
494     // F16 - Load/Store Actions.
495     setOperationAction(ISD::LOAD, MVT::f16, Promote);
496     AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
497     setOperationAction(ISD::STORE, MVT::f16, Promote);
498     AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
499 
500     // F16 - VOP1 Actions.
501     setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
502     setOperationAction(ISD::FCOS, MVT::f16, Promote);
503     setOperationAction(ISD::FSIN, MVT::f16, Promote);
504     setOperationAction(ISD::FP_TO_SINT, MVT::f16, Promote);
505     setOperationAction(ISD::FP_TO_UINT, MVT::f16, Promote);
506     setOperationAction(ISD::SINT_TO_FP, MVT::f16, Promote);
507     setOperationAction(ISD::UINT_TO_FP, MVT::f16, Promote);
508     setOperationAction(ISD::FROUND, MVT::f16, Custom);
509 
510     // F16 - VOP2 Actions.
511     setOperationAction(ISD::BR_CC, MVT::f16, Expand);
512     setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
513 
514     setOperationAction(ISD::FDIV, MVT::f16, Custom);
515 
516     // F16 - VOP3 Actions.
517     setOperationAction(ISD::FMA, MVT::f16, Legal);
518     if (!Subtarget->hasFP16Denormals() && STI.hasMadF16())
519       setOperationAction(ISD::FMAD, MVT::f16, Legal);
520 
521     for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16}) {
522       for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
523         switch (Op) {
524         case ISD::LOAD:
525         case ISD::STORE:
526         case ISD::BUILD_VECTOR:
527         case ISD::BITCAST:
528         case ISD::EXTRACT_VECTOR_ELT:
529         case ISD::INSERT_VECTOR_ELT:
530         case ISD::INSERT_SUBVECTOR:
531         case ISD::EXTRACT_SUBVECTOR:
532         case ISD::SCALAR_TO_VECTOR:
533           break;
534         case ISD::CONCAT_VECTORS:
535           setOperationAction(Op, VT, Custom);
536           break;
537         default:
538           setOperationAction(Op, VT, Expand);
539           break;
540         }
541       }
542     }
543 
544     // XXX - Do these do anything? Vector constants turn into build_vector.
545     setOperationAction(ISD::Constant, MVT::v2i16, Legal);
546     setOperationAction(ISD::ConstantFP, MVT::v2f16, Legal);
547 
548     setOperationAction(ISD::UNDEF, MVT::v2i16, Legal);
549     setOperationAction(ISD::UNDEF, MVT::v2f16, Legal);
550 
551     setOperationAction(ISD::STORE, MVT::v2i16, Promote);
552     AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
553     setOperationAction(ISD::STORE, MVT::v2f16, Promote);
554     AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
555 
556     setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
557     AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
558     setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
559     AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
560 
561     setOperationAction(ISD::AND, MVT::v2i16, Promote);
562     AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
563     setOperationAction(ISD::OR, MVT::v2i16, Promote);
564     AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
565     setOperationAction(ISD::XOR, MVT::v2i16, Promote);
566     AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
567 
568     setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
569     AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
570     setOperationAction(ISD::LOAD, MVT::v4f16, Promote);
571     AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
572 
573     setOperationAction(ISD::STORE, MVT::v4i16, Promote);
574     AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
575     setOperationAction(ISD::STORE, MVT::v4f16, Promote);
576     AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
577 
578     setOperationAction(ISD::ANY_EXTEND, MVT::v2i32, Expand);
579     setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand);
580     setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand);
581     setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
582 
583     setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Expand);
584     setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Expand);
585     setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Expand);
586 
587     if (!Subtarget->hasVOP3PInsts()) {
588       setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom);
589       setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
590     }
591 
592     setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
593     // This isn't really legal, but this avoids the legalizer unrolling it (and
594     // allows matching fneg (fabs x) patterns)
595     setOperationAction(ISD::FABS, MVT::v2f16, Legal);
596 
597     setOperationAction(ISD::FMAXNUM, MVT::f16, Custom);
598     setOperationAction(ISD::FMINNUM, MVT::f16, Custom);
599     setOperationAction(ISD::FMAXNUM_IEEE, MVT::f16, Legal);
600     setOperationAction(ISD::FMINNUM_IEEE, MVT::f16, Legal);
601 
602     setOperationAction(ISD::FMINNUM_IEEE, MVT::v4f16, Custom);
603     setOperationAction(ISD::FMAXNUM_IEEE, MVT::v4f16, Custom);
604 
605     setOperationAction(ISD::FMINNUM, MVT::v4f16, Expand);
606     setOperationAction(ISD::FMAXNUM, MVT::v4f16, Expand);
607   }
608 
609   if (Subtarget->hasVOP3PInsts()) {
610     setOperationAction(ISD::ADD, MVT::v2i16, Legal);
611     setOperationAction(ISD::SUB, MVT::v2i16, Legal);
612     setOperationAction(ISD::MUL, MVT::v2i16, Legal);
613     setOperationAction(ISD::SHL, MVT::v2i16, Legal);
614     setOperationAction(ISD::SRL, MVT::v2i16, Legal);
615     setOperationAction(ISD::SRA, MVT::v2i16, Legal);
616     setOperationAction(ISD::SMIN, MVT::v2i16, Legal);
617     setOperationAction(ISD::UMIN, MVT::v2i16, Legal);
618     setOperationAction(ISD::SMAX, MVT::v2i16, Legal);
619     setOperationAction(ISD::UMAX, MVT::v2i16, Legal);
620 
621     setOperationAction(ISD::FADD, MVT::v2f16, Legal);
622     setOperationAction(ISD::FMUL, MVT::v2f16, Legal);
623     setOperationAction(ISD::FMA, MVT::v2f16, Legal);
624 
625     setOperationAction(ISD::FMINNUM_IEEE, MVT::v2f16, Legal);
626     setOperationAction(ISD::FMAXNUM_IEEE, MVT::v2f16, Legal);
627 
628     setOperationAction(ISD::FCANONICALIZE, MVT::v2f16, Legal);
629 
630     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
631     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
632 
633     setOperationAction(ISD::SHL, MVT::v4i16, Custom);
634     setOperationAction(ISD::SRA, MVT::v4i16, Custom);
635     setOperationAction(ISD::SRL, MVT::v4i16, Custom);
636     setOperationAction(ISD::ADD, MVT::v4i16, Custom);
637     setOperationAction(ISD::SUB, MVT::v4i16, Custom);
638     setOperationAction(ISD::MUL, MVT::v4i16, Custom);
639 
640     setOperationAction(ISD::SMIN, MVT::v4i16, Custom);
641     setOperationAction(ISD::SMAX, MVT::v4i16, Custom);
642     setOperationAction(ISD::UMIN, MVT::v4i16, Custom);
643     setOperationAction(ISD::UMAX, MVT::v4i16, Custom);
644 
645     setOperationAction(ISD::FADD, MVT::v4f16, Custom);
646     setOperationAction(ISD::FMUL, MVT::v4f16, Custom);
647 
648     setOperationAction(ISD::FMAXNUM, MVT::v2f16, Custom);
649     setOperationAction(ISD::FMINNUM, MVT::v2f16, Custom);
650 
651     setOperationAction(ISD::FMINNUM, MVT::v4f16, Custom);
652     setOperationAction(ISD::FMAXNUM, MVT::v4f16, Custom);
653     setOperationAction(ISD::FCANONICALIZE, MVT::v4f16, Custom);
654 
655     setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
656     setOperationAction(ISD::SELECT, MVT::v4i16, Custom);
657     setOperationAction(ISD::SELECT, MVT::v4f16, Custom);
658   }
659 
660   setOperationAction(ISD::FNEG, MVT::v4f16, Custom);
661   setOperationAction(ISD::FABS, MVT::v4f16, Custom);
662 
663   if (Subtarget->has16BitInsts()) {
664     setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
665     AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
666     setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
667     AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
668   } else {
669     // Legalization hack.
670     setOperationAction(ISD::SELECT, MVT::v2i16, Custom);
671     setOperationAction(ISD::SELECT, MVT::v2f16, Custom);
672 
673     setOperationAction(ISD::FNEG, MVT::v2f16, Custom);
674     setOperationAction(ISD::FABS, MVT::v2f16, Custom);
675   }
676 
677   for (MVT VT : { MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8 }) {
678     setOperationAction(ISD::SELECT, VT, Custom);
679   }
680 
681   setTargetDAGCombine(ISD::ADD);
682   setTargetDAGCombine(ISD::ADDCARRY);
683   setTargetDAGCombine(ISD::SUB);
684   setTargetDAGCombine(ISD::SUBCARRY);
685   setTargetDAGCombine(ISD::FADD);
686   setTargetDAGCombine(ISD::FSUB);
687   setTargetDAGCombine(ISD::FMINNUM);
688   setTargetDAGCombine(ISD::FMAXNUM);
689   setTargetDAGCombine(ISD::FMINNUM_IEEE);
690   setTargetDAGCombine(ISD::FMAXNUM_IEEE);
691   setTargetDAGCombine(ISD::FMA);
692   setTargetDAGCombine(ISD::SMIN);
693   setTargetDAGCombine(ISD::SMAX);
694   setTargetDAGCombine(ISD::UMIN);
695   setTargetDAGCombine(ISD::UMAX);
696   setTargetDAGCombine(ISD::SETCC);
697   setTargetDAGCombine(ISD::AND);
698   setTargetDAGCombine(ISD::OR);
699   setTargetDAGCombine(ISD::XOR);
700   setTargetDAGCombine(ISD::SINT_TO_FP);
701   setTargetDAGCombine(ISD::UINT_TO_FP);
702   setTargetDAGCombine(ISD::FCANONICALIZE);
703   setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
704   setTargetDAGCombine(ISD::ZERO_EXTEND);
705   setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
706   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
707   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
708 
709   // All memory operations. Some folding on the pointer operand is done to help
710   // matching the constant offsets in the addressing modes.
711   setTargetDAGCombine(ISD::LOAD);
712   setTargetDAGCombine(ISD::STORE);
713   setTargetDAGCombine(ISD::ATOMIC_LOAD);
714   setTargetDAGCombine(ISD::ATOMIC_STORE);
715   setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP);
716   setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS);
717   setTargetDAGCombine(ISD::ATOMIC_SWAP);
718   setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD);
719   setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB);
720   setTargetDAGCombine(ISD::ATOMIC_LOAD_AND);
721   setTargetDAGCombine(ISD::ATOMIC_LOAD_OR);
722   setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR);
723   setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND);
724   setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN);
725   setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX);
726   setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN);
727   setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX);
728   setTargetDAGCombine(ISD::ATOMIC_LOAD_FADD);
729 
730   setSchedulingPreference(Sched::RegPressure);
731 }
732 
733 const GCNSubtarget *SITargetLowering::getSubtarget() const {
734   return Subtarget;
735 }
736 
737 //===----------------------------------------------------------------------===//
738 // TargetLowering queries
739 //===----------------------------------------------------------------------===//
740 
741 // v_mad_mix* support a conversion from f16 to f32.
742 //
743 // The only special case where this is not OK to use is when f32 denormals are
744 // enabled, which we don't currently handle.
745 bool SITargetLowering::isFPExtFoldable(unsigned Opcode,
746                                            EVT DestVT, EVT SrcVT) const {
747   return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
748           (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
749          DestVT.getScalarType() == MVT::f32 && !Subtarget->hasFP32Denormals() &&
750          SrcVT.getScalarType() == MVT::f16;
751 }
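// Illustrative example (not from the original source): for something like
// fma (fpext f16:$a), f32:$b, f32:$c on a subtarget with fma-mix instructions
// and f32 denormals disabled, this returns true so the extension can be folded
// into a single mixed-precision instruction (e.g. v_fma_mix_f32); with f32
// denormals enabled it returns false and the fpext stays separate.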
752 
753 bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
754   // SI has some legal vector types, but no legal vector operations. Say no
755   // shuffles are legal in order to prefer scalarizing some vector operations.
756   return false;
757 }
758 
759 MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
760                                                     CallingConv::ID CC,
761                                                     EVT VT) const {
762   // TODO: Consider splitting all arguments into 32-bit pieces.
763   if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
764     EVT ScalarVT = VT.getScalarType();
765     unsigned Size = ScalarVT.getSizeInBits();
766     if (Size == 32)
767       return ScalarVT.getSimpleVT();
768 
769     if (Size == 64)
770       return MVT::i32;
771 
772     if (Size == 16 && Subtarget->has16BitInsts())
773       return VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
774   }
775 
776   return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
777 }
778 
779 unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
780                                                          CallingConv::ID CC,
781                                                          EVT VT) const {
782   if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
783     unsigned NumElts = VT.getVectorNumElements();
784     EVT ScalarVT = VT.getScalarType();
785     unsigned Size = ScalarVT.getSizeInBits();
786 
787     if (Size == 32)
788       return NumElts;
789 
790     if (Size == 64)
791       return 2 * NumElts;
792 
793     if (Size == 16 && Subtarget->has16BitInsts())
794       return (VT.getVectorNumElements() + 1) / 2;
795   }
796 
797   return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
798 }
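// Illustrative examples of the two queries above for a non-kernel calling
// convention (assuming 16-bit instructions are present): v4f32 is passed in
// 4 x f32 registers, v2i64 is split into 4 x i32 registers, and v3f16 is
// packed into 2 x v2f16 registers with the odd element using half of the
// last register.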
799 
800 unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
801   LLVMContext &Context, CallingConv::ID CC,
802   EVT VT, EVT &IntermediateVT,
803   unsigned &NumIntermediates, MVT &RegisterVT) const {
804   if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
805     unsigned NumElts = VT.getVectorNumElements();
806     EVT ScalarVT = VT.getScalarType();
807     unsigned Size = ScalarVT.getSizeInBits();
808     if (Size == 32) {
809       RegisterVT = ScalarVT.getSimpleVT();
810       IntermediateVT = RegisterVT;
811       NumIntermediates = NumElts;
812       return NumIntermediates;
813     }
814 
815     if (Size == 64) {
816       RegisterVT = MVT::i32;
817       IntermediateVT = RegisterVT;
818       NumIntermediates = 2 * NumElts;
819       return NumIntermediates;
820     }
821 
822     // FIXME: We should fix the ABI to be the same on targets without 16-bit
823     // support, but unless we can properly handle 3-vectors, it will still be
824     // inconsistent.
825     if (Size == 16 && Subtarget->has16BitInsts()) {
826       RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
827       IntermediateVT = RegisterVT;
828       NumIntermediates = (NumElts + 1) / 2;
829       return NumIntermediates;
830     }
831   }
832 
833   return TargetLowering::getVectorTypeBreakdownForCallingConv(
834     Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
835 }
836 
837 static MVT memVTFromAggregate(Type *Ty) {
838   // Only limited forms of aggregate type are currently expected.
839   assert(Ty->isStructTy() && "Expected struct type");
840 
841 
842   Type *ElementType = nullptr;
843   unsigned NumElts;
844   if (Ty->getContainedType(0)->isVectorTy()) {
845     VectorType *VecComponent = cast<VectorType>(Ty->getContainedType(0));
846     ElementType = VecComponent->getElementType();
847     NumElts = VecComponent->getNumElements();
848   } else {
849     ElementType = Ty->getContainedType(0);
850     NumElts = 1;
851   }
852 
853   assert((Ty->getContainedType(1) && Ty->getContainedType(1)->isIntegerTy(32)) && "Expected int32 type");
854 
855   // Calculate the size of the memVT type from the aggregate
856   unsigned Pow2Elts = 0;
857   unsigned ElementSize;
858   switch (ElementType->getTypeID()) {
859     default:
860       llvm_unreachable("Unknown type!");
861     case Type::IntegerTyID:
862       ElementSize = cast<IntegerType>(ElementType)->getBitWidth();
863       break;
864     case Type::HalfTyID:
865       ElementSize = 16;
866       break;
867     case Type::FloatTyID:
868       ElementSize = 32;
869       break;
870   }
871   unsigned AdditionalElts = ElementSize == 16 ? 2 : 1;
872   Pow2Elts = 1 << Log2_32_Ceil(NumElts + AdditionalElts);
873 
874   return MVT::getVectorVT(MVT::getVT(ElementType, false),
875                           Pow2Elts);
876 }
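// Illustrative examples (not from the original source) of the rounding above:
// { float, i32 } has NumElts = 1 and AdditionalElts = 1, giving v2f32, while
// { <4 x half>, i32 } has NumElts = 4 and AdditionalElts = 2, which rounds up
// to the next power of two and gives v8f16.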
877 
878 bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
879                                           const CallInst &CI,
880                                           MachineFunction &MF,
881                                           unsigned IntrID) const {
882   if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
883           AMDGPU::lookupRsrcIntrinsic(IntrID)) {
884     AttributeList Attr = Intrinsic::getAttributes(CI.getContext(),
885                                                   (Intrinsic::ID)IntrID);
886     if (Attr.hasFnAttribute(Attribute::ReadNone))
887       return false;
888 
889     SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
890 
891     if (RsrcIntr->IsImage) {
892       Info.ptrVal = MFI->getImagePSV(
893         *MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
894         CI.getArgOperand(RsrcIntr->RsrcArg));
895       Info.align = 0;
896     } else {
897       Info.ptrVal = MFI->getBufferPSV(
898         *MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
899         CI.getArgOperand(RsrcIntr->RsrcArg));
900     }
901 
902     Info.flags = MachineMemOperand::MODereferenceable;
903     if (Attr.hasFnAttribute(Attribute::ReadOnly)) {
904       Info.opc = ISD::INTRINSIC_W_CHAIN;
905       Info.memVT = MVT::getVT(CI.getType(), true);
906       if (Info.memVT == MVT::Other) {
907         // Some intrinsics return an aggregate type - special case to work out
908         // the correct memVT
909         Info.memVT = memVTFromAggregate(CI.getType());
910       }
911       Info.flags |= MachineMemOperand::MOLoad;
912     } else if (Attr.hasFnAttribute(Attribute::WriteOnly)) {
913       Info.opc = ISD::INTRINSIC_VOID;
914       Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
915       Info.flags |= MachineMemOperand::MOStore;
916     } else {
917       // Atomic
918       Info.opc = ISD::INTRINSIC_W_CHAIN;
919       Info.memVT = MVT::getVT(CI.getType());
920       Info.flags = MachineMemOperand::MOLoad |
921                    MachineMemOperand::MOStore |
922                    MachineMemOperand::MODereferenceable;
923 
924       // XXX - Should this be volatile without known ordering?
925       Info.flags |= MachineMemOperand::MOVolatile;
926     }
927     return true;
928   }
929 
930   switch (IntrID) {
931   case Intrinsic::amdgcn_atomic_inc:
932   case Intrinsic::amdgcn_atomic_dec:
933   case Intrinsic::amdgcn_ds_ordered_add:
934   case Intrinsic::amdgcn_ds_ordered_swap:
935   case Intrinsic::amdgcn_ds_fadd:
936   case Intrinsic::amdgcn_ds_fmin:
937   case Intrinsic::amdgcn_ds_fmax: {
938     Info.opc = ISD::INTRINSIC_W_CHAIN;
939     Info.memVT = MVT::getVT(CI.getType());
940     Info.ptrVal = CI.getOperand(0);
941     Info.align = 0;
942     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
943 
944     const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
945     if (!Vol->isZero())
946       Info.flags |= MachineMemOperand::MOVolatile;
947 
948     return true;
949   }
950   case Intrinsic::amdgcn_ds_append:
951   case Intrinsic::amdgcn_ds_consume: {
952     Info.opc = ISD::INTRINSIC_W_CHAIN;
953     Info.memVT = MVT::getVT(CI.getType());
954     Info.ptrVal = CI.getOperand(0);
955     Info.align = 0;
956     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
957 
958     const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
959     if (!Vol->isZero())
960       Info.flags |= MachineMemOperand::MOVolatile;
961 
962     return true;
963   }
964   default:
965     return false;
966   }
967 }
968 
969 bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
970                                             SmallVectorImpl<Value*> &Ops,
971                                             Type *&AccessTy) const {
972   switch (II->getIntrinsicID()) {
973   case Intrinsic::amdgcn_atomic_inc:
974   case Intrinsic::amdgcn_atomic_dec:
975   case Intrinsic::amdgcn_ds_ordered_add:
976   case Intrinsic::amdgcn_ds_ordered_swap:
977   case Intrinsic::amdgcn_ds_fadd:
978   case Intrinsic::amdgcn_ds_fmin:
979   case Intrinsic::amdgcn_ds_fmax: {
980     Value *Ptr = II->getArgOperand(0);
981     AccessTy = II->getType();
982     Ops.push_back(Ptr);
983     return true;
984   }
985   default:
986     return false;
987   }
988 }
989 
990 bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
991   if (!Subtarget->hasFlatInstOffsets()) {
992     // Flat instructions do not have offsets, and only have the register
993     // address.
994     return AM.BaseOffs == 0 && AM.Scale == 0;
995   }
996 
997   // GFX9 added a 13-bit signed offset. When using regular flat instructions,
998   // the sign bit is ignored and is treated as a 12-bit unsigned offset.
999 
1000   // GFX10 shrank the signed offset to 12 bits. When using regular flat
1001   // instructions, the sign bit is likewise ignored and the offset is treated
1002   // as an 11-bit unsigned offset.
1003 
1004   if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
1005     return isUInt<11>(AM.BaseOffs) && AM.Scale == 0;
1006 
1007   // Just r + i
1008   return isUInt<12>(AM.BaseOffs) && AM.Scale == 0;
1009 }
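// Illustrative examples (not from the original source): without flat offsets
// only a bare register address (BaseOffs == 0, Scale == 0) is legal; with them
// a BaseOffs of 4095 is accepted on GFX9 (12 unsigned bits) but 4096 is not,
// and on GFX10 the largest accepted BaseOffs drops to 2047.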
1010 
1011 bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
1012   if (Subtarget->hasFlatGlobalInsts())
1013     return isInt<13>(AM.BaseOffs) && AM.Scale == 0;
1014 
1015   if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1016       // Assume that we will use FLAT for all global memory accesses
1017       // on VI.
1018       // FIXME: This assumption is currently wrong.  On VI we still use
1019       // MUBUF instructions for the r + i addressing mode.  As currently
1020       // implemented, the MUBUF instructions only work on buffer < 4GB.
1021       // It may be possible to support > 4GB buffers with MUBUF instructions,
1022       // by setting the stride value in the resource descriptor which would
1023       // increase the size limit to (stride * 4GB).  However, this is risky,
1024       // because it has never been validated.
1025     return isLegalFlatAddressingMode(AM);
1026   }
1027 
1028   return isLegalMUBUFAddressingMode(AM);
1029 }
1030 
1031 bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1032   // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1033   // additionally can do r + r + i with addr64. 32-bit has more addressing
1034   // mode options. Depending on the resource constant, it can also do
1035   // (i64 r0) + (i32 r1) * (i14 i).
1036   //
1037   // Private arrays end up using a scratch buffer most of the time, so also
1038   // assume those use MUBUF instructions. Scratch loads / stores are currently
1039   // implemented as mubuf instructions with offen bit set, so slightly
1040   // different than the normal addr64.
1041   if (!isUInt<12>(AM.BaseOffs))
1042     return false;
1043 
1044   // FIXME: Since we can split the immediate into an soffset and an immediate
1045   // offset, would it make sense to allow any immediate?
1046 
1047   switch (AM.Scale) {
1048   case 0: // r + i or just i, depending on HasBaseReg.
1049     return true;
1050   case 1:
1051     return true; // We have r + r or r + i.
1052   case 2:
1053     if (AM.HasBaseReg) {
1054       // Reject 2 * r + r.
1055       return false;
1056     }
1057 
1058     // Allow 2 * r as r + r,
1059     // and 2 * r + i as r + r + i.
1060     return true;
1061   default: // Don't allow n * r
1062     return false;
1063   }
1064 }
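// Illustrative examples (not from the original source): BaseOffs = 4095 with
// Scale 0 or 1 is accepted, BaseOffs = 4096 is rejected (it does not fit the
// 12-bit unsigned offset), and Scale = 2 is only accepted without a base
// register, i.e. 2 * r folds to r + r but 2 * r + r is rejected.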
1065 
1066 bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
1067                                              const AddrMode &AM, Type *Ty,
1068                                              unsigned AS, Instruction *I) const {
1069   // No global is ever allowed as a base.
1070   if (AM.BaseGV)
1071     return false;
1072 
1073   if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1074     return isLegalGlobalAddressingMode(AM);
1075 
1076   if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1077       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
1078       AS == AMDGPUAS::BUFFER_FAT_POINTER) {
1079     // If the offset isn't a multiple of 4, it probably isn't going to be
1080     // correctly aligned.
1081     // FIXME: Can we get the real alignment here?
1082     if (AM.BaseOffs % 4 != 0)
1083       return isLegalMUBUFAddressingMode(AM);
1084 
1085     // There are no SMRD extloads, so if we have to do a small type access we
1086     // will use a MUBUF load.
1087     // FIXME?: We also need to do this if unaligned, but we don't know the
1088     // alignment here.
1089     if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1090       return isLegalGlobalAddressingMode(AM);
1091 
1092     if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
1093       // SMRD instructions have an 8-bit, dword offset on SI.
1094       if (!isUInt<8>(AM.BaseOffs / 4))
1095         return false;
1096     } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1097       // On CI+, this can also be a 32-bit literal constant offset. If it fits
1098       // in 8-bits, it can use a smaller encoding.
1099       if (!isUInt<32>(AM.BaseOffs / 4))
1100         return false;
1101     } else if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
1102       // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1103       if (!isUInt<20>(AM.BaseOffs))
1104         return false;
1105     } else
1106       llvm_unreachable("unhandled generation");
1107 
1108     if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1109       return true;
1110 
1111     if (AM.Scale == 1 && AM.HasBaseReg)
1112       return true;
1113 
1114     return false;
1115 
1116   } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1117     return isLegalMUBUFAddressingMode(AM);
1118   } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1119              AS == AMDGPUAS::REGION_ADDRESS) {
1120     // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1121     // field.
1122     // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1123     // an 8-bit dword offset but we don't know the alignment here.
1124     if (!isUInt<16>(AM.BaseOffs))
1125       return false;
1126 
1127     if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1128       return true;
1129 
1130     if (AM.Scale == 1 && AM.HasBaseReg)
1131       return true;
1132 
1133     return false;
1134   } else if (AS == AMDGPUAS::FLAT_ADDRESS ||
1135              AS == AMDGPUAS::UNKNOWN_ADDRESS_SPACE) {
1136     // For an unknown address space, this usually means that this is for some
1137     // reason being used for pure arithmetic, and not based on some addressing
1138     // computation. We don't have instructions that compute pointers with any
1139     // addressing modes, so treat them as having no offset like flat
1140     // instructions.
1141     return isLegalFlatAddressingMode(AM);
1142   } else {
1143     llvm_unreachable("unhandled address space");
1144   }
1145 }
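// Illustrative examples (not from the original source) for the SMRD path
// above: a dword-aligned constant-address offset of up to 1020 bytes (255
// dwords) is accepted on SI, CI accepts any 32-bit dword offset, and VI+
// accepts byte offsets up to 2^20 - 1; for LDS the immediate must fit in 16
// unsigned bits (up to 65535).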
1146 
1147 bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
1148                                         const SelectionDAG &DAG) const {
1149   if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
1150     return (MemVT.getSizeInBits() <= 4 * 32);
1151   } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1152     unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1153     return (MemVT.getSizeInBits() <= MaxPrivateBits);
1154   } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
1155     return (MemVT.getSizeInBits() <= 2 * 32);
1156   }
1157   return true;
1158 }
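// Illustrative examples (not from the original source): merging global or flat
// stores up to a 128-bit (4-dword) value is allowed, while LDS merges are
// capped at 64 bits, matching what a single ds_write_b64 / ds_write2_b32 can
// cover.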
1159 
1160 bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1161                                                       unsigned AddrSpace,
1162                                                       unsigned Align,
1163                                                       bool *IsFast) const {
1164   if (IsFast)
1165     *IsFast = false;
1166 
1167   // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
1168   // which isn't a simple VT.
1169   // Until MVT is extended to handle this, simply check for the size and
1170   // rely on the condition below: allow accesses if the size is a multiple of 4.
1171   if (VT == MVT::Other || (VT != MVT::Other && VT.getSizeInBits() > 1024 &&
1172                            VT.getStoreSize() > 16)) {
1173     return false;
1174   }
1175 
1176   if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1177       AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1178     // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
1179     // aligned, 8 byte access in a single operation using ds_read2/write2_b32
1180     // with adjacent offsets.
1181     bool AlignedBy4 = (Align % 4 == 0);
1182     if (IsFast)
1183       *IsFast = AlignedBy4;
1184 
1185     return AlignedBy4;
1186   }
1187 
1188   // FIXME: We have to be conservative here and assume that flat operations
1189   // will access scratch.  If we had access to the IR function, then we
1190   // could determine if any private memory was used in the function.
1191   if (!Subtarget->hasUnalignedScratchAccess() &&
1192       (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
1193        AddrSpace == AMDGPUAS::FLAT_ADDRESS)) {
1194     bool AlignedBy4 = Align >= 4;
1195     if (IsFast)
1196       *IsFast = AlignedBy4;
1197 
1198     return AlignedBy4;
1199   }
1200 
1201   if (Subtarget->hasUnalignedBufferAccess()) {
1202     // If we have a uniform constant load, it still requires using a slow
1203     // buffer instruction if unaligned.
1204     if (IsFast) {
1205       *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
1206                  AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ?
1207         (Align % 4 == 0) : true;
1208     }
1209 
1210     return true;
1211   }
1212 
1213   // Values smaller than a dword must be aligned.
1214   if (VT.bitsLT(MVT::i32))
1215     return false;
1216 
1217   // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1218   // byte-address are ignored, thus forcing Dword alignment.
1219   // This applies to private, global, and constant memory.
1220   if (IsFast)
1221     *IsFast = true;
1222 
1223   return VT.bitsGT(MVT::i32) && Align % 4 == 0;
1224 }
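// Illustrative examples (not from the original source): a v2i32 LDS access
// with only 4-byte alignment reports *IsFast = true since it maps to
// ds_read2_b32 / ds_write2_b32, while an access through a flat or private
// pointer on a subtarget without unaligned scratch access is rejected unless
// it is at least 4-byte aligned.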
1225 
1226 EVT SITargetLowering::getOptimalMemOpType(
1227     uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
1228     bool ZeroMemset, bool MemcpyStrSrc,
1229     const AttributeList &FuncAttributes) const {
1230   // FIXME: Should account for address space here.
1231 
1232   // The default fallback uses the private pointer size as a guess for a type to
1233   // use. Make sure we switch these to 64-bit accesses.
1234 
1235   if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global
1236     return MVT::v4i32;
1237 
1238   if (Size >= 8 && DstAlign >= 4)
1239     return MVT::v2i32;
1240 
1241   // Use the default.
1242   return MVT::Other;
1243 }
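// Illustrative examples (not from the original source): a 64-byte memcpy with
// destination alignment of at least 4 is widened to v4i32 (dwordx4 accesses),
// a 12-byte copy with the same alignment gets v2i32, and anything smaller or
// less aligned falls back to the generic choice via MVT::Other.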
1244 
1245 static bool isFlatGlobalAddrSpace(unsigned AS) {
1246   return AS == AMDGPUAS::GLOBAL_ADDRESS ||
1247          AS == AMDGPUAS::FLAT_ADDRESS ||
1248          AS == AMDGPUAS::CONSTANT_ADDRESS ||
1249          AS > AMDGPUAS::MAX_AMDGPU_ADDRESS;
1250 }
1251 
1252 bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
1253                                            unsigned DestAS) const {
1254   return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS);
1255 }
1256 
1257 bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
1258   const MemSDNode *MemNode = cast<MemSDNode>(N);
1259   const Value *Ptr = MemNode->getMemOperand()->getValue();
1260   const Instruction *I = dyn_cast_or_null<Instruction>(Ptr);
1261   return I && I->getMetadata("amdgpu.noclobber");
1262 }
1263 
1264 bool SITargetLowering::isCheapAddrSpaceCast(unsigned SrcAS,
1265                                             unsigned DestAS) const {
1266   // Flat -> private/local is a simple truncate.
1267   // Flat -> global is no-op
1268   if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
1269     return true;
1270 
1271   return isNoopAddrSpaceCast(SrcAS, DestAS);
1272 }
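// Illustrative examples (not from the original source), using the usual amdgcn
// address-space numbering: a cast from flat (addrspace 0) to global
// (addrspace 1) or back is a no-op, and a cast from flat to local (3) or
// private (5) is cheap since it is just a pointer truncation.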
1273 
1274 bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
1275   const MemSDNode *MemNode = cast<MemSDNode>(N);
1276 
1277   return AMDGPUInstrInfo::isUniformMMO(MemNode->getMemOperand());
1278 }
1279 
1280 TargetLoweringBase::LegalizeTypeAction
1281 SITargetLowering::getPreferredVectorAction(MVT VT) const {
1282   if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16))
1283     return TypeSplitVector;
1284 
1285   return TargetLoweringBase::getPreferredVectorAction(VT);
1286 }
1287 
1288 bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
1289                                                          Type *Ty) const {
1290   // FIXME: Could be smarter if called for vector constants.
1291   return true;
1292 }
1293 
1294 bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
1295   if (Subtarget->has16BitInsts() && VT == MVT::i16) {
1296     switch (Op) {
1297     case ISD::LOAD:
1298     case ISD::STORE:
1299 
1300     // These operations are done with 32-bit instructions anyway.
1301     case ISD::AND:
1302     case ISD::OR:
1303     case ISD::XOR:
1304     case ISD::SELECT:
1305       // TODO: Extensions?
1306       return true;
1307     default:
1308       return false;
1309     }
1310   }
1311 
1312   // SimplifySetCC uses this function to determine whether or not it should
1313   // create setcc with i1 operands.  We don't have instructions for i1 setcc.
1314   if (VT == MVT::i1 && Op == ISD::SETCC)
1315     return false;
1316 
1317   return TargetLowering::isTypeDesirableForOp(Op, VT);
1318 }
1319 
1320 SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
1321                                                    const SDLoc &SL,
1322                                                    SDValue Chain,
1323                                                    uint64_t Offset) const {
1324   const DataLayout &DL = DAG.getDataLayout();
1325   MachineFunction &MF = DAG.getMachineFunction();
1326   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1327 
1328   const ArgDescriptor *InputPtrReg;
1329   const TargetRegisterClass *RC;
1330 
1331   std::tie(InputPtrReg, RC)
1332     = Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
1333 
1334   MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
1335   MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
1336   SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
1337     MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
1338 
1339   return DAG.getObjectPtrOffset(SL, BasePtr, Offset);
1340 }
1341 
1342 SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
1343                                             const SDLoc &SL) const {
1344   uint64_t Offset = getImplicitParameterOffset(DAG.getMachineFunction(),
1345                                                FIRST_IMPLICIT);
1346   return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
1347 }
1348 
1349 SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
1350                                          const SDLoc &SL, SDValue Val,
1351                                          bool Signed,
1352                                          const ISD::InputArg *Arg) const {
1353   // First, if it is a widened vector, narrow it.
1354   if (VT.isVector() &&
1355       VT.getVectorNumElements() != MemVT.getVectorNumElements()) {
1356     EVT NarrowedVT =
1357         EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(),
1358                          VT.getVectorNumElements());
1359     Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
1360                       DAG.getConstant(0, SL, MVT::i32));
1361   }
1362 
1363   // Then convert the vector elements or scalar value.
1364   if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
1365       VT.bitsLT(MemVT)) {
1366     unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
1367     Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
1368   }
1369 
1370   if (MemVT.isFloatingPoint())
1371     Val = getFPExtOrFPTrunc(DAG, Val, SL, VT);
1372   else if (Signed)
1373     Val = DAG.getSExtOrTrunc(Val, SL, VT);
1374   else
1375     Val = DAG.getZExtOrTrunc(Val, SL, VT);
1376 
1377   return Val;
1378 }
1379 
1380 SDValue SITargetLowering::lowerKernargMemParameter(
1381   SelectionDAG &DAG, EVT VT, EVT MemVT,
1382   const SDLoc &SL, SDValue Chain,
1383   uint64_t Offset, unsigned Align, bool Signed,
1384   const ISD::InputArg *Arg) const {
1385   Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
1386   PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
1387   MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
1388 
1389   // Try to avoid using an extload by loading earlier than the argument address,
1390   // and extracting the relevant bits. The load should hopefully be merged with
1391   // the previous argument.
1392   if (MemVT.getStoreSize() < 4 && Align < 4) {
1393     // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
1394     int64_t AlignDownOffset = alignDown(Offset, 4);
1395     int64_t OffsetDiff = Offset - AlignDownOffset;
1396 
1397     EVT IntVT = MemVT.changeTypeToInteger();
1398 
1399     // TODO: If we passed in the base kernel offset we could have a better
1400     // alignment than 4, but we don't really need it.
1401     SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
1402     SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, 4,
1403                                MachineMemOperand::MODereferenceable |
1404                                MachineMemOperand::MOInvariant);
1405 
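    // Shift the dword right so the argument's bytes land in the low bits, then
    // truncate to the argument's integer width and bitcast to its memory type.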
1406     SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
1407     SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
1408 
1409     SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
1410     ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
1411     ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
1412 
1413 
1414     return DAG.getMergeValues({ ArgVal, Load.getValue(1) }, SL);
1415   }
1416 
1417   SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
1418   SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align,
1419                              MachineMemOperand::MODereferenceable |
1420                              MachineMemOperand::MOInvariant);
1421 
1422   SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
1423   return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
1424 }
1425 
1426 SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
1427                                               const SDLoc &SL, SDValue Chain,
1428                                               const ISD::InputArg &Arg) const {
1429   MachineFunction &MF = DAG.getMachineFunction();
1430   MachineFrameInfo &MFI = MF.getFrameInfo();
1431 
1432   if (Arg.Flags.isByVal()) {
1433     unsigned Size = Arg.Flags.getByValSize();
1434     int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
1435     return DAG.getFrameIndex(FrameIdx, MVT::i32);
1436   }
1437 
1438   unsigned ArgOffset = VA.getLocMemOffset();
1439   unsigned ArgSize = VA.getValVT().getStoreSize();
1440 
1441   int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
1442 
1443   // Create load nodes to retrieve arguments from the stack.
1444   SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
1445   SDValue ArgValue;
1446 
1447   // For NON_EXTLOAD, the generic code in getLoad asserts that ValVT == MemVT.
1448   ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
1449   MVT MemVT = VA.getValVT();
1450 
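  // Choose the in-memory type and extension kind based on how the calling
  // convention promoted this argument.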
1451   switch (VA.getLocInfo()) {
1452   default:
1453     break;
1454   case CCValAssign::BCvt:
1455     MemVT = VA.getLocVT();
1456     break;
1457   case CCValAssign::SExt:
1458     ExtType = ISD::SEXTLOAD;
1459     break;
1460   case CCValAssign::ZExt:
1461     ExtType = ISD::ZEXTLOAD;
1462     break;
1463   case CCValAssign::AExt:
1464     ExtType = ISD::EXTLOAD;
1465     break;
1466   }
1467 
1468   ArgValue = DAG.getExtLoad(
1469     ExtType, SL, VA.getLocVT(), Chain, FIN,
1470     MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
1471     MemVT);
1472   return ArgValue;
1473 }
1474 
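// Look up a preloaded special input and return it as a live-in register copy
// of the requested type.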
1475 SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
1476   const SIMachineFunctionInfo &MFI,
1477   EVT VT,
1478   AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
1479   const ArgDescriptor *Reg;
1480   const TargetRegisterClass *RC;
1481 
1482   std::tie(Reg, RC) = MFI.getPreloadedValue(PVID);
1483   return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT);
1484 }
1485 
1486 static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
1487                                    CallingConv::ID CallConv,
1488                                    ArrayRef<ISD::InputArg> Ins,
1489                                    BitVector &Skipped,
1490                                    FunctionType *FType,
1491                                    SIMachineFunctionInfo *Info) {
1492   for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
1493     const ISD::InputArg *Arg = &Ins[I];
1494 
1495     assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
1496            "vector type argument should have been split");
1497 
1498     // First check if it's a PS input addr.
1499     if (CallConv == CallingConv::AMDGPU_PS &&
1500         !Arg->Flags.isInReg() && !Arg->Flags.isByVal() && PSInputNum <= 15) {
1501 
1502       bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
1503 
1504       // Inconveniently only the first part of the split is marked as isSplit,
1505       // so skip to the end. We only want to increment PSInputNum once for the
1506       // entire split argument.
1507       if (Arg->Flags.isSplit()) {
1508         while (!Arg->Flags.isSplitEnd()) {
1509           assert(!Arg->VT.isVector() &&
1510                  "unexpected vector split in ps argument type");
1511           if (!SkipArg)
1512             Splits.push_back(*Arg);
1513           Arg = &Ins[++I];
1514         }
1515       }
1516 
1517       if (SkipArg) {
1518         // We can safely skip PS inputs.
1519         Skipped.set(Arg->getOrigArgIndex());
1520         ++PSInputNum;
1521         continue;
1522       }
1523 
1524       Info->markPSInputAllocated(PSInputNum);
1525       if (Arg->Used)
1526         Info->markPSInputEnabled(PSInputNum);
1527 
1528       ++PSInputNum;
1529     }
1530 
1531     Splits.push_back(*Arg);
1532   }
1533 }
1534 
1535 // Allocate special inputs passed in VGPRs.
1536 static void allocateSpecialEntryInputVGPRs(CCState &CCInfo,
1537                                            MachineFunction &MF,
1538                                            const SIRegisterInfo &TRI,
1539                                            SIMachineFunctionInfo &Info) {
1540   if (Info.hasWorkItemIDX()) {
1541     unsigned Reg = AMDGPU::VGPR0;
1542     MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1543 
1544     CCInfo.AllocateReg(Reg);
1545     Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg));
1546   }
1547 
1548   if (Info.hasWorkItemIDY()) {
1549     unsigned Reg = AMDGPU::VGPR1;
1550     MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1551 
1552     CCInfo.AllocateReg(Reg);
1553     Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
1554   }
1555 
1556   if (Info.hasWorkItemIDZ()) {
1557     unsigned Reg = AMDGPU::VGPR2;
1558     MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1559 
1560     CCInfo.AllocateReg(Reg);
1561     Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
1562   }
1563 }
1564 
1565 // Try to allocate a VGPR at the end of the argument list, or if no argument
1566 // VGPRs are left, allocate a stack slot.
1567 static ArgDescriptor allocateVGPR32Input(CCState &CCInfo) {
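  // Only the first 32 VGPRs are considered for argument passing.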
1568   ArrayRef<MCPhysReg> ArgVGPRs
1569     = makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
1570   unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
1571   if (RegIdx == ArgVGPRs.size()) {
1572     // Spill to stack required.
1573     int64_t Offset = CCInfo.AllocateStack(4, 4);
1574 
1575     return ArgDescriptor::createStack(Offset);
1576   }
1577 
1578   unsigned Reg = ArgVGPRs[RegIdx];
1579   Reg = CCInfo.AllocateReg(Reg);
1580   assert(Reg != AMDGPU::NoRegister);
1581 
1582   MachineFunction &MF = CCInfo.getMachineFunction();
1583   MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1584   return ArgDescriptor::createRegister(Reg);
1585 }
1586 
1587 static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
1588                                              const TargetRegisterClass *RC,
1589                                              unsigned NumArgRegs) {
1590   ArrayRef<MCPhysReg> ArgSGPRs = makeArrayRef(RC->begin(), NumArgRegs);
1591   unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
1592   if (RegIdx == ArgSGPRs.size())
1593     report_fatal_error("ran out of SGPRs for arguments");
1594 
1595   unsigned Reg = ArgSGPRs[RegIdx];
1596   Reg = CCInfo.AllocateReg(Reg);
1597   assert(Reg != AMDGPU::NoRegister);
1598 
1599   MachineFunction &MF = CCInfo.getMachineFunction();
1600   MF.addLiveIn(Reg, RC);
1601   return ArgDescriptor::createRegister(Reg);
1602 }
1603 
1604 static ArgDescriptor allocateSGPR32Input(CCState &CCInfo) {
1605   return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
1606 }
1607 
1608 static ArgDescriptor allocateSGPR64Input(CCState &CCInfo) {
1609   return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
1610 }
1611 
1612 static void allocateSpecialInputVGPRs(CCState &CCInfo,
1613                                       MachineFunction &MF,
1614                                       const SIRegisterInfo &TRI,
1615                                       SIMachineFunctionInfo &Info) {
1616   if (Info.hasWorkItemIDX())
1617     Info.setWorkItemIDX(allocateVGPR32Input(CCInfo));
1618 
1619   if (Info.hasWorkItemIDY())
1620     Info.setWorkItemIDY(allocateVGPR32Input(CCInfo));
1621 
1622   if (Info.hasWorkItemIDZ())
1623     Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo));
1624 }
1625 
1626 static void allocateSpecialInputSGPRs(CCState &CCInfo,
1627                                       MachineFunction &MF,
1628                                       const SIRegisterInfo &TRI,
1629                                       SIMachineFunctionInfo &Info) {
1630   auto &ArgInfo = Info.getArgInfo();
1631 
1632   // TODO: Unify handling with private memory pointers.
1633 
1634   if (Info.hasDispatchPtr())
1635     ArgInfo.DispatchPtr = allocateSGPR64Input(CCInfo);
1636 
1637   if (Info.hasQueuePtr())
1638     ArgInfo.QueuePtr = allocateSGPR64Input(CCInfo);
1639 
1640   if (Info.hasKernargSegmentPtr())
1641     ArgInfo.KernargSegmentPtr = allocateSGPR64Input(CCInfo);
1642 
1643   if (Info.hasDispatchID())
1644     ArgInfo.DispatchID = allocateSGPR64Input(CCInfo);
1645 
1646   // flat_scratch_init is not applicable for non-kernel functions.
1647 
1648   if (Info.hasWorkGroupIDX())
1649     ArgInfo.WorkGroupIDX = allocateSGPR32Input(CCInfo);
1650 
1651   if (Info.hasWorkGroupIDY())
1652     ArgInfo.WorkGroupIDY = allocateSGPR32Input(CCInfo);
1653 
1654   if (Info.hasWorkGroupIDZ())
1655     ArgInfo.WorkGroupIDZ = allocateSGPR32Input(CCInfo);
1656 
1657   if (Info.hasImplicitArgPtr())
1658     ArgInfo.ImplicitArgPtr = allocateSGPR64Input(CCInfo);
1659 }
1660 
1661 // Allocate special inputs passed in user SGPRs.
1662 static void allocateHSAUserSGPRs(CCState &CCInfo,
1663                                  MachineFunction &MF,
1664                                  const SIRegisterInfo &TRI,
1665                                  SIMachineFunctionInfo &Info) {
1666   if (Info.hasImplicitBufferPtr()) {
1667     unsigned ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
1668     MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
1669     CCInfo.AllocateReg(ImplicitBufferPtrReg);
1670   }
1671 
1672   // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
1673   if (Info.hasPrivateSegmentBuffer()) {
1674     unsigned PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
1675     MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
1676     CCInfo.AllocateReg(PrivateSegmentBufferReg);
1677   }
1678 
1679   if (Info.hasDispatchPtr()) {
1680     unsigned DispatchPtrReg = Info.addDispatchPtr(TRI);
1681     MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
1682     CCInfo.AllocateReg(DispatchPtrReg);
1683   }
1684 
1685   if (Info.hasQueuePtr()) {
1686     unsigned QueuePtrReg = Info.addQueuePtr(TRI);
1687     MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
1688     CCInfo.AllocateReg(QueuePtrReg);
1689   }
1690 
1691   if (Info.hasKernargSegmentPtr()) {
1692     unsigned InputPtrReg = Info.addKernargSegmentPtr(TRI);
1693     MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
1694     CCInfo.AllocateReg(InputPtrReg);
1695   }
1696 
1697   if (Info.hasDispatchID()) {
1698     unsigned DispatchIDReg = Info.addDispatchID(TRI);
1699     MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
1700     CCInfo.AllocateReg(DispatchIDReg);
1701   }
1702 
1703   if (Info.hasFlatScratchInit()) {
1704     unsigned FlatScratchInitReg = Info.addFlatScratchInit(TRI);
1705     MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
1706     CCInfo.AllocateReg(FlatScratchInitReg);
1707   }
1708 
1709   // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
1710   // these from the dispatch pointer.
1711 }
1712 
1713 // Allocate special input registers that are initialized per-wave.
1714 static void allocateSystemSGPRs(CCState &CCInfo,
1715                                 MachineFunction &MF,
1716                                 SIMachineFunctionInfo &Info,
1717                                 CallingConv::ID CallConv,
1718                                 bool IsShader) {
1719   if (Info.hasWorkGroupIDX()) {
1720     unsigned Reg = Info.addWorkGroupIDX();
1721     MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1722     CCInfo.AllocateReg(Reg);
1723   }
1724 
1725   if (Info.hasWorkGroupIDY()) {
1726     unsigned Reg = Info.addWorkGroupIDY();
1727     MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1728     CCInfo.AllocateReg(Reg);
1729   }
1730 
1731   if (Info.hasWorkGroupIDZ()) {
1732     unsigned Reg = Info.addWorkGroupIDZ();
1733     MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1734     CCInfo.AllocateReg(Reg);
1735   }
1736 
1737   if (Info.hasWorkGroupInfo()) {
1738     unsigned Reg = Info.addWorkGroupInfo();
1739     MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
1740     CCInfo.AllocateReg(Reg);
1741   }
1742 
1743   if (Info.hasPrivateSegmentWaveByteOffset()) {
1744     // Scratch wave offset passed in system SGPR.
1745     unsigned PrivateSegmentWaveByteOffsetReg;
1746 
1747     if (IsShader) {
1748       PrivateSegmentWaveByteOffsetReg =
1749         Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
1750 
1751       // This is true if the scratch wave byte offset doesn't have a fixed
1752       // location.
1753       if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
1754         PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
1755         Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
1756       }
1757     } else
1758       PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
1759 
1760     MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
1761     CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
1762   }
1763 }
1764 
1765 static void reservePrivateMemoryRegs(const TargetMachine &TM,
1766                                      MachineFunction &MF,
1767                                      const SIRegisterInfo &TRI,
1768                                      SIMachineFunctionInfo &Info) {
1769   // Now that we've figured out where the scratch register inputs are, see if we
1770   // should reserve the arguments and use them directly.
1771   MachineFrameInfo &MFI = MF.getFrameInfo();
1772   bool HasStackObjects = MFI.hasStackObjects();
1773 
1774   // Record that we know we have non-spill stack objects so we don't need to
1775   // check all stack objects later.
1776   if (HasStackObjects)
1777     Info.setHasNonSpillStackObjects(true);
1778 
1779   // Everything live out of a block is spilled with fast regalloc, so it's
1780   // almost certain that spilling will be required.
1781   if (TM.getOptLevel() == CodeGenOpt::None)
1782     HasStackObjects = true;
1783 
1784   // For now assume stack access is needed in any callee function, so we need
1785   // the scratch registers to pass in.
1786   bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
1787 
1788   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1789   if (ST.isAmdHsaOrMesa(MF.getFunction())) {
1790     if (RequiresStackAccess) {
1791       // If we have stack objects, we unquestionably need the private buffer
1792       // resource. For the Code Object V2 ABI, this will be the first 4 user
1793       // SGPR inputs. We can reserve those and use them directly.
1794 
1795       unsigned PrivateSegmentBufferReg = Info.getPreloadedReg(
1796         AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
1797       Info.setScratchRSrcReg(PrivateSegmentBufferReg);
1798 
1799       if (MFI.hasCalls()) {
1800         // If we have calls, we need to keep the frame register in a register
1801         // that won't be clobbered by a call, so ensure it is copied somewhere.
1802 
1803         // This is not a problem for the scratch wave offset, because the same
1804         // registers are reserved in all functions.
1805 
1806         // FIXME: Nothing is really ensuring this is a call preserved register,
1807         // it's just selected from the end so it happens to be.
1808         unsigned ReservedOffsetReg
1809           = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
1810         Info.setScratchWaveOffsetReg(ReservedOffsetReg);
1811       } else {
1812         unsigned PrivateSegmentWaveByteOffsetReg = Info.getPreloadedReg(
1813           AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
1814         Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
1815       }
1816     } else {
1817       unsigned ReservedBufferReg
1818         = TRI.reservedPrivateSegmentBufferReg(MF);
1819       unsigned ReservedOffsetReg
1820         = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
1821 
1822       // We tentatively reserve the last registers (skipping the last two
1823       // which may contain VCC). After register allocation, we'll replace
1824       // these with the ones immediately after those which were really
1825       // allocated. In the prologue copies will be inserted from the argument
1826       // to these reserved registers.
1827       Info.setScratchRSrcReg(ReservedBufferReg);
1828       Info.setScratchWaveOffsetReg(ReservedOffsetReg);
1829     }
1830   } else {
1831     unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
1832 
1833     // Without HSA, relocations are used for the scratch pointer and the
1834     // buffer resource setup is always inserted in the prologue. Scratch wave
1835     // offset is still in an input SGPR.
1836     Info.setScratchRSrcReg(ReservedBufferReg);
1837 
1838     if (HasStackObjects && !MFI.hasCalls()) {
1839       unsigned ScratchWaveOffsetReg = Info.getPreloadedReg(
1840         AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
1841       Info.setScratchWaveOffsetReg(ScratchWaveOffsetReg);
1842     } else {
1843       unsigned ReservedOffsetReg
1844         = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
1845       Info.setScratchWaveOffsetReg(ReservedOffsetReg);
1846     }
1847   }
1848 }
1849 
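// Split CSR handling only applies to callable functions; entry points have no
// callers whose registers need to be preserved.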
1850 bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const {
1851   const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
1852   return !Info->isEntryFunction();
1853 }
1854 
1855 void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
1856 
1857 }
1858 
1859 void SITargetLowering::insertCopiesSplitCSR(
1860   MachineBasicBlock *Entry,
1861   const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
1862   const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
1863 
1864   const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
1865   if (!IStart)
1866     return;
1867 
1868   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1869   MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
1870   MachineBasicBlock::iterator MBBI = Entry->begin();
1871   for (const MCPhysReg *I = IStart; *I; ++I) {
1872     const TargetRegisterClass *RC = nullptr;
1873     if (AMDGPU::SReg_64RegClass.contains(*I))
1874       RC = &AMDGPU::SGPR_64RegClass;
1875     else if (AMDGPU::SReg_32RegClass.contains(*I))
1876       RC = &AMDGPU::SGPR_32RegClass;
1877     else
1878       llvm_unreachable("Unexpected register class in CSRsViaCopy!");
1879 
1880     unsigned NewVR = MRI->createVirtualRegister(RC);
1881     // Create copy from CSR to a virtual register.
1882     Entry->addLiveIn(*I);
1883     BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
1884       .addReg(*I);
1885 
1886     // Insert the copy-back instructions right before the terminator.
1887     for (auto *Exit : Exits)
1888       BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
1889               TII->get(TargetOpcode::COPY), *I)
1890         .addReg(NewVR);
1891   }
1892 }
1893 
1894 SDValue SITargetLowering::LowerFormalArguments(
1895     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
1896     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
1897     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
1898   const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
1899 
1900   MachineFunction &MF = DAG.getMachineFunction();
1901   const Function &Fn = MF.getFunction();
1902   FunctionType *FType = MF.getFunction().getFunctionType();
1903   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1904 
1905   if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
1906     DiagnosticInfoUnsupported NoGraphicsHSA(
1907         Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
1908     DAG.getContext()->diagnose(NoGraphicsHSA);
1909     return DAG.getEntryNode();
1910   }
1911 
1912   SmallVector<ISD::InputArg, 16> Splits;
1913   SmallVector<CCValAssign, 16> ArgLocs;
1914   BitVector Skipped(Ins.size());
1915   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
1916                  *DAG.getContext());
1917 
1918   bool IsShader = AMDGPU::isShader(CallConv);
1919   bool IsKernel = AMDGPU::isKernel(CallConv);
1920   bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
1921 
1922   if (!IsEntryFunc) {
1923     // 4 bytes are reserved at offset 0 for the emergency stack slot. Skip over
1924     // this when allocating argument fixed offsets.
1925     CCInfo.AllocateStack(4, 4);
1926   }
1927 
1928   if (IsShader) {
1929     processShaderInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
1930 
1931     // At least one interpolation mode must be enabled or else the GPU will
1932     // hang.
1933     //
1934     // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
1935     // set PSInputAddr, the user wants to enable some bits after the compilation
1936     // based on run-time states. Since we can't know what the final PSInputEna
1937     // will look like, we shouldn't do anything here; the user should take
1938     // responsibility for the correct programming.
1939     //
1940     // Otherwise, the following restrictions apply:
1941     // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
1942     // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
1943     //   enabled too.
1944     if (CallConv == CallingConv::AMDGPU_PS) {
1945       if ((Info->getPSInputAddr() & 0x7F) == 0 ||
1946            ((Info->getPSInputAddr() & 0xF) == 0 &&
1947             Info->isPSInputAllocated(11))) {
1948         CCInfo.AllocateReg(AMDGPU::VGPR0);
1949         CCInfo.AllocateReg(AMDGPU::VGPR1);
1950         Info->markPSInputAllocated(0);
1951         Info->markPSInputEnabled(0);
1952       }
1953       if (Subtarget->isAmdPalOS()) {
1954         // For isAmdPalOS, the user does not enable some bits after compilation
1955         // based on run-time states; the register values being generated here are
1956         // the final ones set in hardware. Therefore we need to apply the
1957         // workaround to PSInputAddr and PSInputEnable together.  (The case where
1958         // a bit is set in PSInputAddr but not PSInputEnable is where the
1959         // frontend set up an input arg for a particular interpolation mode, but
1960         // nothing uses that input arg. Really we should have an earlier pass
1961         // that removes such an arg.)
1962         unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
1963         if ((PsInputBits & 0x7F) == 0 ||
1964             ((PsInputBits & 0xF) == 0 &&
1965              (PsInputBits >> 11 & 1)))
1966           Info->markPSInputEnabled(
1967               countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
1968       }
1969     }
1970 
1971     assert(!Info->hasDispatchPtr() &&
1972            !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&
1973            !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
1974            !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
1975            !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
1976            !Info->hasWorkItemIDZ());
1977   } else if (IsKernel) {
1978     assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
1979   } else {
1980     Splits.append(Ins.begin(), Ins.end());
1981   }
1982 
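  // Allocate the preloaded special inputs (workitem ID VGPRs and HSA user
  // SGPRs) before assigning locations to the user arguments.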
1983   if (IsEntryFunc) {
1984     allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
1985     allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
1986   }
1987 
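  // Kernel arguments are laid out in the kernarg segment; other calling
  // conventions go through the normal CC assignment.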
1988   if (IsKernel) {
1989     analyzeFormalArgumentsCompute(CCInfo, Ins);
1990   } else {
1991     CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
1992     CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
1993   }
1994 
1995   SmallVector<SDValue, 16> Chains;
1996 
1997   // FIXME: This is the minimum kernel argument alignment. We should improve
1998   // this to the maximum alignment of the arguments.
1999   //
2000   // FIXME: Alignment of explicit arguments is totally broken with a non-0 explicit
2001   // kern arg offset.
2002   const unsigned KernelArgBaseAlign = 16;
2003 
2004   for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
2005     const ISD::InputArg &Arg = Ins[i];
2006     if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
2007       InVals.push_back(DAG.getUNDEF(Arg.VT));
2008       continue;
2009     }
2010 
2011     CCValAssign &VA = ArgLocs[ArgIdx++];
2012     MVT VT = VA.getLocVT();
2013 
2014     if (IsEntryFunc && VA.isMemLoc()) {
2015       VT = Ins[i].VT;
2016       EVT MemVT = VA.getLocVT();
2017 
2018       const uint64_t Offset = VA.getLocMemOffset();
2019       unsigned Align = MinAlign(KernelArgBaseAlign, Offset);
2020 
2021       SDValue Arg = lowerKernargMemParameter(
2022         DAG, VT, MemVT, DL, Chain, Offset, Align, Ins[i].Flags.isSExt(), &Ins[i]);
2023       Chains.push_back(Arg.getValue(1));
2024 
2025       auto *ParamTy =
2026         dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
2027       if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
2028           ParamTy && (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
2029                       ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
2030         // On SI local pointers are just offsets into LDS, so they are always
2031         // less than 16 bits.  On CI and newer they could potentially be
2032         // real pointers, so we can't guarantee their size.
2033         Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
2034                           DAG.getValueType(MVT::i16));
2035       }
2036 
2037       InVals.push_back(Arg);
2038       continue;
2039     } else if (!IsEntryFunc && VA.isMemLoc()) {
2040       SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
2041       InVals.push_back(Val);
2042       if (!Arg.Flags.isByVal())
2043         Chains.push_back(Val.getValue(1));
2044       continue;
2045     }
2046 
2047     assert(VA.isRegLoc() && "Parameter must be in a register!");
2048 
2049     unsigned Reg = VA.getLocReg();
2050     const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
2051     EVT ValVT = VA.getValVT();
2052 
2053     Reg = MF.addLiveIn(Reg, RC);
2054     SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
2055 
2056     if (Arg.Flags.isSRet()) {
2057       // The return object should be reasonably addressable.
2058 
2059       // FIXME: This helps when the return is a real sret. If it is an
2060       // automatically inserted sret (i.e. CanLowerReturn returns false), an
2061       // extra copy is inserted in SelectionDAGBuilder which obscures this.
2062       unsigned NumBits
2063         = 32 - getSubtarget()->getKnownHighZeroBitsForFrameIndex();
2064       Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
2065         DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
2066     }
2067 
2068     // If this is an 8 or 16-bit value, it is really passed promoted
2069     // to 32 bits. Insert an assert[sz]ext to capture this, then
2070     // truncate to the right size.
2071     switch (VA.getLocInfo()) {
2072     case CCValAssign::Full:
2073       break;
2074     case CCValAssign::BCvt:
2075       Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
2076       break;
2077     case CCValAssign::SExt:
2078       Val = DAG.getNode(ISD::AssertSext, DL, VT, Val,
2079                         DAG.getValueType(ValVT));
2080       Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
2081       break;
2082     case CCValAssign::ZExt:
2083       Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
2084                         DAG.getValueType(ValVT));
2085       Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
2086       break;
2087     case CCValAssign::AExt:
2088       Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
2089       break;
2090     default:
2091       llvm_unreachable("Unknown loc info!");
2092     }
2093 
2094     InVals.push_back(Val);
2095   }
2096 
2097   if (!IsEntryFunc) {
2098     // Special inputs come after user arguments.
2099     allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
2100   }
2101 
2102   // Start adding system SGPRs.
2103   if (IsEntryFunc) {
2104     allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader);
2105   } else {
2106     CCInfo.AllocateReg(Info->getScratchRSrcReg());
2107     CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
2108     CCInfo.AllocateReg(Info->getFrameOffsetReg());
2109     allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
2110   }
2111 
2112   auto &ArgUsageInfo =
2113     DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
2114   ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
2115 
2116   unsigned StackArgSize = CCInfo.getNextStackOffset();
2117   Info->setBytesInStackArgArea(StackArgSize);
2118 
2119   return Chains.empty() ? Chain :
2120     DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
2121 }
2122 
2123 // TODO: If return values can't fit in registers, we should return as many as
2124 // possible in registers before passing the rest on the stack.
2125 bool SITargetLowering::CanLowerReturn(
2126   CallingConv::ID CallConv,
2127   MachineFunction &MF, bool IsVarArg,
2128   const SmallVectorImpl<ISD::OutputArg> &Outs,
2129   LLVMContext &Context) const {
2130   // Replacing returns with sret/stack usage doesn't make sense for shaders.
2131   // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
2132   // for shaders. Vector types should be explicitly handled by CC.
2133   if (AMDGPU::isEntryFunctionCC(CallConv))
2134     return true;
2135 
2136   SmallVector<CCValAssign, 16> RVLocs;
2137   CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
2138   return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg));
2139 }
2140 
2141 SDValue
2142 SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2143                               bool isVarArg,
2144                               const SmallVectorImpl<ISD::OutputArg> &Outs,
2145                               const SmallVectorImpl<SDValue> &OutVals,
2146                               const SDLoc &DL, SelectionDAG &DAG) const {
2147   MachineFunction &MF = DAG.getMachineFunction();
2148   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2149 
2150   if (AMDGPU::isKernel(CallConv)) {
2151     return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
2152                                              OutVals, DL, DAG);
2153   }
2154 
2155   bool IsShader = AMDGPU::isShader(CallConv);
2156 
2157   Info->setIfReturnsVoid(Outs.empty());
2158   bool IsWaveEnd = Info->returnsVoid() && IsShader;
2159 
2160   // CCValAssign - represents the assignment of the return value to a location.
2161   SmallVector<CCValAssign, 48> RVLocs;
2162   SmallVector<ISD::OutputArg, 48> Splits;
2163 
2164   // CCState - Info about the registers and stack slots.
2165   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2166                  *DAG.getContext());
2167 
2168   // Analyze outgoing return values.
2169   CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
2170 
2171   SDValue Flag;
2172   SmallVector<SDValue, 48> RetOps;
2173   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2174 
2175   // Add return address for callable functions.
2176   if (!Info->isEntryFunction()) {
2177     const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2178     SDValue ReturnAddrReg = CreateLiveInRegister(
2179       DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
2180 
2181     // FIXME: Should be able to use a vreg here, but need a way to prevent it
2182     // from being allocated to a CSR.
2183 
2184     SDValue PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
2185                                                 MVT::i64);
2186 
2187     Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, Flag);
2188     Flag = Chain.getValue(1);
2189 
2190     RetOps.push_back(PhysReturnAddrReg);
2191   }
2192 
2193   // Copy the result values into the output registers.
2194   for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
2195        ++I, ++RealRVLocIdx) {
2196     CCValAssign &VA = RVLocs[I];
2197     assert(VA.isRegLoc() && "Can only return in registers!");
2198     // TODO: Partially return in registers if return values don't fit.
2199     SDValue Arg = OutVals[RealRVLocIdx];
2200 
2201     // Copied from other backends.
2202     switch (VA.getLocInfo()) {
2203     case CCValAssign::Full:
2204       break;
2205     case CCValAssign::BCvt:
2206       Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
2207       break;
2208     case CCValAssign::SExt:
2209       Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
2210       break;
2211     case CCValAssign::ZExt:
2212       Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
2213       break;
2214     case CCValAssign::AExt:
2215       Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
2216       break;
2217     default:
2218       llvm_unreachable("Unknown loc info!");
2219     }
2220 
2221     Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
2222     Flag = Chain.getValue(1);
2223     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2224   }
2225 
2226   // FIXME: Does sret work properly?
2227   if (!Info->isEntryFunction()) {
2228     const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
2229     const MCPhysReg *I =
2230       TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2231     if (I) {
2232       for (; *I; ++I) {
2233         if (AMDGPU::SReg_64RegClass.contains(*I))
2234           RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2235         else if (AMDGPU::SReg_32RegClass.contains(*I))
2236           RetOps.push_back(DAG.getRegister(*I, MVT::i32));
2237         else
2238           llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2239       }
2240     }
2241   }
2242 
2243   // Update chain and glue.
2244   RetOps[0] = Chain;
2245   if (Flag.getNode())
2246     RetOps.push_back(Flag);
2247 
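  // Pick the return opcode: end the wave for void shader returns, jump to the
  // epilog for shaders that return values, and use a normal return otherwise.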
2248   unsigned Opc = AMDGPUISD::ENDPGM;
2249   if (!IsWaveEnd)
2250     Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_FLAG;
2251   return DAG.getNode(Opc, DL, MVT::Other, RetOps);
2252 }
2253 
2254 SDValue SITargetLowering::LowerCallResult(
2255     SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg,
2256     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2257     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
2258     SDValue ThisVal) const {
2259   CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
2260 
2261   // Assign locations to each value returned by this call.
2262   SmallVector<CCValAssign, 16> RVLocs;
2263   CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
2264                  *DAG.getContext());
2265   CCInfo.AnalyzeCallResult(Ins, RetCC);
2266 
2267   // Copy all of the result registers out of their specified physreg.
2268   for (unsigned i = 0; i != RVLocs.size(); ++i) {
2269     CCValAssign VA = RVLocs[i];
2270     SDValue Val;
2271 
2272     if (VA.isRegLoc()) {
2273       Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
2274       Chain = Val.getValue(1);
2275       InFlag = Val.getValue(2);
2276     } else if (VA.isMemLoc()) {
2277       report_fatal_error("TODO: return values in memory");
2278     } else
2279       llvm_unreachable("unknown argument location type");
2280 
2281     switch (VA.getLocInfo()) {
2282     case CCValAssign::Full:
2283       break;
2284     case CCValAssign::BCvt:
2285       Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
2286       break;
2287     case CCValAssign::ZExt:
2288       Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
2289                         DAG.getValueType(VA.getValVT()));
2290       Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2291       break;
2292     case CCValAssign::SExt:
2293       Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
2294                         DAG.getValueType(VA.getValVT()));
2295       Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2296       break;
2297     case CCValAssign::AExt:
2298       Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2299       break;
2300     default:
2301       llvm_unreachable("Unknown loc info!");
2302     }
2303 
2304     InVals.push_back(Val);
2305   }
2306 
2307   return Chain;
2308 }
2309 
2310 // Add code to pass the special inputs required by the features in use, separate
2311 // from the explicit user arguments present in the IR.
2312 void SITargetLowering::passSpecialInputs(
2313     CallLoweringInfo &CLI,
2314     CCState &CCInfo,
2315     const SIMachineFunctionInfo &Info,
2316     SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
2317     SmallVectorImpl<SDValue> &MemOpChains,
2318     SDValue Chain) const {
2319   // If we don't have a call site, this was a call inserted by
2320   // legalization. These can never use special inputs.
2321   if (!CLI.CS)
2322     return;
2323 
2324   const Function *CalleeFunc = CLI.CS.getCalledFunction();
2325   assert(CalleeFunc);
2326 
2327   SelectionDAG &DAG = CLI.DAG;
2328   const SDLoc &DL = CLI.DL;
2329 
2330   const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
2331 
2332   auto &ArgUsageInfo =
2333     DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
2334   const AMDGPUFunctionArgInfo &CalleeArgInfo
2335     = ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
2336 
2337   const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
2338 
2339   // TODO: Unify with private memory register handling. This is complicated by
2340   // the fact that at least in kernels, the input argument is not necessarily
2341   // in the same location as the input.
2342   AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = {
2343     AMDGPUFunctionArgInfo::DISPATCH_PTR,
2344     AMDGPUFunctionArgInfo::QUEUE_PTR,
2345     AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR,
2346     AMDGPUFunctionArgInfo::DISPATCH_ID,
2347     AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
2348     AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
2349     AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
2350     AMDGPUFunctionArgInfo::WORKITEM_ID_X,
2351     AMDGPUFunctionArgInfo::WORKITEM_ID_Y,
2352     AMDGPUFunctionArgInfo::WORKITEM_ID_Z,
2353     AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR
2354   };
2355 
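  // Forward each special input the callee needs, reusing the caller's incoming
  // value when one is available, either in the expected register or on the
  // stack.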
2356   for (auto InputID : InputRegs) {
2357     const ArgDescriptor *OutgoingArg;
2358     const TargetRegisterClass *ArgRC;
2359 
2360     std::tie(OutgoingArg, ArgRC) = CalleeArgInfo.getPreloadedValue(InputID);
2361     if (!OutgoingArg)
2362       continue;
2363 
2364     const ArgDescriptor *IncomingArg;
2365     const TargetRegisterClass *IncomingArgRC;
2366     std::tie(IncomingArg, IncomingArgRC)
2367       = CallerArgInfo.getPreloadedValue(InputID);
2368     assert(IncomingArgRC == ArgRC);
2369 
2370     // All special arguments are ints for now.
2371     EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
2372     SDValue InputReg;
2373 
2374     if (IncomingArg) {
2375       InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
2376     } else {
2377       // The implicit arg ptr is special because it doesn't have a corresponding
2378       // input for kernels, and is computed from the kernarg segment pointer.
2379       assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
2380       InputReg = getImplicitArgPtr(DAG, DL);
2381     }
2382 
2383     if (OutgoingArg->isRegister()) {
2384       RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
2385     } else {
2386       unsigned SpecialArgOffset = CCInfo.AllocateStack(ArgVT.getStoreSize(), 4);
2387       SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
2388                                               SpecialArgOffset);
2389       MemOpChains.push_back(ArgStore);
2390     }
2391   }
2392 }
2393 
2394 static bool canGuaranteeTCO(CallingConv::ID CC) {
2395   return CC == CallingConv::Fast;
2396 }
2397 
2398 /// Return true if we might ever do TCO for calls with this calling convention.
2399 static bool mayTailCallThisCC(CallingConv::ID CC) {
2400   switch (CC) {
2401   case CallingConv::C:
2402     return true;
2403   default:
2404     return canGuaranteeTCO(CC);
2405   }
2406 }
2407 
2408 bool SITargetLowering::isEligibleForTailCallOptimization(
2409     SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
2410     const SmallVectorImpl<ISD::OutputArg> &Outs,
2411     const SmallVectorImpl<SDValue> &OutVals,
2412     const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
2413   if (!mayTailCallThisCC(CalleeCC))
2414     return false;
2415 
2416   MachineFunction &MF = DAG.getMachineFunction();
2417   const Function &CallerF = MF.getFunction();
2418   CallingConv::ID CallerCC = CallerF.getCallingConv();
2419   const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2420   const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
2421 
2422   // Kernels aren't callable and don't have a live-in return address, so it
2423   // doesn't make sense to do a tail call with entry functions.
2424   if (!CallerPreserved)
2425     return false;
2426 
2427   bool CCMatch = CallerCC == CalleeCC;
2428 
2429   if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
2430     if (canGuaranteeTCO(CalleeCC) && CCMatch)
2431       return true;
2432     return false;
2433   }
2434 
2435   // TODO: Can we handle var args?
2436   if (IsVarArg)
2437     return false;
2438 
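  // Be conservative if the caller takes byval arguments; their stack copies
  // may overlap the incoming argument area a tail call would reuse.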
2439   for (const Argument &Arg : CallerF.args()) {
2440     if (Arg.hasByValAttr())
2441       return false;
2442   }
2443 
2444   LLVMContext &Ctx = *DAG.getContext();
2445 
2446   // Check that the call results are passed in the same way.
2447   if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
2448                                   CCAssignFnForCall(CalleeCC, IsVarArg),
2449                                   CCAssignFnForCall(CallerCC, IsVarArg)))
2450     return false;
2451 
2452   // The callee has to preserve all registers the caller needs to preserve.
2453   if (!CCMatch) {
2454     const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
2455     if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
2456       return false;
2457   }
2458 
2459   // Nothing more to check if the callee is taking no arguments.
2460   if (Outs.empty())
2461     return true;
2462 
2463   SmallVector<CCValAssign, 16> ArgLocs;
2464   CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
2465 
2466   CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
2467 
2468   const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
2469   // If the stack arguments for this call do not fit into our own save area,
2470   // then the call cannot be a tail call.
2471   // TODO: Is this really necessary?
2472   if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
2473     return false;
2474 
2475   const MachineRegisterInfo &MRI = MF.getRegInfo();
2476   return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
2477 }
2478 
2479 bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
2480   if (!CI->isTailCall())
2481     return false;
2482 
2483   const Function *ParentFn = CI->getParent()->getParent();
2484   if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
2485     return false;
2486 
2487   auto Attr = ParentFn->getFnAttribute("disable-tail-calls");
2488   return (Attr.getValueAsString() != "true");
2489 }
2490 
2491 // The wave scratch offset register is used as the global base pointer.
2492 SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
2493                                     SmallVectorImpl<SDValue> &InVals) const {
2494   SelectionDAG &DAG = CLI.DAG;
2495   const SDLoc &DL = CLI.DL;
2496   SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
2497   SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
2498   SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
2499   SDValue Chain = CLI.Chain;
2500   SDValue Callee = CLI.Callee;
2501   bool &IsTailCall = CLI.IsTailCall;
2502   CallingConv::ID CallConv = CLI.CallConv;
2503   bool IsVarArg = CLI.IsVarArg;
2504   bool IsSibCall = false;
2505   bool IsThisReturn = false;
2506   MachineFunction &MF = DAG.getMachineFunction();
2507 
2508   if (IsVarArg) {
2509     return lowerUnhandledCall(CLI, InVals,
2510                               "unsupported call to variadic function ");
2511   }
2512 
2513   if (!CLI.CS.getInstruction())
2514     report_fatal_error("unsupported libcall legalization");
2515 
2516   if (!CLI.CS.getCalledFunction()) {
2517     return lowerUnhandledCall(CLI, InVals,
2518                               "unsupported indirect call to function ");
2519   }
2520 
2521   if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
2522     return lowerUnhandledCall(CLI, InVals,
2523                               "unsupported required tail call to function ");
2524   }
2525 
2526   if (AMDGPU::isShader(MF.getFunction().getCallingConv())) {
2527     // Note the issue is with the CC of the calling function, not of the call
2528     // itself.
2529     return lowerUnhandledCall(CLI, InVals,
2530                           "unsupported call from graphics shader of function ");
2531   }
2532 
2534   if (IsTailCall) {
2535     IsTailCall = isEligibleForTailCallOptimization(
2536       Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
2537     if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall()) {
2538       report_fatal_error("failed to perform tail call elimination on a call "
2539                          "site marked musttail");
2540     }
2541 
2542     bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
2543 
2544     // A sibling call is one where we're under the usual C ABI and not planning
2545     // to change that but can still do a tail call:
2546     if (!TailCallOpt && IsTailCall)
2547       IsSibCall = true;
2548 
2549     if (IsTailCall)
2550       ++NumTailCalls;
2551   }
2552 
2553   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2554 
2555   // Analyze operands of the call, assigning locations to each operand.
2556   SmallVector<CCValAssign, 16> ArgLocs;
2557   CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
2558   CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
2559 
2560   // The first 4 bytes are reserved for the callee's emergency stack slot.
2561   CCInfo.AllocateStack(4, 4);
2562 
2563   CCInfo.AnalyzeCallOperands(Outs, AssignFn);
2564 
2565   // Get a count of how many bytes are to be pushed on the stack.
2566   unsigned NumBytes = CCInfo.getNextStackOffset();
2567 
2568   if (IsSibCall) {
2569     // Since we're not changing the ABI to make this a tail call, the memory
2570     // operands are already available in the caller's incoming argument space.
2571     NumBytes = 0;
2572   }
2573 
2574   // FPDiff is the byte offset of the call's argument area from the callee's.
2575   // Stores to callee stack arguments will be placed in FixedStackSlots offset
2576   // by this amount for a tail call. In a sibling call it must be 0 because the
2577   // caller will deallocate the entire stack and the callee still expects its
2578   // arguments to begin at SP+0. Completely unused for non-tail calls.
2579   int32_t FPDiff = 0;
2580   MachineFrameInfo &MFI = MF.getFrameInfo();
2581   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2582 
2583   SDValue CallerSavedFP;
2584 
2585   // Adjust the stack pointer for the new arguments...
2586   // These operations are automatically eliminated by the prolog/epilog pass
2587   if (!IsSibCall) {
2588     Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
2589 
2590     SmallVector<SDValue, 4> CopyFromChains;
2591 
2592     unsigned OffsetReg = Info->getScratchWaveOffsetReg();
2593 
2594     // In the HSA case, this should be an identity copy.
2595     SDValue ScratchRSrcReg
2596       = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
2597     RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
2598     CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
2599 
2600     // TODO: Don't hardcode these registers; get them from the callee function.
2601     SDValue ScratchWaveOffsetReg
2602       = DAG.getCopyFromReg(Chain, DL, OffsetReg, MVT::i32);
2603     RegsToPass.emplace_back(AMDGPU::SGPR4, ScratchWaveOffsetReg);
2604     CopyFromChains.push_back(ScratchWaveOffsetReg.getValue(1));
2605 
2606     if (!Info->isEntryFunction()) {
2607       // Avoid clobbering this function's FP value. In the current convention
2608       // the callee will overwrite it, so do a save/restore around the call site.
2609       CallerSavedFP = DAG.getCopyFromReg(Chain, DL,
2610                                          Info->getFrameOffsetReg(), MVT::i32);
2611       CopyFromChains.push_back(CallerSavedFP.getValue(1));
2612     }
2613 
2614     Chain = DAG.getTokenFactor(DL, CopyFromChains);
2615   }
2616 
2617   SmallVector<SDValue, 8> MemOpChains;
2618   MVT PtrVT = MVT::i32;
2619 
2620   // Walk the register/memloc assignments, inserting copies/loads.
2621   for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
2622        ++i, ++realArgIdx) {
2623     CCValAssign &VA = ArgLocs[i];
2624     SDValue Arg = OutVals[realArgIdx];
2625 
2626     // Promote the value if needed.
2627     switch (VA.getLocInfo()) {
2628     case CCValAssign::Full:
2629       break;
2630     case CCValAssign::BCvt:
2631       Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
2632       break;
2633     case CCValAssign::ZExt:
2634       Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
2635       break;
2636     case CCValAssign::SExt:
2637       Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
2638       break;
2639     case CCValAssign::AExt:
2640       Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
2641       break;
2642     case CCValAssign::FPExt:
2643       Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
2644       break;
2645     default:
2646       llvm_unreachable("Unknown loc info!");
2647     }
2648 
2649     if (VA.isRegLoc()) {
2650       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2651     } else {
2652       assert(VA.isMemLoc());
2653 
2654       SDValue DstAddr;
2655       MachinePointerInfo DstInfo;
2656 
2657       unsigned LocMemOffset = VA.getLocMemOffset();
2658       int32_t Offset = LocMemOffset;
2659 
2660       SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
2661       unsigned Align = 0;
2662 
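      // For tail calls, outgoing stack arguments are written into the caller's
      // own incoming argument area (offset by FPDiff); otherwise they go into
      // the outgoing call frame at the assigned offset.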
2663       if (IsTailCall) {
2664         ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
2665         unsigned OpSize = Flags.isByVal() ?
2666           Flags.getByValSize() : VA.getValVT().getStoreSize();
2667 
2668         // FIXME: We can do better than the minimum required byval alignment.
2669         Align = Flags.isByVal() ? Flags.getByValAlign() :
2670           MinAlign(Subtarget->getStackAlignment(), Offset);
2671 
2672         Offset = Offset + FPDiff;
2673         int FI = MFI.CreateFixedObject(OpSize, Offset, true);
2674 
2675         DstAddr = DAG.getFrameIndex(FI, PtrVT);
2676         DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
2677 
2678         // Make sure any stack arguments overlapping with where we're storing
2679         // are loaded before this eventual operation. Otherwise they'll be
2680         // clobbered.
2681 
2682         // FIXME: Why is this really necessary? This seems to just result in a
2683         // lot of code to copy the stack and write them back to the same
2684         // locations, which are supposed to be immutable?
2685         Chain = addTokenForArgument(Chain, DAG, MFI, FI);
2686       } else {
2687         DstAddr = PtrOff;
2688         DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
2689         Align = MinAlign(Subtarget->getStackAlignment(), LocMemOffset);
2690       }
2691 
2692       if (Outs[i].Flags.isByVal()) {
2693         SDValue SizeNode =
2694             DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
2695         SDValue Cpy = DAG.getMemcpy(
2696             Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
2697             /*isVol = */ false, /*AlwaysInline = */ true,
2698             /*isTailCall = */ false, DstInfo,
2699             MachinePointerInfo(UndefValue::get(Type::getInt8PtrTy(
2700                 *DAG.getContext(), AMDGPUAS::PRIVATE_ADDRESS))));
2701 
2702         MemOpChains.push_back(Cpy);
2703       } else {
2704         SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Align);
2705         MemOpChains.push_back(Store);
2706       }
2707     }
2708   }
2709 
2710   // Copy special input registers after user input arguments.
2711   passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
2712 
2713   if (!MemOpChains.empty())
2714     Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
2715 
2716   // Build a sequence of copy-to-reg nodes chained together with token chain
2717   // and flag operands which copy the outgoing args into the appropriate regs.
2718   SDValue InFlag;
2719   for (auto &RegToPass : RegsToPass) {
2720     Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
2721                              RegToPass.second, InFlag);
2722     InFlag = Chain.getValue(1);
2723   }
2724 
2725 
2726   SDValue PhysReturnAddrReg;
2727   if (IsTailCall) {
2728     // Since the return is being combined with the call, we need to pass on the
2729     // return address.
2730 
2731     const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2732     SDValue ReturnAddrReg = CreateLiveInRegister(
2733       DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
2734 
2735     PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
2736                                         MVT::i64);
2737     Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, InFlag);
2738     InFlag = Chain.getValue(1);
2739   }
2740 
  // We don't usually want to end the call-sequence here because we would tidy
  // the frame up *after* the call. However, in the ABI-changing tail-call case
  // we've carefully laid out the parameters so that when sp is reset they'll be
  // in the correct location.
2745   if (IsTailCall && !IsSibCall) {
2746     Chain = DAG.getCALLSEQ_END(Chain,
2747                                DAG.getTargetConstant(NumBytes, DL, MVT::i32),
2748                                DAG.getTargetConstant(0, DL, MVT::i32),
2749                                InFlag, DL);
2750     InFlag = Chain.getValue(1);
2751   }
2752 
2753   std::vector<SDValue> Ops;
2754   Ops.push_back(Chain);
2755   Ops.push_back(Callee);
2756   // Add a redundant copy of the callee global which will not be legalized, as
2757   // we need direct access to the callee later.
2758   GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Callee);
2759   const GlobalValue *GV = GSD->getGlobal();
2760   Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
2761 
2762   if (IsTailCall) {
2763     // Each tail call may have to adjust the stack by a different amount, so
2764     // this information must travel along with the operation for eventual
2765     // consumption by emitEpilogue.
2766     Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
2767 
2768     Ops.push_back(PhysReturnAddrReg);
2769   }
2770 
2771   // Add argument registers to the end of the list so that they are known live
2772   // into the call.
2773   for (auto &RegToPass : RegsToPass) {
2774     Ops.push_back(DAG.getRegister(RegToPass.first,
2775                                   RegToPass.second.getValueType()));
2776   }
2777 
2778   // Add a register mask operand representing the call-preserved registers.
2779 
2780   auto *TRI = static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo());
2781   const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
2782   assert(Mask && "Missing call preserved mask for calling convention");
2783   Ops.push_back(DAG.getRegisterMask(Mask));
2784 
2785   if (InFlag.getNode())
2786     Ops.push_back(InFlag);
2787 
2788   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
2789 
  // If we're doing a tail call, use a TC_RETURN here rather than an
2791   // actual call instruction.
2792   if (IsTailCall) {
2793     MFI.setHasTailCall();
2794     return DAG.getNode(AMDGPUISD::TC_RETURN, DL, NodeTys, Ops);
2795   }
2796 
2797   // Returns a chain and a flag for retval copy to use.
2798   SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
2799   Chain = Call.getValue(0);
2800   InFlag = Call.getValue(1);
2801 
2802   if (CallerSavedFP) {
2803     SDValue FPReg = DAG.getRegister(Info->getFrameOffsetReg(), MVT::i32);
2804     Chain = DAG.getCopyToReg(Chain, DL, FPReg, CallerSavedFP, InFlag);
2805     InFlag = Chain.getValue(1);
2806   }
2807 
2808   uint64_t CalleePopBytes = NumBytes;
2809   Chain = DAG.getCALLSEQ_END(Chain, DAG.getTargetConstant(0, DL, MVT::i32),
2810                              DAG.getTargetConstant(CalleePopBytes, DL, MVT::i32),
2811                              InFlag, DL);
2812   if (!Ins.empty())
2813     InFlag = Chain.getValue(1);
2814 
2815   // Handle result values, copying them out of physregs into vregs that we
2816   // return.
2817   return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
2818                          InVals, IsThisReturn,
2819                          IsThisReturn ? OutVals[0] : SDValue());
2820 }
2821 
2822 unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
2823                                              SelectionDAG &DAG) const {
2824   unsigned Reg = StringSwitch<unsigned>(RegName)
2825     .Case("m0", AMDGPU::M0)
2826     .Case("exec", AMDGPU::EXEC)
2827     .Case("exec_lo", AMDGPU::EXEC_LO)
2828     .Case("exec_hi", AMDGPU::EXEC_HI)
2829     .Case("flat_scratch", AMDGPU::FLAT_SCR)
2830     .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
2831     .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
2832     .Default(AMDGPU::NoRegister);
2833 
  if (Reg == AMDGPU::NoRegister) {
    report_fatal_error(Twine("invalid register name \""
                             + StringRef(RegName) + "\"."));
  }
2839 
2840   if ((Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||
2841        Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) &&
2842        Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
2843     report_fatal_error(Twine("invalid register \""
2844                              + StringRef(RegName)  + "\" for subtarget."));
2845   }
2846 
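  // Check that the register width matches the requested type.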
2847   switch (Reg) {
2848   case AMDGPU::M0:
2849   case AMDGPU::EXEC_LO:
2850   case AMDGPU::EXEC_HI:
2851   case AMDGPU::FLAT_SCR_LO:
2852   case AMDGPU::FLAT_SCR_HI:
2853     if (VT.getSizeInBits() == 32)
2854       return Reg;
2855     break;
2856   case AMDGPU::EXEC:
2857   case AMDGPU::FLAT_SCR:
2858     if (VT.getSizeInBits() == 64)
2859       return Reg;
2860     break;
2861   default:
2862     llvm_unreachable("missing register type checking");
2863   }
2864 
2865   report_fatal_error(Twine("invalid type for register \""
2866                            + StringRef(RegName) + "\"."));
2867 }
2868 
2869 // If kill is not the last instruction, split the block so kill is always a
2870 // proper terminator.
2871 MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI,
2872                                                     MachineBasicBlock *BB) const {
2873   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
2874 
2875   MachineBasicBlock::iterator SplitPoint(&MI);
2876   ++SplitPoint;
2877 
2878   if (SplitPoint == BB->end()) {
2879     // Don't bother with a new block.
2880     MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
2881     return BB;
2882   }
2883 
2884   MachineFunction *MF = BB->getParent();
2885   MachineBasicBlock *SplitBB
2886     = MF->CreateMachineBasicBlock(BB->getBasicBlock());
2887 
2888   MF->insert(++MachineFunction::iterator(BB), SplitBB);
2889   SplitBB->splice(SplitBB->begin(), BB, SplitPoint, BB->end());
2890 
2891   SplitBB->transferSuccessorsAndUpdatePHIs(BB);
2892   BB->addSuccessor(SplitBB);
2893 
2894   MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
2895   return SplitBB;
2896 }
2897 
2898 // Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
2899 // wavefront. If the value is uniform and just happens to be in a VGPR, this
2900 // will only do one iteration. In the worst case, this will loop 64 times.
2901 //
2902 // TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
2903 static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
2904   const SIInstrInfo *TII,
2905   MachineRegisterInfo &MRI,
2906   MachineBasicBlock &OrigBB,
2907   MachineBasicBlock &LoopBB,
2908   const DebugLoc &DL,
2909   const MachineOperand &IdxReg,
2910   unsigned InitReg,
2911   unsigned ResultReg,
2912   unsigned PhiReg,
2913   unsigned InitSaveExecReg,
2914   int Offset,
2915   bool UseGPRIdxMode,
2916   bool IsIndirectSrc) {
2917   MachineBasicBlock::iterator I = LoopBB.begin();
2918 
2919   unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2920   unsigned NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2921   unsigned CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
2922   unsigned CondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2923 
2924   BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
2925     .addReg(InitReg)
2926     .addMBB(&OrigBB)
2927     .addReg(ResultReg)
2928     .addMBB(&LoopBB);
2929 
2930   BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
2931     .addReg(InitSaveExecReg)
2932     .addMBB(&OrigBB)
2933     .addReg(NewExec)
2934     .addMBB(&LoopBB);
2935 
2936   // Read the next variant <- also loop target.
2937   BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
2938     .addReg(IdxReg.getReg(), getUndefRegState(IdxReg.isUndef()));
2939 
  // Compare the just-read index value to the index in every lane.
2941   BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
2942     .addReg(CurrentIdxReg)
2943     .addReg(IdxReg.getReg(), 0, IdxReg.getSubReg());
2944 
2945   // Update EXEC, save the original EXEC value to VCC.
2946   BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec)
2947     .addReg(CondReg, RegState::Kill);
2948 
2949   MRI.setSimpleHint(NewExec, CondReg);
2950 
2951   if (UseGPRIdxMode) {
2952     unsigned IdxReg;
2953     if (Offset == 0) {
2954       IdxReg = CurrentIdxReg;
2955     } else {
2956       IdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
2957       BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), IdxReg)
2958         .addReg(CurrentIdxReg, RegState::Kill)
2959         .addImm(Offset);
2960     }
2961     unsigned IdxMode = IsIndirectSrc ?
2962       AMDGPU::VGPRIndexMode::SRC0_ENABLE : AMDGPU::VGPRIndexMode::DST_ENABLE;
2963     MachineInstr *SetOn =
2964       BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
2965       .addReg(IdxReg, RegState::Kill)
2966       .addImm(IdxMode);
2967     SetOn->getOperand(3).setIsUndef();
2968   } else {
    // Move the just-read index into M0, adding any constant offset.
2970     if (Offset == 0) {
2971       BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2972         .addReg(CurrentIdxReg, RegState::Kill);
2973     } else {
2974       BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
2975         .addReg(CurrentIdxReg, RegState::Kill)
2976         .addImm(Offset);
2977     }
2978   }
2979 
2980   // Update EXEC, switch all done bits to 0 and all todo bits to 1.
2981   MachineInstr *InsertPt =
2982     BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64_term), AMDGPU::EXEC)
2983     .addReg(AMDGPU::EXEC)
2984     .addReg(NewExec);
2985 
2986   // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
2987   // s_cbranch_scc0?
2988 
2989   // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
2990   BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
2991     .addMBB(&LoopBB);
2992 
2993   return InsertPt->getIterator();
2994 }
2995 
// This has slightly sub-optimal regalloc when the source vector is killed by
// the read. The register allocator does not understand that the kill is
// per-workitem, so the source is kept alive for the whole loop. As a result we
// do not re-use a subregister from it, using 1 more VGPR than necessary. That
// extra VGPR was saved when this was expanded after register allocation.
3001 static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
3002                                                   MachineBasicBlock &MBB,
3003                                                   MachineInstr &MI,
3004                                                   unsigned InitResultReg,
3005                                                   unsigned PhiReg,
3006                                                   int Offset,
3007                                                   bool UseGPRIdxMode,
3008                                                   bool IsIndirectSrc) {
3009   MachineFunction *MF = MBB.getParent();
3010   MachineRegisterInfo &MRI = MF->getRegInfo();
3011   const DebugLoc &DL = MI.getDebugLoc();
3012   MachineBasicBlock::iterator I(&MI);
3013 
3014   unsigned DstReg = MI.getOperand(0).getReg();
3015   unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
3016   unsigned TmpExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
3017 
3018   BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
3019 
3020   // Save the EXEC mask
3021   BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), SaveExec)
3022     .addReg(AMDGPU::EXEC);
3023 
3024   // To insert the loop we need to split the block. Move everything after this
3025   // point to a new block, and insert a new empty block between the two.
3026   MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
3027   MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
3028   MachineFunction::iterator MBBI(MBB);
3029   ++MBBI;
3030 
3031   MF->insert(MBBI, LoopBB);
3032   MF->insert(MBBI, RemainderBB);
3033 
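  // The loop branches back to itself until all lanes have been handled, and
  // then falls through to the remainder block.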
3034   LoopBB->addSuccessor(LoopBB);
3035   LoopBB->addSuccessor(RemainderBB);
3036 
3037   // Move the rest of the block into a new block.
3038   RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
3039   RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
3040 
3041   MBB.addSuccessor(LoopBB);
3042 
3043   const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3044 
3045   auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
3046                                       InitResultReg, DstReg, PhiReg, TmpExec,
3047                                       Offset, UseGPRIdxMode, IsIndirectSrc);
3048 
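  // Restore the saved EXEC mask at the start of the remainder block.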
3049   MachineBasicBlock::iterator First = RemainderBB->begin();
3050   BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
3051     .addReg(SaveExec);
3052 
3053   return InsPt;
3054 }
3055 
3056 // Returns subreg index, offset
3057 static std::pair<unsigned, int>
3058 computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
3059                             const TargetRegisterClass *SuperRC,
3060                             unsigned VecReg,
3061                             int Offset) {
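  // Each vector element occupies one 32-bit subregister.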
3062   int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
3063 
3064   // Skip out of bounds offsets, or else we would end up using an undefined
3065   // register.
3066   if (Offset >= NumElts || Offset < 0)
3067     return std::make_pair(AMDGPU::sub0, Offset);
3068 
3069   return std::make_pair(AMDGPU::sub0 + Offset, 0);
3070 }
3071 
3072 // Return true if the index is an SGPR and was set.
3073 static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII,
3074                                  MachineRegisterInfo &MRI,
3075                                  MachineInstr &MI,
3076                                  int Offset,
3077                                  bool UseGPRIdxMode,
3078                                  bool IsIndirectSrc) {
3079   MachineBasicBlock *MBB = MI.getParent();
3080   const DebugLoc &DL = MI.getDebugLoc();
3081   MachineBasicBlock::iterator I(&MI);
3082 
3083   const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3084   const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
3085 
3086   assert(Idx->getReg() != AMDGPU::NoRegister);
3087 
3088   if (!TII->getRegisterInfo().isSGPRClass(IdxRC))
3089     return false;
3090 
3091   if (UseGPRIdxMode) {
3092     unsigned IdxMode = IsIndirectSrc ?
3093       AMDGPU::VGPRIndexMode::SRC0_ENABLE : AMDGPU::VGPRIndexMode::DST_ENABLE;
3094     if (Offset == 0) {
3095       MachineInstr *SetOn =
3096           BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
3097               .add(*Idx)
3098               .addImm(IdxMode);
3099 
3100       SetOn->getOperand(3).setIsUndef();
3101     } else {
3102       unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3103       BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
3104           .add(*Idx)
3105           .addImm(Offset);
3106       MachineInstr *SetOn =
3107         BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
3108         .addReg(Tmp, RegState::Kill)
3109         .addImm(IdxMode);
3110 
3111       SetOn->getOperand(3).setIsUndef();
3112     }
3113 
3114     return true;
3115   }
3116 
3117   if (Offset == 0) {
3118     BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
3119       .add(*Idx);
3120   } else {
3121     BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
3122       .add(*Idx)
3123       .addImm(Offset);
3124   }
3125 
3126   return true;
3127 }
3128 
3129 // Control flow needs to be inserted if indexing with a VGPR.
3130 static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
3131                                           MachineBasicBlock &MBB,
3132                                           const GCNSubtarget &ST) {
3133   const SIInstrInfo *TII = ST.getInstrInfo();
3134   const SIRegisterInfo &TRI = TII->getRegisterInfo();
3135   MachineFunction *MF = MBB.getParent();
3136   MachineRegisterInfo &MRI = MF->getRegInfo();
3137 
3138   unsigned Dst = MI.getOperand(0).getReg();
3139   unsigned SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
3140   int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
3141 
3142   const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
3143 
3144   unsigned SubReg;
3145   std::tie(SubReg, Offset)
3146     = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
3147 
3148   bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);
3149 
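  // If the index is uniform (held in an SGPR) we can write it to M0 or the
  // GPR index register directly; otherwise emit a waterfall loop below.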
3150   if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, true)) {
3151     MachineBasicBlock::iterator I(&MI);
3152     const DebugLoc &DL = MI.getDebugLoc();
3153 
3154     if (UseGPRIdxMode) {
3155       // TODO: Look at the uses to avoid the copy. This may require rescheduling
3156       // to avoid interfering with other uses, so probably requires a new
3157       // optimization pass.
3158       BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
3159         .addReg(SrcReg, RegState::Undef, SubReg)
3160         .addReg(SrcReg, RegState::Implicit)
3161         .addReg(AMDGPU::M0, RegState::Implicit);
3162       BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
3163     } else {
3164       BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
3165         .addReg(SrcReg, RegState::Undef, SubReg)
3166         .addReg(SrcReg, RegState::Implicit);
3167     }
3168 
3169     MI.eraseFromParent();
3170 
3171     return &MBB;
3172   }
3173 
3174   const DebugLoc &DL = MI.getDebugLoc();
3175   MachineBasicBlock::iterator I(&MI);
3176 
3177   unsigned PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3178   unsigned InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3179 
3180   BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
3181 
3182   auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg,
3183                               Offset, UseGPRIdxMode, true);
3184   MachineBasicBlock *LoopBB = InsPt->getParent();
3185 
3186   if (UseGPRIdxMode) {
3187     BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_e32), Dst)
3188       .addReg(SrcReg, RegState::Undef, SubReg)
3189       .addReg(SrcReg, RegState::Implicit)
3190       .addReg(AMDGPU::M0, RegState::Implicit);
3191     BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
3192   } else {
3193     BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
3194       .addReg(SrcReg, RegState::Undef, SubReg)
3195       .addReg(SrcReg, RegState::Implicit);
3196   }
3197 
3198   MI.eraseFromParent();
3199 
3200   return LoopBB;
3201 }
3202 
3203 static unsigned getMOVRELDPseudo(const SIRegisterInfo &TRI,
3204                                  const TargetRegisterClass *VecRC) {
3205   switch (TRI.getRegSizeInBits(*VecRC)) {
3206   case 32: // 4 bytes
3207     return AMDGPU::V_MOVRELD_B32_V1;
3208   case 64: // 8 bytes
3209     return AMDGPU::V_MOVRELD_B32_V2;
3210   case 128: // 16 bytes
3211     return AMDGPU::V_MOVRELD_B32_V4;
3212   case 256: // 32 bytes
3213     return AMDGPU::V_MOVRELD_B32_V8;
3214   case 512: // 64 bytes
3215     return AMDGPU::V_MOVRELD_B32_V16;
3216   default:
3217     llvm_unreachable("unsupported size for MOVRELD pseudos");
3218   }
3219 }
3220 
3221 static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
3222                                           MachineBasicBlock &MBB,
3223                                           const GCNSubtarget &ST) {
3224   const SIInstrInfo *TII = ST.getInstrInfo();
3225   const SIRegisterInfo &TRI = TII->getRegisterInfo();
3226   MachineFunction *MF = MBB.getParent();
3227   MachineRegisterInfo &MRI = MF->getRegInfo();
3228 
3229   unsigned Dst = MI.getOperand(0).getReg();
3230   const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
3231   const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3232   const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
3233   int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
3234   const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
3235 
3236   // This can be an immediate, but will be folded later.
3237   assert(Val->getReg());
3238 
3239   unsigned SubReg;
3240   std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
3241                                                          SrcVec->getReg(),
3242                                                          Offset);
3243   bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);
3244 
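  // With no index register the destination subregister is statically known,
  // so a plain INSERT_SUBREG is sufficient.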
3245   if (Idx->getReg() == AMDGPU::NoRegister) {
3246     MachineBasicBlock::iterator I(&MI);
3247     const DebugLoc &DL = MI.getDebugLoc();
3248 
3249     assert(Offset == 0);
3250 
3251     BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
3252         .add(*SrcVec)
3253         .add(*Val)
3254         .addImm(SubReg);
3255 
3256     MI.eraseFromParent();
3257     return &MBB;
3258   }
3259 
3260   if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, false)) {
3261     MachineBasicBlock::iterator I(&MI);
3262     const DebugLoc &DL = MI.getDebugLoc();
3263 
3264     if (UseGPRIdxMode) {
3265       BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
3266           .addReg(SrcVec->getReg(), RegState::Undef, SubReg) // vdst
3267           .add(*Val)
3268           .addReg(Dst, RegState::ImplicitDefine)
3269           .addReg(SrcVec->getReg(), RegState::Implicit)
3270           .addReg(AMDGPU::M0, RegState::Implicit);
3271 
3272       BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
3273     } else {
3274       const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
3275 
3276       BuildMI(MBB, I, DL, MovRelDesc)
3277           .addReg(Dst, RegState::Define)
3278           .addReg(SrcVec->getReg())
3279           .add(*Val)
3280           .addImm(SubReg - AMDGPU::sub0);
3281     }
3282 
3283     MI.eraseFromParent();
3284     return &MBB;
3285   }
3286 
3287   if (Val->isReg())
3288     MRI.clearKillFlags(Val->getReg());
3289 
3290   const DebugLoc &DL = MI.getDebugLoc();
3291 
3292   unsigned PhiReg = MRI.createVirtualRegister(VecRC);
3293 
3294   auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg,
3295                               Offset, UseGPRIdxMode, false);
3296   MachineBasicBlock *LoopBB = InsPt->getParent();
3297 
3298   if (UseGPRIdxMode) {
3299     BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
3300         .addReg(PhiReg, RegState::Undef, SubReg) // vdst
3301         .add(*Val)                               // src0
3302         .addReg(Dst, RegState::ImplicitDefine)
3303         .addReg(PhiReg, RegState::Implicit)
3304         .addReg(AMDGPU::M0, RegState::Implicit);
3305     BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
3306   } else {
3307     const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
3308 
3309     BuildMI(*LoopBB, InsPt, DL, MovRelDesc)
3310         .addReg(Dst, RegState::Define)
3311         .addReg(PhiReg)
3312         .add(*Val)
3313         .addImm(SubReg - AMDGPU::sub0);
3314   }
3315 
3316   MI.eraseFromParent();
3317 
3318   return LoopBB;
3319 }
3320 
3321 MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
3322   MachineInstr &MI, MachineBasicBlock *BB) const {
3323 
3324   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3325   MachineFunction *MF = BB->getParent();
3326   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
3327 
3328   if (TII->isMIMG(MI)) {
3329     if (MI.memoperands_empty() && MI.mayLoadOrStore()) {
3330       report_fatal_error("missing mem operand from MIMG instruction");
3331     }
3332     // Add a memoperand for mimg instructions so that they aren't assumed to
    // be ordered memory instructions.
3334 
3335     return BB;
3336   }
3337 
3338   switch (MI.getOpcode()) {
3339   case AMDGPU::S_ADD_U64_PSEUDO:
3340   case AMDGPU::S_SUB_U64_PSEUDO: {
3341     MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
3342     const DebugLoc &DL = MI.getDebugLoc();
3343 
3344     MachineOperand &Dest = MI.getOperand(0);
3345     MachineOperand &Src0 = MI.getOperand(1);
3346     MachineOperand &Src1 = MI.getOperand(2);
3347 
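    // Expand the 64-bit scalar add/sub into two 32-bit halves joined by a
    // carry chain.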
3348     unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3349     unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3350 
3351     MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
3352      Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub0,
3353      &AMDGPU::SReg_32_XM0RegClass);
3354     MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
3355       Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub1,
3356       &AMDGPU::SReg_32_XM0RegClass);
3357 
3358     MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
3359       Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub0,
3360       &AMDGPU::SReg_32_XM0RegClass);
3361     MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
3362       Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub1,
3363       &AMDGPU::SReg_32_XM0RegClass);
3364 
3365     bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
3366 
3367     unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
3368     unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
3369     BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
3370       .add(Src0Sub0)
3371       .add(Src1Sub0);
3372     BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
3373       .add(Src0Sub1)
3374       .add(Src1Sub1);
3375     BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
3376       .addReg(DestSub0)
3377       .addImm(AMDGPU::sub0)
3378       .addReg(DestSub1)
3379       .addImm(AMDGPU::sub1);
3380     MI.eraseFromParent();
3381     return BB;
3382   }
3383   case AMDGPU::SI_INIT_M0: {
3384     BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
3385             TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
3386         .add(MI.getOperand(0));
3387     MI.eraseFromParent();
3388     return BB;
3389   }
3390   case AMDGPU::SI_INIT_EXEC:
3391     // This should be before all vector instructions.
3392     BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
3393             AMDGPU::EXEC)
3394         .addImm(MI.getOperand(0).getImm());
3395     MI.eraseFromParent();
3396     return BB;
3397 
3398   case AMDGPU::SI_INIT_EXEC_FROM_INPUT: {
3399     // Extract the thread count from an SGPR input and set EXEC accordingly.
3400     // Since BFM can't shift by 64, handle that case with CMP + CMOV.
3401     //
3402     // S_BFE_U32 count, input, {shift, 7}
3403     // S_BFM_B64 exec, count, 0
3404     // S_CMP_EQ_U32 count, 64
3405     // S_CMOV_B64 exec, -1
3406     MachineInstr *FirstMI = &*BB->begin();
3407     MachineRegisterInfo &MRI = MF->getRegInfo();
3408     unsigned InputReg = MI.getOperand(0).getReg();
3409     unsigned CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3410     bool Found = false;
3411 
3412     // Move the COPY of the input reg to the beginning, so that we can use it.
3413     for (auto I = BB->begin(); I != &MI; I++) {
3414       if (I->getOpcode() != TargetOpcode::COPY ||
3415           I->getOperand(0).getReg() != InputReg)
3416         continue;
3417 
3418       if (I == FirstMI) {
3419         FirstMI = &*++BB->begin();
3420       } else {
3421         I->removeFromParent();
3422         BB->insert(FirstMI, &*I);
3423       }
3424       Found = true;
3425       break;
3426     }
3427     assert(Found);
3428     (void)Found;
3429 
3430     // This should be before all vector instructions.
3431     BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg)
3432         .addReg(InputReg)
3433         .addImm((MI.getOperand(1).getImm() & 0x7f) | 0x70000);
3434     BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFM_B64),
3435             AMDGPU::EXEC)
3436         .addReg(CountReg)
3437         .addImm(0);
3438     BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMP_EQ_U32))
3439         .addReg(CountReg, RegState::Kill)
3440         .addImm(64);
3441     BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMOV_B64),
3442             AMDGPU::EXEC)
3443         .addImm(-1);
3444     MI.eraseFromParent();
3445     return BB;
3446   }
3447 
3448   case AMDGPU::GET_GROUPSTATICSIZE: {
3449     DebugLoc DL = MI.getDebugLoc();
3450     BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
3451         .add(MI.getOperand(0))
3452         .addImm(MFI->getLDSSize());
3453     MI.eraseFromParent();
3454     return BB;
3455   }
3456   case AMDGPU::SI_INDIRECT_SRC_V1:
3457   case AMDGPU::SI_INDIRECT_SRC_V2:
3458   case AMDGPU::SI_INDIRECT_SRC_V4:
3459   case AMDGPU::SI_INDIRECT_SRC_V8:
3460   case AMDGPU::SI_INDIRECT_SRC_V16:
3461     return emitIndirectSrc(MI, *BB, *getSubtarget());
3462   case AMDGPU::SI_INDIRECT_DST_V1:
3463   case AMDGPU::SI_INDIRECT_DST_V2:
3464   case AMDGPU::SI_INDIRECT_DST_V4:
3465   case AMDGPU::SI_INDIRECT_DST_V8:
3466   case AMDGPU::SI_INDIRECT_DST_V16:
3467     return emitIndirectDst(MI, *BB, *getSubtarget());
3468   case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
3469   case AMDGPU::SI_KILL_I1_PSEUDO:
3470     return splitKillBlock(MI, BB);
3471   case AMDGPU::V_CNDMASK_B64_PSEUDO: {
3472     MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
3473 
3474     unsigned Dst = MI.getOperand(0).getReg();
3475     unsigned Src0 = MI.getOperand(1).getReg();
3476     unsigned Src1 = MI.getOperand(2).getReg();
3477     const DebugLoc &DL = MI.getDebugLoc();
3478     unsigned SrcCond = MI.getOperand(3).getReg();
3479 
3480     unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3481     unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3482     unsigned SrcCondCopy = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
3483 
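    // Expand the 64-bit select into two 32-bit V_CNDMASKs over the low and
    // high halves, sharing a single copy of the condition.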
3484     BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy)
3485       .addReg(SrcCond);
3486     BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
3487       .addImm(0)
3488       .addReg(Src0, 0, AMDGPU::sub0)
3489       .addImm(0)
3490       .addReg(Src1, 0, AMDGPU::sub0)
3491       .addReg(SrcCondCopy);
3492     BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
3493       .addImm(0)
3494       .addReg(Src0, 0, AMDGPU::sub1)
3495       .addImm(0)
3496       .addReg(Src1, 0, AMDGPU::sub1)
3497       .addReg(SrcCondCopy);
3498 
3499     BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
3500       .addReg(DstLo)
3501       .addImm(AMDGPU::sub0)
3502       .addReg(DstHi)
3503       .addImm(AMDGPU::sub1);
3504     MI.eraseFromParent();
3505     return BB;
3506   }
3507   case AMDGPU::SI_BR_UNDEF: {
3508     const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3509     const DebugLoc &DL = MI.getDebugLoc();
3510     MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
3511                            .add(MI.getOperand(0));
3512     Br->getOperand(1).setIsUndef(true); // read undef SCC
3513     MI.eraseFromParent();
3514     return BB;
3515   }
3516   case AMDGPU::ADJCALLSTACKUP:
3517   case AMDGPU::ADJCALLSTACKDOWN: {
3518     const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
3519     MachineInstrBuilder MIB(*MF, &MI);
3520 
3521     // Add an implicit use of the frame offset reg to prevent the restore copy
    // inserted after the call from being reordered after stack operations in
    // the caller's frame.
3524     MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
3525         .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit)
3526         .addReg(Info->getFrameOffsetReg(), RegState::Implicit);
3527     return BB;
3528   }
3529   case AMDGPU::SI_CALL_ISEL: {
3530     const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3531     const DebugLoc &DL = MI.getDebugLoc();
3532 
3533     unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
3534 
3535     MachineInstrBuilder MIB;
3536     MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
3537 
3538     for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
3539       MIB.add(MI.getOperand(I));
3540 
3541     MIB.cloneMemRefs(MI);
3542     MI.eraseFromParent();
3543     return BB;
3544   }
3545   case AMDGPU::V_ADD_I32_e32:
3546   case AMDGPU::V_SUB_I32_e32:
3547   case AMDGPU::V_SUBREV_I32_e32: {
3548     // TODO: Define distinct V_*_I32_Pseudo instructions instead.
3549     const DebugLoc &DL = MI.getDebugLoc();
3550     unsigned Opc = MI.getOpcode();
3551 
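    // If the VOP2 form cannot be encoded on this subtarget, fall back to the
    // VOP3 (e64) form, which takes an explicit clamp operand.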
3552     bool NeedClampOperand = false;
3553     if (TII->pseudoToMCOpcode(Opc) == -1) {
3554       Opc = AMDGPU::getVOPe64(Opc);
3555       NeedClampOperand = true;
3556     }
3557 
3558     auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
3559     if (TII->isVOP3(*I)) {
3560       I.addReg(AMDGPU::VCC, RegState::Define);
3561     }
3562     I.add(MI.getOperand(1))
3563      .add(MI.getOperand(2));
3564     if (NeedClampOperand)
3565       I.addImm(0); // clamp bit for e64 encoding
3566 
3567     TII->legalizeOperands(*I);
3568 
3569     MI.eraseFromParent();
3570     return BB;
3571   }
3572   default:
3573     return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
3574   }
3575 }
3576 
3577 bool SITargetLowering::hasBitPreservingFPLogic(EVT VT) const {
3578   return isTypeLegal(VT.getScalarType());
3579 }
3580 
3581 bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
3582   // This currently forces unfolding various combinations of fsub into fma with
3583   // free fneg'd operands. As long as we have fast FMA (controlled by
3584   // isFMAFasterThanFMulAndFAdd), we should perform these.
3585 
3586   // When fma is quarter rate, for f64 where add / sub are at best half rate,
3587   // most of these combines appear to be cycle neutral but save on instruction
3588   // count / code size.
3589   return true;
3590 }
3591 
3592 EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
3593                                          EVT VT) const {
3594   if (!VT.isVector()) {
3595     return MVT::i1;
3596   }
3597   return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
3598 }
3599 
3600 MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
3601   // TODO: Should i16 be used always if legal? For now it would force VALU
3602   // shifts.
3603   return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
3604 }
3605 
// Answering this is somewhat tricky and depends on the specific device, as
// different devices have different rates for fma and for f64 operations.
3608 //
3609 // v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
3610 // regardless of which device (although the number of cycles differs between
3611 // devices), so it is always profitable for f64.
3612 //
3613 // v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
3614 // only on full rate devices. Normally, we should prefer selecting v_mad_f32
3615 // which we can always do even without fused FP ops since it returns the same
3616 // result as the separate operations and since it is always full
3617 // rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
3618 // however does not support denormals, so we do report fma as faster if we have
3619 // a fast fma device and require denormals.
3620 //
3621 bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
3622   VT = VT.getScalarType();
3623 
3624   switch (VT.getSimpleVT().SimpleTy) {
3625   case MVT::f32: {
    // fma is as fast on some subtargets. However, we always have full rate f32
    // mad available, which returns the same result as the separate operations
    // and which we should prefer over fma. We can't use mad if we want to
    // support denormals, so only report fma as faster in that case.
3630     if (Subtarget->hasFP32Denormals())
3631       return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
3632 
3633     // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
3634     return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
3635   }
3636   case MVT::f64:
3637     return true;
3638   case MVT::f16:
3639     return Subtarget->has16BitInsts() && Subtarget->hasFP16Denormals();
3640   default:
3641     break;
3642   }
3643 
3644   return false;
3645 }
3646 
3647 //===----------------------------------------------------------------------===//
3648 // Custom DAG Lowering Operations
3649 //===----------------------------------------------------------------------===//
3650 
3651 // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
3652 // wider vector type is legal.
3653 SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
3654                                              SelectionDAG &DAG) const {
3655   unsigned Opc = Op.getOpcode();
3656   EVT VT = Op.getValueType();
3657   assert(VT == MVT::v4f16);
3658 
3659   SDValue Lo, Hi;
3660   std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
3661 
3662   SDLoc SL(Op);
3663   SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo,
3664                              Op->getFlags());
3665   SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi,
3666                              Op->getFlags());
3667 
3668   return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
3669 }
3670 
3671 // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
3672 // wider vector type is legal.
3673 SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
3674                                               SelectionDAG &DAG) const {
3675   unsigned Opc = Op.getOpcode();
3676   EVT VT = Op.getValueType();
3677   assert(VT == MVT::v4i16 || VT == MVT::v4f16);
3678 
3679   SDValue Lo0, Hi0;
3680   std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
3681   SDValue Lo1, Hi1;
3682   std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
3683 
3684   SDLoc SL(Op);
3685 
3686   SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1,
3687                              Op->getFlags());
3688   SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1,
3689                              Op->getFlags());
3690 
3691   return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
3692 }
3693 
3694 SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
3695   switch (Op.getOpcode()) {
3696   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
3697   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
3698   case ISD::LOAD: {
3699     SDValue Result = LowerLOAD(Op, DAG);
3700     assert((!Result.getNode() ||
3701             Result.getNode()->getNumValues() == 2) &&
3702            "Load should return a value and a chain");
3703     return Result;
3704   }
3705 
3706   case ISD::FSIN:
3707   case ISD::FCOS:
3708     return LowerTrig(Op, DAG);
3709   case ISD::SELECT: return LowerSELECT(Op, DAG);
3710   case ISD::FDIV: return LowerFDIV(Op, DAG);
3711   case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
3712   case ISD::STORE: return LowerSTORE(Op, DAG);
3713   case ISD::GlobalAddress: {
3714     MachineFunction &MF = DAG.getMachineFunction();
3715     SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
3716     return LowerGlobalAddress(MFI, Op, DAG);
3717   }
3718   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
3719   case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
3720   case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
3721   case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
3722   case ISD::INSERT_VECTOR_ELT:
3723     return lowerINSERT_VECTOR_ELT(Op, DAG);
3724   case ISD::EXTRACT_VECTOR_ELT:
3725     return lowerEXTRACT_VECTOR_ELT(Op, DAG);
3726   case ISD::BUILD_VECTOR:
3727     return lowerBUILD_VECTOR(Op, DAG);
3728   case ISD::FP_ROUND:
3729     return lowerFP_ROUND(Op, DAG);
3730   case ISD::TRAP:
3731     return lowerTRAP(Op, DAG);
3732   case ISD::DEBUGTRAP:
3733     return lowerDEBUGTRAP(Op, DAG);
3734   case ISD::FABS:
3735   case ISD::FNEG:
3736   case ISD::FCANONICALIZE:
3737     return splitUnaryVectorOp(Op, DAG);
3738   case ISD::FMINNUM:
3739   case ISD::FMAXNUM:
3740     return lowerFMINNUM_FMAXNUM(Op, DAG);
3741   case ISD::SHL:
3742   case ISD::SRA:
3743   case ISD::SRL:
3744   case ISD::ADD:
3745   case ISD::SUB:
3746   case ISD::MUL:
3747   case ISD::SMIN:
3748   case ISD::SMAX:
3749   case ISD::UMIN:
3750   case ISD::UMAX:
3751   case ISD::FADD:
3752   case ISD::FMUL:
3753   case ISD::FMINNUM_IEEE:
3754   case ISD::FMAXNUM_IEEE:
3755     return splitBinaryVectorOp(Op, DAG);
3756   }
3757   return SDValue();
3758 }
3759 
3760 static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
3761                                        const SDLoc &DL,
3762                                        SelectionDAG &DAG, bool Unpacked) {
3763   if (!LoadVT.isVector())
3764     return Result;
3765 
3766   if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
3767     // Truncate to v2i16/v4i16.
3768     EVT IntLoadVT = LoadVT.changeTypeToInteger();
3769 
    // Work around the legalizer not scalarizing the truncate after vector op
    // legalization by not creating an intermediate vector trunc.
3772     SmallVector<SDValue, 4> Elts;
3773     DAG.ExtractVectorElements(Result, Elts);
3774     for (SDValue &Elt : Elts)
3775       Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
3776 
3777     Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
3778 
3779     // Bitcast to original type (v2f16/v4f16).
3780     return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
3781   }
3782 
3783   // Cast back to the original packed type.
3784   return DAG.getNode(ISD::BITCAST, DL, LoadVT, Result);
3785 }
3786 
3787 SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
3788                                               MemSDNode *M,
3789                                               SelectionDAG &DAG,
3790                                               ArrayRef<SDValue> Ops,
3791                                               bool IsIntrinsic) const {
3792   SDLoc DL(M);
3793 
3794   bool Unpacked = Subtarget->hasUnpackedD16VMem();
3795   EVT LoadVT = M->getValueType(0);
3796 
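  // With unpacked D16 memory operations each 16-bit element is returned in
  // its own 32-bit register, so load as a vector of i32 and repack afterwards.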
3797   EVT EquivLoadVT = LoadVT;
  if (Unpacked && LoadVT.isVector()) {
    EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                                   LoadVT.getVectorNumElements());
  }
3803 
3804   // Change from v4f16/v2f16 to EquivLoadVT.
3805   SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
3806 
3807   SDValue Load
3808     = DAG.getMemIntrinsicNode(
3809       IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL,
3810       VTList, Ops, M->getMemoryVT(),
3811       M->getMemOperand());
3812   if (!Unpacked) // Just adjusted the opcode.
3813     return Load;
3814 
3815   SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
3816 
3817   return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL);
3818 }
3819 
3820 static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI,
3821                                   SDNode *N, SelectionDAG &DAG) {
3822   EVT VT = N->getValueType(0);
3823   const auto *CD = cast<ConstantSDNode>(N->getOperand(3));
3824   int CondCode = CD->getSExtValue();
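  // An out-of-range predicate in the intrinsic immediate folds to undef.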
3825   if (CondCode < ICmpInst::Predicate::FIRST_ICMP_PREDICATE ||
3826       CondCode > ICmpInst::Predicate::LAST_ICMP_PREDICATE)
3827     return DAG.getUNDEF(VT);
3828 
3829   ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
3830 
3831 
3832   SDValue LHS = N->getOperand(1);
3833   SDValue RHS = N->getOperand(2);
3834 
3835   SDLoc DL(N);
3836 
3837   EVT CmpVT = LHS.getValueType();
3838   if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
3839     unsigned PromoteOp = ICmpInst::isSigned(IcInput) ?
3840       ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3841     LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
3842     RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
3843   }
3844 
3845   ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
3846 
3847   return DAG.getNode(AMDGPUISD::SETCC, DL, VT, LHS, RHS,
3848                      DAG.getCondCode(CCOpcode));
3849 }
3850 
3851 static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI,
3852                                   SDNode *N, SelectionDAG &DAG) {
3853   EVT VT = N->getValueType(0);
3854   const auto *CD = cast<ConstantSDNode>(N->getOperand(3));
3855 
3856   int CondCode = CD->getSExtValue();
3857   if (CondCode < FCmpInst::Predicate::FIRST_FCMP_PREDICATE ||
3858       CondCode > FCmpInst::Predicate::LAST_FCMP_PREDICATE) {
3859     return DAG.getUNDEF(VT);
3860   }
3861 
3862   SDValue Src0 = N->getOperand(1);
3863   SDValue Src1 = N->getOperand(2);
3864   EVT CmpVT = Src0.getValueType();
3865   SDLoc SL(N);
3866 
3867   if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
3868     Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
3869     Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
3870   }
3871 
3872   FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
3873   ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
3874   return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src0,
3875                      Src1, DAG.getCondCode(CCOpcode));
3876 }
3877 
3878 void SITargetLowering::ReplaceNodeResults(SDNode *N,
3879                                           SmallVectorImpl<SDValue> &Results,
3880                                           SelectionDAG &DAG) const {
3881   switch (N->getOpcode()) {
3882   case ISD::INSERT_VECTOR_ELT: {
3883     if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
3884       Results.push_back(Res);
3885     return;
3886   }
3887   case ISD::EXTRACT_VECTOR_ELT: {
3888     if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
3889       Results.push_back(Res);
3890     return;
3891   }
3892   case ISD::INTRINSIC_WO_CHAIN: {
3893     unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
3894     switch (IID) {
3895     case Intrinsic::amdgcn_cvt_pkrtz: {
3896       SDValue Src0 = N->getOperand(1);
3897       SDValue Src1 = N->getOperand(2);
3898       SDLoc SL(N);
3899       SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32,
3900                                 Src0, Src1);
3901       Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
3902       return;
3903     }
3904     case Intrinsic::amdgcn_cvt_pknorm_i16:
3905     case Intrinsic::amdgcn_cvt_pknorm_u16:
3906     case Intrinsic::amdgcn_cvt_pk_i16:
3907     case Intrinsic::amdgcn_cvt_pk_u16: {
3908       SDValue Src0 = N->getOperand(1);
3909       SDValue Src1 = N->getOperand(2);
3910       SDLoc SL(N);
3911       unsigned Opcode;
3912 
3913       if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
3914         Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
3915       else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
3916         Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
3917       else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
3918         Opcode = AMDGPUISD::CVT_PK_I16_I32;
3919       else
3920         Opcode = AMDGPUISD::CVT_PK_U16_U32;
3921 
3922       EVT VT = N->getValueType(0);
3923       if (isTypeLegal(VT))
3924         Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
3925       else {
3926         SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
3927         Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
3928       }
3929       return;
3930     }
3931     }
3932     break;
3933   }
3934   case ISD::INTRINSIC_W_CHAIN: {
3935     if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
3936       Results.push_back(Res);
3937       Results.push_back(Res.getValue(1));
3938       return;
3939     }
3940 
3941     break;
3942   }
3943   case ISD::SELECT: {
3944     SDLoc SL(N);
3945     EVT VT = N->getValueType(0);
3946     EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3947     SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
3948     SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
3949 
3950     EVT SelectVT = NewVT;
3951     if (NewVT.bitsLT(MVT::i32)) {
3952       LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
3953       RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
3954       SelectVT = MVT::i32;
3955     }
3956 
3957     SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, SelectVT,
3958                                     N->getOperand(0), LHS, RHS);
3959 
3960     if (NewVT != SelectVT)
3961       NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
3962     Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
3963     return;
3964   }
3965   case ISD::FNEG: {
3966     if (N->getValueType(0) != MVT::v2f16)
3967       break;
3968 
3969     SDLoc SL(N);
3970     SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
3971 
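    // Flip the sign bit of both f16 halves with a single 32-bit XOR.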
3972     SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32,
3973                              BC,
3974                              DAG.getConstant(0x80008000, SL, MVT::i32));
3975     Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
3976     return;
3977   }
3978   case ISD::FABS: {
3979     if (N->getValueType(0) != MVT::v2f16)
3980       break;
3981 
3982     SDLoc SL(N);
3983     SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
3984 
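    // Clear the sign bit of both f16 halves with a single 32-bit AND.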
3985     SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32,
3986                              BC,
3987                              DAG.getConstant(0x7fff7fff, SL, MVT::i32));
3988     Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
3989     return;
3990   }
3991   default:
3992     break;
3993   }
3994 }
3995 
3996 /// Helper function for LowerBRCOND
3997 static SDNode *findUser(SDValue Value, unsigned Opcode) {
3998 
3999   SDNode *Parent = Value.getNode();
4000   for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
4001        I != E; ++I) {
4002 
4003     if (I.getUse().get() != Value)
4004       continue;
4005 
4006     if (I->getOpcode() == Opcode)
4007       return *I;
4008   }
4009   return nullptr;
4010 }
4011 
4012 unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
4013   if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
4014     switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
4015     case Intrinsic::amdgcn_if:
4016       return AMDGPUISD::IF;
4017     case Intrinsic::amdgcn_else:
4018       return AMDGPUISD::ELSE;
4019     case Intrinsic::amdgcn_loop:
4020       return AMDGPUISD::LOOP;
4021     case Intrinsic::amdgcn_end_cf:
4022       llvm_unreachable("should not occur");
4023     default:
4024       return 0;
4025     }
4026   }
4027 
4028   // break, if_break, else_break are all only used as inputs to loop, not
4029   // directly as branch conditions.
4030   return 0;
4031 }
4032 
4033 bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
4034   const Triple &TT = getTargetMachine().getTargetTriple();
4035   return (GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
4036           GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
4037          AMDGPU::shouldEmitConstantsToTextSection(TT);
4038 }
4039 
4040 bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
4041   // FIXME: Either avoid relying on address space here or change the default
4042   // address space for functions to avoid the explicit check.
4043   return (GV->getValueType()->isFunctionTy() ||
4044           GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
4045           GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
4046           GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
4047          !shouldEmitFixup(GV) &&
4048          !getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
4049 }
4050 
4051 bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
4052   return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
4053 }
4054 
/// This transforms the control flow intrinsics to get the branch destination
/// as the last parameter, and also switches the branch target with BR if the
/// need arises.
4057 SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
4058                                       SelectionDAG &DAG) const {
4059   SDLoc DL(BRCOND);
4060 
4061   SDNode *Intr = BRCOND.getOperand(1).getNode();
4062   SDValue Target = BRCOND.getOperand(2);
4063   SDNode *BR = nullptr;
4064   SDNode *SetCC = nullptr;
4065 
4066   if (Intr->getOpcode() == ISD::SETCC) {
4067     // As long as we negate the condition everything is fine
4068     SetCC = Intr;
4069     Intr = SetCC->getOperand(0).getNode();
4070 
4071   } else {
4072     // Get the target from BR if we don't negate the condition
4073     BR = findUser(BRCOND, ISD::BR);
4074     Target = BR->getOperand(1);
4075   }
4076 
4077   // FIXME: This changes the types of the intrinsics instead of introducing new
4078   // nodes with the correct types.
4079   // e.g. llvm.amdgcn.loop
4080 
4081   // eg: i1,ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3
4082   // =>     t9: ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3, BasicBlock:ch<bb1 0x7fee5286d088>
4083 
4084   unsigned CFNode = isCFIntrinsic(Intr);
4085   if (CFNode == 0) {
4086     // This is a uniform branch so we don't need to legalize.
4087     return BRCOND;
4088   }
4089 
4090   bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
4091                    Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
4092 
4093   assert(!SetCC ||
4094         (SetCC->getConstantOperandVal(1) == 1 &&
4095          cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
4096                                                              ISD::SETNE));
4097 
4098   // operands of the new intrinsic call
4099   SmallVector<SDValue, 4> Ops;
4100   if (HaveChain)
4101     Ops.push_back(BRCOND.getOperand(0));
4102 
4103   Ops.append(Intr->op_begin() + (HaveChain ?  2 : 1), Intr->op_end());
4104   Ops.push_back(Target);
4105 
4106   ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
4107 
4108   // build the new intrinsic call
4109   SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
4110 
4111   if (!HaveChain) {
4112     SDValue Ops[] =  {
4113       SDValue(Result, 0),
4114       BRCOND.getOperand(0)
4115     };
4116 
4117     Result = DAG.getMergeValues(Ops, DL).getNode();
4118   }
4119 
4120   if (BR) {
4121     // Give the branch instruction our target
4122     SDValue Ops[] = {
4123       BR->getOperand(0),
4124       BRCOND.getOperand(2)
4125     };
4126     SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
4127     DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
4128     BR = NewBR.getNode();
4129   }
4130 
4131   SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
4132 
4133   // Copy the intrinsic results to registers
4134   for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
4135     SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
4136     if (!CopyToReg)
4137       continue;
4138 
4139     Chain = DAG.getCopyToReg(
4140       Chain, DL,
4141       CopyToReg->getOperand(1),
4142       SDValue(Result, i - 1),
4143       SDValue());
4144 
4145     DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
4146   }
4147 
4148   // Remove the old intrinsic from the chain
4149   DAG.ReplaceAllUsesOfValueWith(
4150     SDValue(Intr, Intr->getNumValues() - 1),
4151     Intr->getOperand(0));
4152 
4153   return Chain;
4154 }
4155 
4156 SDValue SITargetLowering::getFPExtOrFPTrunc(SelectionDAG &DAG,
4157                                             SDValue Op,
4158                                             const SDLoc &DL,
4159                                             EVT VT) const {
4160   return Op.getValueType().bitsLE(VT) ?
4161       DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) :
4162       DAG.getNode(ISD::FP_ROUND, DL, VT, Op, DAG.getTargetConstant(0, DL, MVT::i32));
4163 }
4164 
4165 SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
4166   assert(Op.getValueType() == MVT::f16 &&
4167          "Do not know how to custom lower FP_ROUND for non-f16 type");
4168 
4169   SDValue Src = Op.getOperand(0);
4170   EVT SrcVT = Src.getValueType();
4171   if (SrcVT != MVT::f64)
4172     return Op;
4173 
4174   SDLoc DL(Op);
4175 
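  // Lower f64 -> f16 by first converting to the f16 bit pattern held in an
  // i32 (FP_TO_FP16), then truncating to i16 and bitcasting to f16.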
4176   SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
4177   SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
4178   return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
4179 }
4180 
4181 SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
4182                                                SelectionDAG &DAG) const {
4183   EVT VT = Op.getValueType();
4184   const MachineFunction &MF = DAG.getMachineFunction();
4185   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4186   bool IsIEEEMode = Info->getMode().IEEE;
4187 
4188   // FIXME: Assert during selection that this is only selected for
4189   // ieee_mode. Currently a combine can produce the ieee version for non-ieee
4190   // mode functions, but this happens to be OK since it's only done in cases
4191   // where it is known there are no sNaNs.
4192   if (IsIEEEMode)
4193     return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
4194 
4195   if (VT == MVT::v4f16)
4196     return splitBinaryVectorOp(Op, DAG);
4197   return Op;
4198 }
4199 
4200 SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
4201   SDLoc SL(Op);
4202   SDValue Chain = Op.getOperand(0);
4203 
4204   if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4205       !Subtarget->isTrapHandlerEnabled())
4206     return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain);
4207 
4208   MachineFunction &MF = DAG.getMachineFunction();
4209   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4210   unsigned UserSGPR = Info->getQueuePtrUserSGPR();
4211   assert(UserSGPR != AMDGPU::NoRegister);
4212   SDValue QueuePtr = CreateLiveInRegister(
4213     DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
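  // The HSA trap handler ABI takes the queue pointer in SGPR0/1, so copy it
  // there and glue the copy to the trap node.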
4214   SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
4215   SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01,
4216                                    QueuePtr, SDValue());
4217   SDValue Ops[] = {
4218     ToReg,
4219     DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMTrap, SL, MVT::i16),
4220     SGPR01,
4221     ToReg.getValue(1)
4222   };
4223   return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
4224 }
4225 
4226 SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
4227   SDLoc SL(Op);
4228   SDValue Chain = Op.getOperand(0);
4229   MachineFunction &MF = DAG.getMachineFunction();
4230 
4231   if (Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
4232       !Subtarget->isTrapHandlerEnabled()) {
4233     DiagnosticInfoUnsupported NoTrap(MF.getFunction(),
4234                                      "debugtrap handler not supported",
4235                                      Op.getDebugLoc(),
4236                                      DS_Warning);
4237     LLVMContext &Ctx = MF.getFunction().getContext();
4238     Ctx.diagnose(NoTrap);
4239     return Chain;
4240   }
4241 
4242   SDValue Ops[] = {
4243     Chain,
4244     DAG.getTargetConstant(GCNSubtarget::TrapIDLLVMDebugTrap, SL, MVT::i16)
4245   };
4246   return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
4247 }
4248 
4249 SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
4250                                              SelectionDAG &DAG) const {
4251   // FIXME: Use inline constants (src_{shared, private}_base) instead.
4252   if (Subtarget->hasApertureRegs()) {
4253     unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
4254         AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
4255         AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
4256     unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
4257         AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
4258         AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
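    // Encode the hwreg id, bit offset and width-1 into the immediate operand
    // expected by s_getreg_b32; the extracted field is then shifted back into
    // position to form the 32-bit aperture value.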
4259     unsigned Encoding =
4260         AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
4261         Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
4262         WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
4263 
4264     SDValue EncodingImm = DAG.getTargetConstant(Encoding, DL, MVT::i16);
4265     SDValue ApertureReg = SDValue(
4266         DAG.getMachineNode(AMDGPU::S_GETREG_B32, DL, MVT::i32, EncodingImm), 0);
4267     SDValue ShiftAmount = DAG.getTargetConstant(WidthM1 + 1, DL, MVT::i32);
4268     return DAG.getNode(ISD::SHL, DL, MVT::i32, ApertureReg, ShiftAmount);
4269   }
4270 
4271   MachineFunction &MF = DAG.getMachineFunction();
4272   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
4273   unsigned UserSGPR = Info->getQueuePtrUserSGPR();
4274   assert(UserSGPR != AMDGPU::NoRegister);
4275 
4276   SDValue QueuePtr = CreateLiveInRegister(
4277     DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
4278 
4279   // Offset into amd_queue_t for group_segment_aperture_base_hi /
4280   // private_segment_aperture_base_hi.
4281   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
4282 
4283   SDValue Ptr = DAG.getObjectPtrOffset(DL, QueuePtr, StructOffset);
4284 
4285   // TODO: Use custom target PseudoSourceValue.
4286   // TODO: We should use the value from the IR intrinsic call, but it might not
4287   // be available, and it is unclear how to get it here.
4288   Value *V = UndefValue::get(PointerType::get(Type::getInt8Ty(*DAG.getContext()),
4289                                               AMDGPUAS::CONSTANT_ADDRESS));
4290 
4291   MachinePointerInfo PtrInfo(V, StructOffset);
4292   return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
4293                      MinAlign(64, StructOffset),
4294                      MachineMemOperand::MODereferenceable |
4295                          MachineMemOperand::MOInvariant);
4296 }
4297 
4298 SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
4299                                              SelectionDAG &DAG) const {
4300   SDLoc SL(Op);
4301   const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op);
4302 
4303   SDValue Src = ASC->getOperand(0);
4304   SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
4305 
4306   const AMDGPUTargetMachine &TM =
4307     static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
4308 
4309   // flat -> local/private
4310   if (ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
4311     unsigned DestAS = ASC->getDestAddressSpace();
4312 
4313     if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
4314         DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
4315       unsigned NullVal = TM.getNullPointerValue(DestAS);
4316       SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
4317       SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
4318       SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
4319 
4320       return DAG.getNode(ISD::SELECT, SL, MVT::i32,
4321                          NonNull, Ptr, SegmentNullPtr);
4322     }
4323   }
4324 
4325   // local/private -> flat
4326   if (ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
4327     unsigned SrcAS = ASC->getSrcAddressSpace();
4328 
4329     if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
4330         SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
4331       unsigned NullVal = TM.getNullPointerValue(SrcAS);
4332       SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
4333 
4334       SDValue NonNull
4335         = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
4336 
4337       SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), SL, DAG);
4338       SDValue CvtPtr
4339         = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
4340 
4341       return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull,
4342                          DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr),
4343                          FlatNullPtr);
4344     }
4345   }
4346 
4347   // global <-> flat are no-ops and never emitted.
4348 
4349   const MachineFunction &MF = DAG.getMachineFunction();
4350   DiagnosticInfoUnsupported InvalidAddrSpaceCast(
4351     MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
4352   DAG.getContext()->diagnose(InvalidAddrSpaceCast);
4353 
4354   return DAG.getUNDEF(ASC->getValueType(0));
4355 }
4356 
4357 SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
4358                                                  SelectionDAG &DAG) const {
4359   SDValue Vec = Op.getOperand(0);
4360   SDValue InsVal = Op.getOperand(1);
4361   SDValue Idx = Op.getOperand(2);
4362   EVT VecVT = Vec.getValueType();
4363   EVT EltVT = VecVT.getVectorElementType();
4364   unsigned VecSize = VecVT.getSizeInBits();
4365   unsigned EltSize = EltVT.getSizeInBits();
4366 
4368   assert(VecSize <= 64);
4369 
4370   unsigned NumElts = VecVT.getVectorNumElements();
4371   SDLoc SL(Op);
4372   auto KIdx = dyn_cast<ConstantSDNode>(Idx);
4373 
4374   if (NumElts == 4 && EltSize == 16 && KIdx) {
4375     SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
4376 
4377     SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
4378                                  DAG.getConstant(0, SL, MVT::i32));
4379     SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
4380                                  DAG.getConstant(1, SL, MVT::i32));
4381 
4382     SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
4383     SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
4384 
4385     unsigned Idx = KIdx->getZExtValue();
4386     bool InsertLo = Idx < 2;
4387     SDValue InsHalf = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16,
4388       InsertLo ? LoVec : HiVec,
4389       DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
4390       DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
4391 
4392     InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
4393 
4394     SDValue Concat = InsertLo ?
4395       DAG.getBuildVector(MVT::v2i32, SL, { InsHalf, HiHalf }) :
4396       DAG.getBuildVector(MVT::v2i32, SL, { LoHalf, InsHalf });
4397 
4398     return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
4399   }
4400 
4401   if (isa<ConstantSDNode>(Idx))
4402     return SDValue();
4403 
4404   MVT IntVT = MVT::getIntegerVT(VecSize);
4405 
4406   // Avoid stack access for dynamic indexing.
4407   // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
4408 
4409   // Create a congruent vector with the target value in each element so that
4410   // the required element can be masked and ORed into the target vector.
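  // For example, inserting a 16-bit element at dynamic index Idx:
  //   Mask   = 0xffff << (Idx * 16)
  //   Result = (splat(InsVal) & Mask) | (Vec & ~Mask)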
4411   SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
4412                                DAG.getSplatBuildVector(VecVT, SL, InsVal));
4413 
4414   assert(isPowerOf2_32(EltSize));
4415   SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
4416 
4417   // Convert vector index to bit-index.
4418   SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
4419 
4420   SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
4421   SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
4422                             DAG.getConstant(0xffff, SL, IntVT),
4423                             ScaledIdx);
4424 
4425   SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
4426   SDValue RHS = DAG.getNode(ISD::AND, SL, IntVT,
4427                             DAG.getNOT(SL, BFM, IntVT), BCVec);
4428 
4429   SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS);
4430   return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
4431 }
4432 
4433 SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
4434                                                   SelectionDAG &DAG) const {
4435   SDLoc SL(Op);
4436 
4437   EVT ResultVT = Op.getValueType();
4438   SDValue Vec = Op.getOperand(0);
4439   SDValue Idx = Op.getOperand(1);
4440   EVT VecVT = Vec.getValueType();
4441   unsigned VecSize = VecVT.getSizeInBits();
4442   EVT EltVT = VecVT.getVectorElementType();
4443   assert(VecSize <= 64);
4444 
4445   DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
4446 
4447   // Make sure we do any optimizations that will make it easier to fold
4448   // source modifiers before obscuring it with bit operations.
4449 
4450   // XXX - Why doesn't this get called when vector_shuffle is expanded?
4451   if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
4452     return Combined;
4453 
4454   unsigned EltSize = EltVT.getSizeInBits();
4455   assert(isPowerOf2_32(EltSize));
4456 
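  // Lower the extract as a shift: bitcast the vector to a scalar integer,
  // shift the selected element down to bit 0, and truncate.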
4457   MVT IntVT = MVT::getIntegerVT(VecSize);
4458   SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
4459 
4460   // Convert vector index to bit-index (* EltSize)
4461   SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
4462 
4463   SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
4464   SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
4465 
4466   if (ResultVT == MVT::f16) {
4467     SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
4468     return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
4469   }
4470 
4471   return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
4472 }
4473 
4474 SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
4475                                             SelectionDAG &DAG) const {
4476   SDLoc SL(Op);
4477   EVT VT = Op.getValueType();
4478 
4479   if (VT == MVT::v4i16 || VT == MVT::v4f16) {
4480     EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
4481 
4482     // Turn into pair of packed build_vectors.
4483     // TODO: Special case for constants that can be materialized with s_mov_b64.
4484     SDValue Lo = DAG.getBuildVector(HalfVT, SL,
4485                                     { Op.getOperand(0), Op.getOperand(1) });
4486     SDValue Hi = DAG.getBuildVector(HalfVT, SL,
4487                                     { Op.getOperand(2), Op.getOperand(3) });
4488 
4489     SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Lo);
4490     SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Hi);
4491 
4492     SDValue Blend = DAG.getBuildVector(MVT::v2i32, SL, { CastLo, CastHi });
4493     return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
4494   }
4495 
4496   assert(VT == MVT::v2f16 || VT == MVT::v2i16);
4497   assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
4498 
4499   SDValue Lo = Op.getOperand(0);
4500   SDValue Hi = Op.getOperand(1);
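  // Without packed (VOP3P) instructions, build the v2i16/v2f16 result by
  // packing the two halves into an i32: (zext(Hi) << 16) | zext(Lo).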
4501 
4502   // Avoid adding defined bits with the zero_extend.
4503   if (Hi.isUndef()) {
4504     Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
4505     SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
4506     return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
4507   }
4508 
4509   Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
4510   Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
4511 
4512   SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
4513                               DAG.getConstant(16, SL, MVT::i32));
4514   if (Lo.isUndef())
4515     return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
4516 
4517   Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
4518   Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
4519 
4520   SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi);
4521   return DAG.getNode(ISD::BITCAST, SL, VT, Or);
4522 }
4523 
4524 bool
4525 SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
4526   // We can fold offsets for anything that doesn't require a GOT relocation.
4527   return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
4528           GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
4529           GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
4530          !shouldEmitGOTReloc(GA->getGlobal());
4531 }
4532 
4533 static SDValue
4534 buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
4535                         const SDLoc &DL, unsigned Offset, EVT PtrVT,
4536                         unsigned GAFlags = SIInstrInfo::MO_NONE) {
4537   // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
4538   // lowered to the following code sequence:
4539   //
4540   // For constant address space:
4541   //   s_getpc_b64 s[0:1]
4542   //   s_add_u32 s0, s0, $symbol
4543   //   s_addc_u32 s1, s1, 0
4544   //
4545   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
4546   //   a fixup or relocation is emitted to replace $symbol with a literal
4547   //   constant, which is a pc-relative offset from the encoding of the $symbol
4548   //   operand to the global variable.
4549   //
4550   // For global address space:
4551   //   s_getpc_b64 s[0:1]
4552   //   s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
4553   //   s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
4554   //
4555   //   s_getpc_b64 returns the address of the s_add_u32 instruction and then
4556   //   fixups or relocations are emitted to replace $symbol@*@lo and
4557   //   $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
4558   //   which is a 64-bit pc-relative offset from the encoding of the $symbol
4559   //   operand to the global variable.
4560   //
4561   // What we want here is an offset from the value returned by s_getpc
4562   // (which is the address of the s_add_u32 instruction) to the global
4563   // variable, but since the encoding of $symbol starts 4 bytes after the start
4564   // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
4565   // small. This requires us to add 4 to the global variable offset in order to
4566   // compute the correct address.
4567   SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
4568                                              GAFlags);
4569   SDValue PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4,
4570                                              GAFlags == SIInstrInfo::MO_NONE ?
4571                                              GAFlags : GAFlags + 1);
4572   return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
4573 }
4574 
4575 SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
4576                                              SDValue Op,
4577                                              SelectionDAG &DAG) const {
4578   GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
4579   const GlobalValue *GV = GSD->getGlobal();
4580   if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
4581       GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
4582       GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
4583     return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
4584 
4585   SDLoc DL(GSD);
4586   EVT PtrVT = Op.getValueType();
4587 
4588   // FIXME: Should not make address space based decisions here.
4589   if (shouldEmitFixup(GV))
4590     return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
4591   else if (shouldEmitPCReloc(GV))
4592     return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
4593                                    SIInstrInfo::MO_REL32);
4594 
4595   SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
4596                                             SIInstrInfo::MO_GOTPCREL32);
4597 
4598   Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
4599   PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
4600   const DataLayout &DataLayout = DAG.getDataLayout();
4601   unsigned Align = DataLayout.getABITypeAlignment(PtrTy);
4602   MachinePointerInfo PtrInfo
4603     = MachinePointerInfo::getGOT(DAG.getMachineFunction());
4604 
4605   return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Align,
4606                      MachineMemOperand::MODereferenceable |
4607                          MachineMemOperand::MOInvariant);
4608 }
4609 
4610 SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
4611                                    const SDLoc &DL, SDValue V) const {
4612   // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
4613   // the destination register.
4614   //
4615   // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
4616   // so we will end up with redundant moves to m0.
4617   //
4618   // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
4619 
4620   // A Null SDValue creates a glue result.
4621   SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
4622                                   V, Chain);
4623   return SDValue(M0, 0);
4624 }
4625 
4626 SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
4627                                                  SDValue Op,
4628                                                  MVT VT,
4629                                                  unsigned Offset) const {
4630   SDLoc SL(Op);
4631   SDValue Param = lowerKernargMemParameter(DAG, MVT::i32, MVT::i32, SL,
4632                                            DAG.getEntryNode(), Offset, 4, false);
4633   // The local size values will have the hi 16-bits as zero.
4634   return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
4635                      DAG.getValueType(VT));
4636 }
4637 
4638 static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
4639                                         EVT VT) {
4640   DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
4641                                       "non-hsa intrinsic with hsa target",
4642                                       DL.getDebugLoc());
4643   DAG.getContext()->diagnose(BadIntrin);
4644   return DAG.getUNDEF(VT);
4645 }
4646 
4647 static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
4648                                          EVT VT) {
4649   DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
4650                                       "intrinsic not supported on subtarget",
4651                                       DL.getDebugLoc());
4652   DAG.getContext()->diagnose(BadIntrin);
4653   return DAG.getUNDEF(VT);
4654 }
4655 
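// Build the smallest f32 vector (f32, v2f32, v4f32, v8f32 or v16f32) that can
// hold all of Elts, bitcasting each element to f32 and padding the tail with
// undef. Used below to build the vaddr operand for MIMG instructions.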
4656 static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
4657                                     ArrayRef<SDValue> Elts) {
4658   assert(!Elts.empty());
4659   MVT Type;
4660   unsigned NumElts;
4661 
4662   if (Elts.size() == 1) {
4663     Type = MVT::f32;
4664     NumElts = 1;
4665   } else if (Elts.size() == 2) {
4666     Type = MVT::v2f32;
4667     NumElts = 2;
4668   } else if (Elts.size() <= 4) {
4669     Type = MVT::v4f32;
4670     NumElts = 4;
4671   } else if (Elts.size() <= 8) {
4672     Type = MVT::v8f32;
4673     NumElts = 8;
4674   } else {
4675     assert(Elts.size() <= 16);
4676     Type = MVT::v16f32;
4677     NumElts = 16;
4678   }
4679 
4680   SmallVector<SDValue, 16> VecElts(NumElts);
4681   for (unsigned i = 0; i < Elts.size(); ++i) {
4682     SDValue Elt = Elts[i];
4683     if (Elt.getValueType() != MVT::f32)
4684       Elt = DAG.getBitcast(MVT::f32, Elt);
4685     VecElts[i] = Elt;
4686   }
4687   for (unsigned i = Elts.size(); i < NumElts; ++i)
4688     VecElts[i] = DAG.getUNDEF(MVT::f32);
4689 
4690   if (NumElts == 1)
4691     return VecElts[0];
4692   return DAG.getBuildVector(Type, DL, VecElts);
4693 }
4694 
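// Split the immediate cachepolicy operand into its GLC/SLC/DLC bits. Returns
// true if every set bit was consumed, false if unknown bits remain.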
4695 static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG,
4696                              SDValue *GLC, SDValue *SLC, SDValue *DLC) {
4697   auto CachePolicyConst = cast<ConstantSDNode>(CachePolicy.getNode());
4698 
4699   uint64_t Value = CachePolicyConst->getZExtValue();
4700   SDLoc DL(CachePolicy);
4701   if (GLC) {
4702     *GLC = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
4703     Value &= ~(uint64_t)0x1;
4704   }
4705   if (SLC) {
4706     *SLC = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
4707     Value &= ~(uint64_t)0x2;
4708   }
4709   if (DLC) {
4710     *DLC = DAG.getTargetConstant((Value & 0x4) ? 1 : 0, DL, MVT::i32);
4711     Value &= ~(uint64_t)0x4;
4712   }
4713 
4714   return Value == 0;
4715 }
4716 
4717 // Re-construct the required return value for an image load intrinsic.
4718 // This is more complicated due to the optional use of TexFailCtrl, which means
4719 // the required return type is an aggregate.
4720 static SDValue constructRetValue(SelectionDAG &DAG,
4721                                  MachineSDNode *Result,
4722                                  ArrayRef<EVT> ResultTypes,
4723                                  bool IsTexFail, bool Unpacked, bool IsD16,
4724                                  int DMaskPop, int NumVDataDwords,
4725                                  const SDLoc &DL, LLVMContext &Context) {
4726   // Determine the required return type. This is the same regardless of the IsTexFail flag.
4727   EVT ReqRetVT = ResultTypes[0];
4728   EVT ReqRetEltVT = ReqRetVT.isVector() ? ReqRetVT.getVectorElementType() : ReqRetVT;
4729   int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
4730   EVT AdjEltVT = Unpacked && IsD16 ? MVT::i32 : ReqRetEltVT;
4731   EVT AdjVT = Unpacked ? ReqRetNumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, ReqRetNumElts)
4732                                            : AdjEltVT
4733                        : ReqRetVT;
4734 
4735   // Extract data part of the result
4736   // Bitcast the result to the same type as the required return type
4737   int NumElts;
4738   if (IsD16 && !Unpacked)
4739     NumElts = NumVDataDwords << 1;
4740   else
4741     NumElts = NumVDataDwords;
4742 
4743   EVT CastVT = NumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, NumElts)
4744                            : AdjEltVT;
4745 
4746   // Special case for v6f16. Rather than add support for this, use v3i32 to
4747   // extract the data elements
4748   bool V6F16Special = false;
4749   if (NumElts == 6) {
4750     CastVT = EVT::getVectorVT(Context, MVT::i32, NumElts / 2);
4751     DMaskPop >>= 1;
4752     ReqRetNumElts >>= 1;
4753     V6F16Special = true;
4754     AdjVT = MVT::v2i32;
4755   }
4756 
4757   SDValue N = SDValue(Result, 0);
4758   SDValue CastRes = DAG.getNode(ISD::BITCAST, DL, CastVT, N);
4759 
4760   // Iterate over the result
4761   SmallVector<SDValue, 4> BVElts;
4762 
4763   if (CastVT.isVector()) {
4764     DAG.ExtractVectorElements(CastRes, BVElts, 0, DMaskPop);
4765   } else {
4766     BVElts.push_back(CastRes);
4767   }
4768   int ExtraElts = ReqRetNumElts - DMaskPop;
4769   while(ExtraElts--)
4770     BVElts.push_back(DAG.getUNDEF(AdjEltVT));
4771 
4772   SDValue PreTFCRes;
4773   if (ReqRetNumElts > 1) {
4774     SDValue NewVec = DAG.getBuildVector(AdjVT, DL, BVElts);
4775     if (IsD16 && Unpacked)
4776       PreTFCRes = adjustLoadValueTypeImpl(NewVec, ReqRetVT, DL, DAG, Unpacked);
4777     else
4778       PreTFCRes = NewVec;
4779   } else {
4780     PreTFCRes = BVElts[0];
4781   }
4782 
4783   if (V6F16Special)
4784     PreTFCRes = DAG.getNode(ISD::BITCAST, DL, MVT::v4f16, PreTFCRes);
4785 
4786   if (!IsTexFail) {
4787     if (Result->getNumValues() > 1)
4788       return DAG.getMergeValues({PreTFCRes, SDValue(Result, 1)}, DL);
4789     else
4790       return PreTFCRes;
4791   }
4792 
4793   // Extract the TexFail result and insert into aggregate return
4794   SmallVector<SDValue, 1> TFCElt;
4795   DAG.ExtractVectorElements(N, TFCElt, DMaskPop, 1);
4796   SDValue TFCRes = DAG.getNode(ISD::BITCAST, DL, ResultTypes[1], TFCElt[0]);
4797   return DAG.getMergeValues({PreTFCRes, TFCRes, SDValue(Result, 1)}, DL);
4798 }
4799 
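// Split the immediate texfailctrl operand into its TFE/LWE bits, setting
// IsTexFail if any bit is set. Returns true if every set bit was consumed.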
4800 static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
4801                          SDValue *LWE, bool &IsTexFail) {
4802   auto TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
4803 
4804   uint64_t Value = TexFailCtrlConst->getZExtValue();
4805   if (Value) {
4806     IsTexFail = true;
4807   }
4808 
4809   SDLoc DL(TexFailCtrlConst);
4810   *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
4811   Value &= ~(uint64_t)0x1;
4812   *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
4813   Value &= ~(uint64_t)0x2;
4814 
4815   return Value == 0;
4816 }
4817 
4818 SDValue SITargetLowering::lowerImage(SDValue Op,
4819                                      const AMDGPU::ImageDimIntrinsicInfo *Intr,
4820                                      SelectionDAG &DAG) const {
4821   SDLoc DL(Op);
4822   MachineFunction &MF = DAG.getMachineFunction();
4823   const GCNSubtarget* ST = &MF.getSubtarget<GCNSubtarget>();
4824   const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
4825       AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
4826   const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
4827   const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
4828       AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
4829   unsigned IntrOpcode = Intr->BaseOpcode;
4830   bool IsGFX10 = Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10;
4831 
4832   SmallVector<EVT, 3> ResultTypes(Op->value_begin(), Op->value_end());
4833   SmallVector<EVT, 3> OrigResultTypes(Op->value_begin(), Op->value_end());
4834   bool IsD16 = false;
4835   bool IsA16 = false;
4836   SDValue VData;
4837   int NumVDataDwords;
4838   bool AdjustRetType = false;
4839 
4840   unsigned AddrIdx; // Index of first address argument
4841   unsigned DMask;
4842   unsigned DMaskLanes = 0;
4843 
4844   if (BaseOpcode->Atomic) {
4845     VData = Op.getOperand(2);
4846 
4847     bool Is64Bit = VData.getValueType() == MVT::i64;
4848     if (BaseOpcode->AtomicX2) {
4849       SDValue VData2 = Op.getOperand(3);
4850       VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
4851                                  {VData, VData2});
4852       if (Is64Bit)
4853         VData = DAG.getBitcast(MVT::v4i32, VData);
4854 
4855       ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
4856       DMask = Is64Bit ? 0xf : 0x3;
4857       NumVDataDwords = Is64Bit ? 4 : 2;
4858       AddrIdx = 4;
4859     } else {
4860       DMask = Is64Bit ? 0x3 : 0x1;
4861       NumVDataDwords = Is64Bit ? 2 : 1;
4862       AddrIdx = 3;
4863     }
4864   } else {
4865     unsigned DMaskIdx = BaseOpcode->Store ? 3 : isa<MemSDNode>(Op) ? 2 : 1;
4866     auto DMaskConst = cast<ConstantSDNode>(Op.getOperand(DMaskIdx));
4867     DMask = DMaskConst->getZExtValue();
4868     DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);
4869 
4870     if (BaseOpcode->Store) {
4871       VData = Op.getOperand(2);
4872 
4873       MVT StoreVT = VData.getSimpleValueType();
4874       if (StoreVT.getScalarType() == MVT::f16) {
4875         if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS ||
4876             !BaseOpcode->HasD16)
4877           return Op; // D16 is unsupported for this instruction
4878 
4879         IsD16 = true;
4880         VData = handleD16VData(VData, DAG);
4881       }
4882 
4883       NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
4884     } else {
4885       // Work out the number of dwords based on the dmask popcount and underlying type
4886       // and whether packing is supported.
4887       MVT LoadVT = ResultTypes[0].getSimpleVT();
4888       if (LoadVT.getScalarType() == MVT::f16) {
4889         if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS ||
4890             !BaseOpcode->HasD16)
4891           return Op; // D16 is unsupported for this instruction
4892 
4893         IsD16 = true;
4894       }
4895 
4896       // Confirm that the return type is large enough for the dmask specified
4897       if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
4898           (!LoadVT.isVector() && DMaskLanes > 1))
4899           return Op;
4900 
4901       if (IsD16 && !Subtarget->hasUnpackedD16VMem())
4902         NumVDataDwords = (DMaskLanes + 1) / 2;
4903       else
4904         NumVDataDwords = DMaskLanes;
4905 
4906       AdjustRetType = true;
4907     }
4908 
4909     AddrIdx = DMaskIdx + 1;
4910   }
4911 
4912   unsigned NumGradients = BaseOpcode->Gradients ? DimInfo->NumGradients : 0;
4913   unsigned NumCoords = BaseOpcode->Coordinates ? DimInfo->NumCoords : 0;
4914   unsigned NumLCM = BaseOpcode->LodOrClampOrMip ? 1 : 0;
4915   unsigned NumVAddrs = BaseOpcode->NumExtraArgs + NumGradients +
4916                        NumCoords + NumLCM;
4917   unsigned NumMIVAddrs = NumVAddrs;
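  // NumMIVAddrs is the number of address operands the selected machine
  // instruction will take; it may shrink below if a zero LOD lets us use the
  // _LZ form.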
4918 
4919   SmallVector<SDValue, 4> VAddrs;
4920 
4921   // Optimize _L to _LZ when the LOD argument is zero (or negative).
4922   if (LZMappingInfo) {
4923     if (auto ConstantLod =
4924          dyn_cast<ConstantFPSDNode>(Op.getOperand(AddrIdx+NumVAddrs-1))) {
4925       if (ConstantLod->isZero() || ConstantLod->isNegative()) {
4926         IntrOpcode = LZMappingInfo->LZ;  // set new opcode to _lz variant of _l
4927         NumMIVAddrs--;               // remove 'lod'
4928       }
4929     }
4930   }
4931 
4932   // Check for 16-bit addresses and pack them into 32-bit pairs if present.
4933   unsigned DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
4934   MVT VAddrVT = Op.getOperand(DimIdx).getSimpleValueType();
4935   const MVT VAddrScalarVT = VAddrVT.getScalarType();
4936   if (((VAddrScalarVT == MVT::f16) || (VAddrScalarVT == MVT::i16)) &&
4937       ST->hasFeature(AMDGPU::FeatureR128A16)) {
4938     IsA16 = true;
4939     const MVT VectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
4940     for (unsigned i = AddrIdx; i < (AddrIdx + NumMIVAddrs); ++i) {
4941       SDValue AddrLo, AddrHi;
4942       // Push back extra arguments.
4943       if (i < DimIdx) {
4944         AddrLo = Op.getOperand(i);
4945       } else {
4946         AddrLo = Op.getOperand(i);
4947         // Dz/dh, dz/dv and the last odd coord are packed with undef. Also,
4948         // in 1D, derivatives dx/dh and dx/dv are packed with undef.
4949         if (((i + 1) >= (AddrIdx + NumMIVAddrs)) ||
4950             ((NumGradients / 2) % 2 == 1 &&
4951             (i == DimIdx + (NumGradients / 2) - 1 ||
4952              i == DimIdx + NumGradients - 1))) {
4953           AddrHi = DAG.getUNDEF(MVT::f16);
4954         } else {
4955           AddrHi = Op.getOperand(i + 1);
4956           i++;
4957         }
4958         AddrLo = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VectorVT,
4959                              {AddrLo, AddrHi});
4960         AddrLo = DAG.getBitcast(MVT::i32, AddrLo);
4961       }
4962       VAddrs.push_back(AddrLo);
4963     }
4964   } else {
4965     for (unsigned i = 0; i < NumMIVAddrs; ++i)
4966       VAddrs.push_back(Op.getOperand(AddrIdx + i));
4967   }
4968 
4969   // If the register allocator cannot place the address registers contiguously
4970   // without introducing moves, then using the non-sequential address encoding
4971   // is always preferable, since it saves VALU instructions and is usually a
4972   // wash in terms of code size or even better.
4973   //
4974   // However, we currently have no way of hinting to the register allocator that
4975   // MIMG addresses should be placed contiguously when it is possible to do so,
4976   // so force non-NSA for the common 2-address case as a heuristic.
4977   //
4978   // SIShrinkInstructions will convert NSA encodings to non-NSA after register
4979   // allocation when possible.
4980   bool UseNSA =
4981       ST->hasFeature(AMDGPU::FeatureNSAEncoding) && VAddrs.size() >= 3;
4982   SDValue VAddr;
4983   if (!UseNSA)
4984     VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
4985 
4986   SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
4987   SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
4988   unsigned CtrlIdx; // Index of texfailctrl argument
4989   SDValue Unorm;
4990   if (!BaseOpcode->Sampler) {
4991     Unorm = True;
4992     CtrlIdx = AddrIdx + NumVAddrs + 1;
4993   } else {
4994     auto UnormConst =
4995         cast<ConstantSDNode>(Op.getOperand(AddrIdx + NumVAddrs + 2));
4996 
4997     Unorm = UnormConst->getZExtValue() ? True : False;
4998     CtrlIdx = AddrIdx + NumVAddrs + 3;
4999   }
5000 
5001   SDValue TFE;
5002   SDValue LWE;
5003   SDValue TexFail = Op.getOperand(CtrlIdx);
5004   bool IsTexFail = false;
5005   if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
5006     return Op;
5007 
5008   if (IsTexFail) {
5009     if (!DMaskLanes) {
5010       // Expecting to get an error flag since TFC is on and dmask is 0.
5011       // Force dmask to be at least 1, otherwise the instruction will fail.
5012       DMask = 0x1;
5013       DMaskLanes = 1;
5014       NumVDataDwords = 1;
5015     }
5016     NumVDataDwords += 1;
5017     AdjustRetType = true;
5018   }
5019 
5020   // Something earlier has tagged that the return type needs adjusting.
5021   // This happens if the instruction is a load or has set TexFailCtrl flags.
5022   if (AdjustRetType) {
5023     // NumVDataDwords reflects the true number of dwords required in the return type
5024     if (DMaskLanes == 0 && !BaseOpcode->Store) {
5025       // This is a no-op load. This can be eliminated
5026       SDValue Undef = DAG.getUNDEF(Op.getValueType());
5027       if (isa<MemSDNode>(Op))
5028         return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
5029       return Undef;
5030     }
5031 
5032     EVT NewVT = NumVDataDwords > 1 ?
5033                   EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumVDataDwords)
5034                 : MVT::f32;
5035 
5036     ResultTypes[0] = NewVT;
5037     if (ResultTypes.size() == 3) {
5038       // Original result was aggregate type used for TexFailCtrl results
5039       // The actual instruction returns as a vector type which has now been
5040       // created. Remove the aggregate result.
5041       ResultTypes.erase(&ResultTypes[1]);
5042     }
5043   }
5044 
5045   SDValue GLC;
5046   SDValue SLC;
5047   SDValue DLC;
5048   if (BaseOpcode->Atomic) {
5049     GLC = True; // TODO no-return optimization
5050     if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, nullptr, &SLC,
5051                           IsGFX10 ? &DLC : nullptr))
5052       return Op;
5053   } else {
5054     if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, &GLC, &SLC,
5055                           IsGFX10 ? &DLC : nullptr))
5056       return Op;
5057   }
5058 
5059   SmallVector<SDValue, 26> Ops;
5060   if (BaseOpcode->Store || BaseOpcode->Atomic)
5061     Ops.push_back(VData); // vdata
5062   if (UseNSA) {
5063     for (const SDValue &Addr : VAddrs)
5064       Ops.push_back(Addr);
5065   } else {
5066     Ops.push_back(VAddr);
5067   }
5068   Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs)); // rsrc
5069   if (BaseOpcode->Sampler)
5070     Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs + 1)); // sampler
5071   Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
5072   if (IsGFX10)
5073     Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
5074   Ops.push_back(Unorm);
5075   if (IsGFX10)
5076     Ops.push_back(DLC);
5077   Ops.push_back(GLC);
5078   Ops.push_back(SLC);
5079   Ops.push_back(IsA16 &&  // a16 or r128
5080                 ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
5081   Ops.push_back(TFE); // tfe
5082   Ops.push_back(LWE); // lwe
5083   if (!IsGFX10)
5084     Ops.push_back(DimInfo->DA ? True : False);
5085   if (BaseOpcode->HasD16)
5086     Ops.push_back(IsD16 ? True : False);
5087   if (isa<MemSDNode>(Op))
5088     Ops.push_back(Op.getOperand(0)); // chain
5089 
5090   int NumVAddrDwords =
5091       UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
5092   int Opcode = -1;
5093 
5094   if (IsGFX10) {
5095     Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
5096                                    UseNSA ? AMDGPU::MIMGEncGfx10NSA
5097                                           : AMDGPU::MIMGEncGfx10Default,
5098                                    NumVDataDwords, NumVAddrDwords);
5099   } else {
5100     if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
5101       Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
5102                                      NumVDataDwords, NumVAddrDwords);
5103     if (Opcode == -1)
5104       Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
5105                                      NumVDataDwords, NumVAddrDwords);
5106   }
5107   assert(Opcode != -1);
5108 
5109   MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
5110   if (auto MemOp = dyn_cast<MemSDNode>(Op)) {
5111     MachineMemOperand *MemRef = MemOp->getMemOperand();
5112     DAG.setNodeMemRefs(NewNode, {MemRef});
5113   }
5114 
5115   if (BaseOpcode->AtomicX2) {
5116     SmallVector<SDValue, 1> Elt;
5117     DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
5118     return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
5119   } else if (!BaseOpcode->Store) {
5120     return constructRetValue(DAG, NewNode,
5121                              OrigResultTypes, IsTexFail,
5122                              Subtarget->hasUnpackedD16VMem(), IsD16,
5123                              DMaskLanes, NumVDataDwords, DL,
5124                              *DAG.getContext());
5125   }
5126 
5127   return SDValue(NewNode, 0);
5128 }
5129 
5130 SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
5131                                        SDValue Offset, SDValue GLC,
5132                                        SelectionDAG &DAG) const {
5133   MachineFunction &MF = DAG.getMachineFunction();
5134   MachineMemOperand *MMO = MF.getMachineMemOperand(
5135       MachinePointerInfo(),
5136       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
5137           MachineMemOperand::MOInvariant,
5138       VT.getStoreSize(), VT.getStoreSize());
5139 
5140   if (!Offset->isDivergent()) {
5141     SDValue Ops[] = {
5142         Rsrc,
5143         Offset, // Offset
5144         GLC     // glc
5145     };
5146     return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
5147                                    DAG.getVTList(VT), Ops, VT, MMO);
5148   }
5149 
5150   // We have a divergent offset. Emit a MUBUF buffer load instead. We can
5151   // assume that the buffer is unswizzled.
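  // v8i32/v16i32 results are split into two or four dwordx4 loads at 16-byte
  // offsets and concatenated back together at the end.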
5152   SmallVector<SDValue, 4> Loads;
5153   unsigned NumLoads = 1;
5154   MVT LoadVT = VT.getSimpleVT();
5155   unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
5156   assert((LoadVT.getScalarType() == MVT::i32 ||
5157           LoadVT.getScalarType() == MVT::f32) &&
5158          isPowerOf2_32(NumElts));
5159 
5160   if (NumElts == 8 || NumElts == 16) {
5161     NumLoads = NumElts == 16 ? 4 : 2;
5162     LoadVT = MVT::v4i32;
5163   }
5164 
5165   SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});
5166   unsigned CachePolicy = cast<ConstantSDNode>(GLC)->getZExtValue();
5167   SDValue Ops[] = {
5168       DAG.getEntryNode(),                         // Chain
5169       Rsrc,                                       // rsrc
5170       DAG.getConstant(0, DL, MVT::i32),           // vindex
5171       {},                                         // voffset
5172       {},                                         // soffset
5173       {},                                         // offset
5174       DAG.getConstant(CachePolicy, DL, MVT::i32), // cachepolicy
5175       DAG.getConstant(0, DL, MVT::i1),            // idxen
5176   };
5177 
5178   // Use the alignment to ensure that the required offsets will fit into the
5179   // immediate offsets.
5180   setBufferOffsets(Offset, DAG, &Ops[3], NumLoads > 1 ? 16 * NumLoads : 4);
5181 
5182   uint64_t InstOffset = cast<ConstantSDNode>(Ops[5])->getZExtValue();
5183   for (unsigned i = 0; i < NumLoads; ++i) {
5184     Ops[5] = DAG.getConstant(InstOffset + 16 * i, DL, MVT::i32);
5185     Loads.push_back(DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList,
5186                                             Ops, LoadVT, MMO));
5187   }
5188 
5189   if (VT == MVT::v8i32 || VT == MVT::v16i32)
5190     return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
5191 
5192   return Loads[0];
5193 }
5194 
5195 SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
5196                                                   SelectionDAG &DAG) const {
5197   MachineFunction &MF = DAG.getMachineFunction();
5198   auto MFI = MF.getInfo<SIMachineFunctionInfo>();
5199 
5200   EVT VT = Op.getValueType();
5201   SDLoc DL(Op);
5202   unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
5203 
5204   // TODO: Should this propagate fast-math-flags?
5205 
5206   switch (IntrinsicID) {
5207   case Intrinsic::amdgcn_implicit_buffer_ptr: {
5208     if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
5209       return emitNonHSAIntrinsicError(DAG, DL, VT);
5210     return getPreloadedValue(DAG, *MFI, VT,
5211                              AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
5212   }
5213   case Intrinsic::amdgcn_dispatch_ptr:
5214   case Intrinsic::amdgcn_queue_ptr: {
5215     if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
5216       DiagnosticInfoUnsupported BadIntrin(
5217           MF.getFunction(), "unsupported hsa intrinsic without hsa target",
5218           DL.getDebugLoc());
5219       DAG.getContext()->diagnose(BadIntrin);
5220       return DAG.getUNDEF(VT);
5221     }
5222 
5223     auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
5224       AMDGPUFunctionArgInfo::DISPATCH_PTR : AMDGPUFunctionArgInfo::QUEUE_PTR;
5225     return getPreloadedValue(DAG, *MFI, VT, RegID);
5226   }
5227   case Intrinsic::amdgcn_implicitarg_ptr: {
5228     if (MFI->isEntryFunction())
5229       return getImplicitArgPtr(DAG, DL);
5230     return getPreloadedValue(DAG, *MFI, VT,
5231                              AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
5232   }
5233   case Intrinsic::amdgcn_kernarg_segment_ptr: {
5234     return getPreloadedValue(DAG, *MFI, VT,
5235                              AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
5236   }
5237   case Intrinsic::amdgcn_dispatch_id: {
5238     return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
5239   }
5240   case Intrinsic::amdgcn_rcp:
5241     return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
5242   case Intrinsic::amdgcn_rsq:
5243     return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
5244   case Intrinsic::amdgcn_rsq_legacy:
5245     if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
5246       return emitRemovedIntrinsicError(DAG, DL, VT);
5247 
5248     return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
5249   case Intrinsic::amdgcn_rcp_legacy:
5250     if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
5251       return emitRemovedIntrinsicError(DAG, DL, VT);
5252     return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
5253   case Intrinsic::amdgcn_rsq_clamp: {
5254     if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
5255       return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
5256 
5257     Type *Type = VT.getTypeForEVT(*DAG.getContext());
5258     APFloat Max = APFloat::getLargest(Type->getFltSemantics());
5259     APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
5260 
5261     SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
5262     SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq,
5263                               DAG.getConstantFP(Max, DL, VT));
5264     return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
5265                        DAG.getConstantFP(Min, DL, VT));
5266   }
5267   case Intrinsic::r600_read_ngroups_x:
5268     if (Subtarget->isAmdHsaOS())
5269       return emitNonHSAIntrinsicError(DAG, DL, VT);
5270 
5271     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
5272                                     SI::KernelInputOffsets::NGROUPS_X, 4, false);
5273   case Intrinsic::r600_read_ngroups_y:
5274     if (Subtarget->isAmdHsaOS())
5275       return emitNonHSAIntrinsicError(DAG, DL, VT);
5276 
5277     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
5278                                     SI::KernelInputOffsets::NGROUPS_Y, 4, false);
5279   case Intrinsic::r600_read_ngroups_z:
5280     if (Subtarget->isAmdHsaOS())
5281       return emitNonHSAIntrinsicError(DAG, DL, VT);
5282 
5283     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
5284                                     SI::KernelInputOffsets::NGROUPS_Z, 4, false);
5285   case Intrinsic::r600_read_global_size_x:
5286     if (Subtarget->isAmdHsaOS())
5287       return emitNonHSAIntrinsicError(DAG, DL, VT);
5288 
5289     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
5290                                     SI::KernelInputOffsets::GLOBAL_SIZE_X, 4, false);
5291   case Intrinsic::r600_read_global_size_y:
5292     if (Subtarget->isAmdHsaOS())
5293       return emitNonHSAIntrinsicError(DAG, DL, VT);
5294 
5295     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
5296                                     SI::KernelInputOffsets::GLOBAL_SIZE_Y, 4, false);
5297   case Intrinsic::r600_read_global_size_z:
5298     if (Subtarget->isAmdHsaOS())
5299       return emitNonHSAIntrinsicError(DAG, DL, VT);
5300 
5301     return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
5302                                     SI::KernelInputOffsets::GLOBAL_SIZE_Z, 4, false);
5303   case Intrinsic::r600_read_local_size_x:
5304     if (Subtarget->isAmdHsaOS())
5305       return emitNonHSAIntrinsicError(DAG, DL, VT);
5306 
5307     return lowerImplicitZextParam(DAG, Op, MVT::i16,
5308                                   SI::KernelInputOffsets::LOCAL_SIZE_X);
5309   case Intrinsic::r600_read_local_size_y:
5310     if (Subtarget->isAmdHsaOS())
5311       return emitNonHSAIntrinsicError(DAG, DL, VT);
5312 
5313     return lowerImplicitZextParam(DAG, Op, MVT::i16,
5314                                   SI::KernelInputOffsets::LOCAL_SIZE_Y);
5315   case Intrinsic::r600_read_local_size_z:
5316     if (Subtarget->isAmdHsaOS())
5317       return emitNonHSAIntrinsicError(DAG, DL, VT);
5318 
5319     return lowerImplicitZextParam(DAG, Op, MVT::i16,
5320                                   SI::KernelInputOffsets::LOCAL_SIZE_Z);
5321   case Intrinsic::amdgcn_workgroup_id_x:
5322   case Intrinsic::r600_read_tgid_x:
5323     return getPreloadedValue(DAG, *MFI, VT,
5324                              AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
5325   case Intrinsic::amdgcn_workgroup_id_y:
5326   case Intrinsic::r600_read_tgid_y:
5327     return getPreloadedValue(DAG, *MFI, VT,
5328                              AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
5329   case Intrinsic::amdgcn_workgroup_id_z:
5330   case Intrinsic::r600_read_tgid_z:
5331     return getPreloadedValue(DAG, *MFI, VT,
5332                              AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
5333   case Intrinsic::amdgcn_workitem_id_x:
5334   case Intrinsic::r600_read_tidig_x:
5335     return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
5336                           SDLoc(DAG.getEntryNode()),
5337                           MFI->getArgInfo().WorkItemIDX);
5338   case Intrinsic::amdgcn_workitem_id_y:
5339   case Intrinsic::r600_read_tidig_y:
5340     return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
5341                           SDLoc(DAG.getEntryNode()),
5342                           MFI->getArgInfo().WorkItemIDY);
5343   case Intrinsic::amdgcn_workitem_id_z:
5344   case Intrinsic::r600_read_tidig_z:
5345     return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
5346                           SDLoc(DAG.getEntryNode()),
5347                           MFI->getArgInfo().WorkItemIDZ);
5348   case Intrinsic::amdgcn_s_buffer_load: {
5349     unsigned Cache = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
5350     return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
5351                         DAG.getTargetConstant(Cache & 1, DL, MVT::i1), DAG);
5352   }
5353   case Intrinsic::amdgcn_fdiv_fast:
5354     return lowerFDIV_FAST(Op, DAG);
5355   case Intrinsic::amdgcn_interp_mov: {
5356     SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
5357     SDValue Glue = M0.getValue(1);
5358     return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, Op.getOperand(1),
5359                        Op.getOperand(2), Op.getOperand(3), Glue);
5360   }
5361   case Intrinsic::amdgcn_interp_p1: {
5362     SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
5363     SDValue Glue = M0.getValue(1);
5364     return DAG.getNode(AMDGPUISD::INTERP_P1, DL, MVT::f32, Op.getOperand(1),
5365                        Op.getOperand(2), Op.getOperand(3), Glue);
5366   }
5367   case Intrinsic::amdgcn_interp_p2: {
5368     SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5));
5369     SDValue Glue = SDValue(M0.getNode(), 1);
5370     return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, Op.getOperand(1),
5371                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
5372                        Glue);
5373   }
5374   case Intrinsic::amdgcn_interp_p1_f16: {
5375     SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5));
5376     SDValue Glue = M0.getValue(1);
5377     if (getSubtarget()->getLDSBankCount() == 16) {
5378       // 16 bank LDS
5379       SDValue S = DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32,
5380                               DAG.getConstant(2, DL, MVT::i32), // P0
5381                               Op.getOperand(2), // Attrchan
5382                               Op.getOperand(3), // Attr
5383                               Glue);
5384       SDValue Ops[] = {
5385         Op.getOperand(1), // Src0
5386         Op.getOperand(2), // Attrchan
5387         Op.getOperand(3), // Attr
5388         DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers
5389         S, // Src2 - holds two f16 values selected by high
5390         DAG.getConstant(0, DL, MVT::i32), // $src2_modifiers
5391         Op.getOperand(4), // high
5392         DAG.getConstant(0, DL, MVT::i1), // $clamp
5393         DAG.getConstant(0, DL, MVT::i32) // $omod
5394       };
5395       return DAG.getNode(AMDGPUISD::INTERP_P1LV_F16, DL, MVT::f32, Ops);
5396     } else {
5397       // 32 bank LDS
5398       SDValue Ops[] = {
5399         Op.getOperand(1), // Src0
5400         Op.getOperand(2), // Attrchan
5401         Op.getOperand(3), // Attr
5402         DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers
5403         Op.getOperand(4), // high
5404         DAG.getConstant(0, DL, MVT::i1), // $clamp
5405         DAG.getConstant(0, DL, MVT::i32), // $omod
5406         Glue
5407       };
5408       return DAG.getNode(AMDGPUISD::INTERP_P1LL_F16, DL, MVT::f32, Ops);
5409     }
5410   }
5411   case Intrinsic::amdgcn_interp_p2_f16: {
5412     SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(6));
5413     SDValue Glue = SDValue(M0.getNode(), 1);
5414     SDValue Ops[] = {
5415       Op.getOperand(2), // Src0
5416       Op.getOperand(3), // Attrchan
5417       Op.getOperand(4), // Attr
5418       DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers
5419       Op.getOperand(1), // Src2
5420       DAG.getConstant(0, DL, MVT::i32), // $src2_modifiers
5421       Op.getOperand(5), // high
5422       DAG.getConstant(0, DL, MVT::i1), // $clamp
5423       Glue
5424     };
5425     return DAG.getNode(AMDGPUISD::INTERP_P2_F16, DL, MVT::f16, Ops);
5426   }
5427   case Intrinsic::amdgcn_sin:
5428     return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
5429 
5430   case Intrinsic::amdgcn_cos:
5431     return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
5432 
5433   case Intrinsic::amdgcn_log_clamp: {
5434     if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
5435       return SDValue();
5436 
    DiagnosticInfoUnsupported BadIntrin(
      MF.getFunction(), "intrinsic not supported on subtarget",
      DL.getDebugLoc());
    DAG.getContext()->diagnose(BadIntrin);
    return DAG.getUNDEF(VT);
5442   }
5443   case Intrinsic::amdgcn_ldexp:
5444     return DAG.getNode(AMDGPUISD::LDEXP, DL, VT,
5445                        Op.getOperand(1), Op.getOperand(2));
5446 
5447   case Intrinsic::amdgcn_fract:
5448     return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
5449 
5450   case Intrinsic::amdgcn_class:
5451     return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
5452                        Op.getOperand(1), Op.getOperand(2));
5453   case Intrinsic::amdgcn_div_fmas:
5454     return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
5455                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
5456                        Op.getOperand(4));
5457 
5458   case Intrinsic::amdgcn_div_fixup:
5459     return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
5460                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5461 
5462   case Intrinsic::amdgcn_trig_preop:
5463     return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT,
5464                        Op.getOperand(1), Op.getOperand(2));
5465   case Intrinsic::amdgcn_div_scale: {
5466     const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
5467 
    // Translate to the operands expected by the machine instruction. The
    // first source operand must be a copy of either the numerator or the
    // denominator, selected by the constant operand.
5470     SDValue Numerator = Op.getOperand(1);
5471     SDValue Denominator = Op.getOperand(2);
5472 
    // Note this order is the opposite of the machine instruction's operand
    // order, which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator.
    // The intrinsic has the numerator as the first operand to match a normal
    // division operation.
5477 
5478     SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator;
5479 
5480     return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
5481                        Denominator, Numerator);
5482   }
5483   case Intrinsic::amdgcn_icmp: {
5484     // There is a Pat that handles this variant, so return it as-is.
5485     if (Op.getOperand(1).getValueType() == MVT::i1 &&
5486         Op.getConstantOperandVal(2) == 0 &&
5487         Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
5488       return Op;
5489     return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
5490   }
5491   case Intrinsic::amdgcn_fcmp: {
5492     return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
5493   }
5494   case Intrinsic::amdgcn_fmed3:
5495     return DAG.getNode(AMDGPUISD::FMED3, DL, VT,
5496                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5497   case Intrinsic::amdgcn_fdot2:
5498     return DAG.getNode(AMDGPUISD::FDOT2, DL, VT,
5499                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
5500                        Op.getOperand(4));
5501   case Intrinsic::amdgcn_fmul_legacy:
5502     return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
5503                        Op.getOperand(1), Op.getOperand(2));
5504   case Intrinsic::amdgcn_sffbh:
5505     return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
5506   case Intrinsic::amdgcn_sbfe:
5507     return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
5508                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5509   case Intrinsic::amdgcn_ubfe:
5510     return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
5511                        Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
5512   case Intrinsic::amdgcn_cvt_pkrtz:
5513   case Intrinsic::amdgcn_cvt_pknorm_i16:
5514   case Intrinsic::amdgcn_cvt_pknorm_u16:
5515   case Intrinsic::amdgcn_cvt_pk_i16:
5516   case Intrinsic::amdgcn_cvt_pk_u16: {
5517     // FIXME: Stop adding cast if v2f16/v2i16 are legal.
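    // The conversion produces a packed 32-bit result; when the packed vector
    // type is not yet legal, the node is built as an i32 and bitcast to the
    // expected result type below.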
5518     EVT VT = Op.getValueType();
5519     unsigned Opcode;
5520 
5521     if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
5522       Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
5523     else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
5524       Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
5525     else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
5526       Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
5527     else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
5528       Opcode = AMDGPUISD::CVT_PK_I16_I32;
5529     else
5530       Opcode = AMDGPUISD::CVT_PK_U16_U32;
5531 
5532     if (isTypeLegal(VT))
5533       return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
5534 
5535     SDValue Node = DAG.getNode(Opcode, DL, MVT::i32,
5536                                Op.getOperand(1), Op.getOperand(2));
5537     return DAG.getNode(ISD::BITCAST, DL, VT, Node);
5538   }
5539   case Intrinsic::amdgcn_wqm: {
5540     SDValue Src = Op.getOperand(1);
5541     return SDValue(DAG.getMachineNode(AMDGPU::WQM, DL, Src.getValueType(), Src),
5542                    0);
5543   }
5544   case Intrinsic::amdgcn_wwm: {
5545     SDValue Src = Op.getOperand(1);
5546     return SDValue(DAG.getMachineNode(AMDGPU::WWM, DL, Src.getValueType(), Src),
5547                    0);
5548   }
5549   case Intrinsic::amdgcn_fmad_ftz:
5550     return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
5551                        Op.getOperand(2), Op.getOperand(3));
5552   default:
5553     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
5554             AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
5555       return lowerImage(Op, ImageDimIntr, DAG);
5556 
5557     return Op;
5558   }
5559 }
5560 
5561 SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
5562                                                  SelectionDAG &DAG) const {
5563   unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
5564   SDLoc DL(Op);
5565 
5566   switch (IntrID) {
5567   case Intrinsic::amdgcn_ds_ordered_add:
5568   case Intrinsic::amdgcn_ds_ordered_swap: {
5569     MemSDNode *M = cast<MemSDNode>(Op);
5570     SDValue Chain = M->getOperand(0);
5571     SDValue M0 = M->getOperand(2);
5572     SDValue Value = M->getOperand(3);
5573     unsigned OrderedCountIndex = M->getConstantOperandVal(7);
5574     unsigned WaveRelease = M->getConstantOperandVal(8);
5575     unsigned WaveDone = M->getConstantOperandVal(9);
5576     unsigned ShaderType;
5577     unsigned Instruction;
5578 
5579     switch (IntrID) {
5580     case Intrinsic::amdgcn_ds_ordered_add:
5581       Instruction = 0;
5582       break;
5583     case Intrinsic::amdgcn_ds_ordered_swap:
5584       Instruction = 1;
5585       break;
5586     }
5587 
5588     if (WaveDone && !WaveRelease)
5589       report_fatal_error("ds_ordered_count: wave_done requires wave_release");
5590 
5591     switch (DAG.getMachineFunction().getFunction().getCallingConv()) {
5592     case CallingConv::AMDGPU_CS:
5593     case CallingConv::AMDGPU_KERNEL:
5594       ShaderType = 0;
5595       break;
5596     case CallingConv::AMDGPU_PS:
5597       ShaderType = 1;
5598       break;
5599     case CallingConv::AMDGPU_VS:
5600       ShaderType = 2;
5601       break;
5602     case CallingConv::AMDGPU_GS:
5603       ShaderType = 3;
5604       break;
5605     default:
5606       report_fatal_error("ds_ordered_count unsupported for this calling conv");
5607     }
5608 
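    // Pack the immediate DS offset: bits [7:2] hold the ordered count index,
    // bit 8 wave_release, bit 9 wave_done, bits [11:10] the shader type, and
    // bit 12 the instruction selector (0 = add, 1 = swap).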
5609     unsigned Offset0 = OrderedCountIndex << 2;
5610     unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) |
5611                        (Instruction << 4);
5612     unsigned Offset = Offset0 | (Offset1 << 8);
5613 
5614     SDValue Ops[] = {
5615       Chain,
5616       Value,
5617       DAG.getTargetConstant(Offset, DL, MVT::i16),
5618       copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
5619     };
5620     return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
5621                                    M->getVTList(), Ops, M->getMemoryVT(),
5622                                    M->getMemOperand());
5623   }
5624   case Intrinsic::amdgcn_ds_fadd: {
5625     MemSDNode *M = cast<MemSDNode>(Op);
    unsigned Opc;
    switch (IntrID) {
    case Intrinsic::amdgcn_ds_fadd:
      Opc = ISD::ATOMIC_LOAD_FADD;
      break;
    default:
      llvm_unreachable("Unknown intrinsic!");
    }
5632 
5633     return DAG.getAtomic(Opc, SDLoc(Op), M->getMemoryVT(),
5634                          M->getOperand(0), M->getOperand(2), M->getOperand(3),
5635                          M->getMemOperand());
5636   }
5637   case Intrinsic::amdgcn_atomic_inc:
5638   case Intrinsic::amdgcn_atomic_dec:
5639   case Intrinsic::amdgcn_ds_fmin:
5640   case Intrinsic::amdgcn_ds_fmax: {
5641     MemSDNode *M = cast<MemSDNode>(Op);
5642     unsigned Opc;
5643     switch (IntrID) {
5644     case Intrinsic::amdgcn_atomic_inc:
5645       Opc = AMDGPUISD::ATOMIC_INC;
5646       break;
5647     case Intrinsic::amdgcn_atomic_dec:
5648       Opc = AMDGPUISD::ATOMIC_DEC;
5649       break;
5650     case Intrinsic::amdgcn_ds_fmin:
5651       Opc = AMDGPUISD::ATOMIC_LOAD_FMIN;
5652       break;
5653     case Intrinsic::amdgcn_ds_fmax:
5654       Opc = AMDGPUISD::ATOMIC_LOAD_FMAX;
5655       break;
5656     default:
5657       llvm_unreachable("Unknown intrinsic!");
5658     }
5659     SDValue Ops[] = {
5660       M->getOperand(0), // Chain
5661       M->getOperand(2), // Ptr
5662       M->getOperand(3)  // Value
5663     };
5664 
5665     return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops,
5666                                    M->getMemoryVT(), M->getMemOperand());
5667   }
5668   case Intrinsic::amdgcn_buffer_load:
5669   case Intrinsic::amdgcn_buffer_load_format: {
5670     unsigned Glc = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue();
5671     unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
5672     unsigned IdxEn = 1;
5673     if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(3)))
5674       IdxEn = Idx->getZExtValue() != 0;
5675     SDValue Ops[] = {
5676       Op.getOperand(0), // Chain
5677       Op.getOperand(2), // rsrc
5678       Op.getOperand(3), // vindex
5679       SDValue(),        // voffset -- will be set by setBufferOffsets
5680       SDValue(),        // soffset -- will be set by setBufferOffsets
5681       SDValue(),        // offset -- will be set by setBufferOffsets
5682       DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
5683       DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
5684     };
5685 
5686     setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]);
5687     unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
5688         AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
5689 
5690     EVT VT = Op.getValueType();
5691     EVT IntVT = VT.changeTypeToInteger();
5692     auto *M = cast<MemSDNode>(Op);
5693     EVT LoadVT = Op.getValueType();
5694 
5695     if (LoadVT.getScalarType() == MVT::f16)
5696       return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
5697                                  M, DAG, Ops);
5698 
5699     // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
5700     if (LoadVT.getScalarType() == MVT::i8 ||
5701         LoadVT.getScalarType() == MVT::i16)
5702       return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);
5703 
5704     return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
5705                                M->getMemOperand(), DAG);
5706   }
5707   case Intrinsic::amdgcn_raw_buffer_load:
5708   case Intrinsic::amdgcn_raw_buffer_load_format: {
5709     auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
5710     SDValue Ops[] = {
5711       Op.getOperand(0), // Chain
5712       Op.getOperand(2), // rsrc
5713       DAG.getConstant(0, DL, MVT::i32), // vindex
5714       Offsets.first,    // voffset
5715       Op.getOperand(4), // soffset
5716       Offsets.second,   // offset
5717       Op.getOperand(5), // cachepolicy
5718       DAG.getConstant(0, DL, MVT::i1), // idxen
5719     };
5720 
5721     unsigned Opc = (IntrID == Intrinsic::amdgcn_raw_buffer_load) ?
5722         AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
5723 
5724     EVT VT = Op.getValueType();
5725     EVT IntVT = VT.changeTypeToInteger();
5726     auto *M = cast<MemSDNode>(Op);
5727     EVT LoadVT = Op.getValueType();
5728 
5729     if (LoadVT.getScalarType() == MVT::f16)
5730       return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
5731                                  M, DAG, Ops);
5732 
5733     // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
5734     if (LoadVT.getScalarType() == MVT::i8 ||
5735         LoadVT.getScalarType() == MVT::i16)
5736       return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);
5737 
5738     return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
5739                                M->getMemOperand(), DAG);
5740   }
5741   case Intrinsic::amdgcn_struct_buffer_load:
5742   case Intrinsic::amdgcn_struct_buffer_load_format: {
5743     auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
5744     SDValue Ops[] = {
5745       Op.getOperand(0), // Chain
5746       Op.getOperand(2), // rsrc
5747       Op.getOperand(3), // vindex
5748       Offsets.first,    // voffset
5749       Op.getOperand(5), // soffset
5750       Offsets.second,   // offset
5751       Op.getOperand(6), // cachepolicy
5752       DAG.getConstant(1, DL, MVT::i1), // idxen
5753     };
5754 
5755     unsigned Opc = (IntrID == Intrinsic::amdgcn_struct_buffer_load) ?
5756         AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
5757 
5758     EVT VT = Op.getValueType();
5759     EVT IntVT = VT.changeTypeToInteger();
5760     auto *M = cast<MemSDNode>(Op);
5761     EVT LoadVT = Op.getValueType();
5762 
5763     if (LoadVT.getScalarType() == MVT::f16)
5764       return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
5765                                  M, DAG, Ops);
5766 
5767     // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
5768     if (LoadVT.getScalarType() == MVT::i8 ||
5769         LoadVT.getScalarType() == MVT::i16)
5770       return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);
5771 
5772     return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
5773                                M->getMemOperand(), DAG);
5774   }
5775   case Intrinsic::amdgcn_tbuffer_load: {
5776     MemSDNode *M = cast<MemSDNode>(Op);
5777     EVT LoadVT = Op.getValueType();
5778 
5779     unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
5780     unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
5781     unsigned Glc = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
5782     unsigned Slc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue();
5783     unsigned IdxEn = 1;
5784     if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(3)))
5785       IdxEn = Idx->getZExtValue() != 0;
5786     SDValue Ops[] = {
5787       Op.getOperand(0),  // Chain
5788       Op.getOperand(2),  // rsrc
5789       Op.getOperand(3),  // vindex
5790       Op.getOperand(4),  // voffset
5791       Op.getOperand(5),  // soffset
5792       Op.getOperand(6),  // offset
5793       DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
5794       DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
5795       DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
5796     };
5797 
5798     if (LoadVT.getScalarType() == MVT::f16)
5799       return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
5800                                  M, DAG, Ops);
5801     return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
5802                                Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
5803                                DAG);
5804   }
5805   case Intrinsic::amdgcn_raw_tbuffer_load: {
5806     MemSDNode *M = cast<MemSDNode>(Op);
5807     EVT LoadVT = Op.getValueType();
5808     auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
5809 
5810     SDValue Ops[] = {
5811       Op.getOperand(0),  // Chain
5812       Op.getOperand(2),  // rsrc
5813       DAG.getConstant(0, DL, MVT::i32), // vindex
5814       Offsets.first,     // voffset
5815       Op.getOperand(4),  // soffset
5816       Offsets.second,    // offset
5817       Op.getOperand(5),  // format
5818       Op.getOperand(6),  // cachepolicy
5819       DAG.getConstant(0, DL, MVT::i1), // idxen
5820     };
5821 
5822     if (LoadVT.getScalarType() == MVT::f16)
5823       return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
5824                                  M, DAG, Ops);
5825     return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
5826                                Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
5827                                DAG);
5828   }
5829   case Intrinsic::amdgcn_struct_tbuffer_load: {
5830     MemSDNode *M = cast<MemSDNode>(Op);
5831     EVT LoadVT = Op.getValueType();
5832     auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
5833 
5834     SDValue Ops[] = {
5835       Op.getOperand(0),  // Chain
5836       Op.getOperand(2),  // rsrc
5837       Op.getOperand(3),  // vindex
5838       Offsets.first,     // voffset
5839       Op.getOperand(5),  // soffset
5840       Offsets.second,    // offset
5841       Op.getOperand(6),  // format
5842       Op.getOperand(7),  // cachepolicy
5843       DAG.getConstant(1, DL, MVT::i1), // idxen
5844     };
5845 
5846     if (LoadVT.getScalarType() == MVT::f16)
5847       return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
5848                                  M, DAG, Ops);
5849     return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
5850                                Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
5851                                DAG);
5852   }
5853   case Intrinsic::amdgcn_buffer_atomic_swap:
5854   case Intrinsic::amdgcn_buffer_atomic_add:
5855   case Intrinsic::amdgcn_buffer_atomic_sub:
5856   case Intrinsic::amdgcn_buffer_atomic_smin:
5857   case Intrinsic::amdgcn_buffer_atomic_umin:
5858   case Intrinsic::amdgcn_buffer_atomic_smax:
5859   case Intrinsic::amdgcn_buffer_atomic_umax:
5860   case Intrinsic::amdgcn_buffer_atomic_and:
5861   case Intrinsic::amdgcn_buffer_atomic_or:
5862   case Intrinsic::amdgcn_buffer_atomic_xor: {
5863     unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
5864     unsigned IdxEn = 1;
5865     if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
5866       IdxEn = Idx->getZExtValue() != 0;
5867     SDValue Ops[] = {
5868       Op.getOperand(0), // Chain
5869       Op.getOperand(2), // vdata
5870       Op.getOperand(3), // rsrc
5871       Op.getOperand(4), // vindex
5872       SDValue(),        // voffset -- will be set by setBufferOffsets
5873       SDValue(),        // soffset -- will be set by setBufferOffsets
5874       SDValue(),        // offset -- will be set by setBufferOffsets
5875       DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy
5876       DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
5877     };
5878     setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
5879     EVT VT = Op.getValueType();
5880 
5881     auto *M = cast<MemSDNode>(Op);
5882     unsigned Opcode = 0;
5883 
5884     switch (IntrID) {
5885     case Intrinsic::amdgcn_buffer_atomic_swap:
5886       Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
5887       break;
5888     case Intrinsic::amdgcn_buffer_atomic_add:
5889       Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
5890       break;
5891     case Intrinsic::amdgcn_buffer_atomic_sub:
5892       Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
5893       break;
5894     case Intrinsic::amdgcn_buffer_atomic_smin:
5895       Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
5896       break;
5897     case Intrinsic::amdgcn_buffer_atomic_umin:
5898       Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
5899       break;
5900     case Intrinsic::amdgcn_buffer_atomic_smax:
5901       Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
5902       break;
5903     case Intrinsic::amdgcn_buffer_atomic_umax:
5904       Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
5905       break;
5906     case Intrinsic::amdgcn_buffer_atomic_and:
5907       Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
5908       break;
5909     case Intrinsic::amdgcn_buffer_atomic_or:
5910       Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
5911       break;
5912     case Intrinsic::amdgcn_buffer_atomic_xor:
5913       Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
5914       break;
5915     default:
5916       llvm_unreachable("unhandled atomic opcode");
5917     }
5918 
5919     return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
5920                                    M->getMemOperand());
5921   }
5922   case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5923   case Intrinsic::amdgcn_raw_buffer_atomic_add:
5924   case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5925   case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5926   case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5927   case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5928   case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5929   case Intrinsic::amdgcn_raw_buffer_atomic_and:
5930   case Intrinsic::amdgcn_raw_buffer_atomic_or:
5931   case Intrinsic::amdgcn_raw_buffer_atomic_xor: {
5932     auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
5933     SDValue Ops[] = {
5934       Op.getOperand(0), // Chain
5935       Op.getOperand(2), // vdata
5936       Op.getOperand(3), // rsrc
5937       DAG.getConstant(0, DL, MVT::i32), // vindex
5938       Offsets.first,    // voffset
5939       Op.getOperand(5), // soffset
5940       Offsets.second,   // offset
5941       Op.getOperand(6), // cachepolicy
5942       DAG.getConstant(0, DL, MVT::i1), // idxen
5943     };
5944     EVT VT = Op.getValueType();
5945 
5946     auto *M = cast<MemSDNode>(Op);
5947     unsigned Opcode = 0;
5948 
5949     switch (IntrID) {
5950     case Intrinsic::amdgcn_raw_buffer_atomic_swap:
5951       Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
5952       break;
5953     case Intrinsic::amdgcn_raw_buffer_atomic_add:
5954       Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
5955       break;
5956     case Intrinsic::amdgcn_raw_buffer_atomic_sub:
5957       Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
5958       break;
5959     case Intrinsic::amdgcn_raw_buffer_atomic_smin:
5960       Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
5961       break;
5962     case Intrinsic::amdgcn_raw_buffer_atomic_umin:
5963       Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
5964       break;
5965     case Intrinsic::amdgcn_raw_buffer_atomic_smax:
5966       Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
5967       break;
5968     case Intrinsic::amdgcn_raw_buffer_atomic_umax:
5969       Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
5970       break;
5971     case Intrinsic::amdgcn_raw_buffer_atomic_and:
5972       Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
5973       break;
5974     case Intrinsic::amdgcn_raw_buffer_atomic_or:
5975       Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
5976       break;
5977     case Intrinsic::amdgcn_raw_buffer_atomic_xor:
5978       Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
5979       break;
5980     default:
5981       llvm_unreachable("unhandled atomic opcode");
5982     }
5983 
5984     return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
5985                                    M->getMemOperand());
5986   }
5987   case Intrinsic::amdgcn_struct_buffer_atomic_swap:
5988   case Intrinsic::amdgcn_struct_buffer_atomic_add:
5989   case Intrinsic::amdgcn_struct_buffer_atomic_sub:
5990   case Intrinsic::amdgcn_struct_buffer_atomic_smin:
5991   case Intrinsic::amdgcn_struct_buffer_atomic_umin:
5992   case Intrinsic::amdgcn_struct_buffer_atomic_smax:
5993   case Intrinsic::amdgcn_struct_buffer_atomic_umax:
5994   case Intrinsic::amdgcn_struct_buffer_atomic_and:
5995   case Intrinsic::amdgcn_struct_buffer_atomic_or:
5996   case Intrinsic::amdgcn_struct_buffer_atomic_xor: {
5997     auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
5998     SDValue Ops[] = {
5999       Op.getOperand(0), // Chain
6000       Op.getOperand(2), // vdata
6001       Op.getOperand(3), // rsrc
6002       Op.getOperand(4), // vindex
6003       Offsets.first,    // voffset
6004       Op.getOperand(6), // soffset
6005       Offsets.second,   // offset
6006       Op.getOperand(7), // cachepolicy
6007       DAG.getConstant(1, DL, MVT::i1), // idxen
6008     };
6009     EVT VT = Op.getValueType();
6010 
6011     auto *M = cast<MemSDNode>(Op);
6012     unsigned Opcode = 0;
6013 
6014     switch (IntrID) {
6015     case Intrinsic::amdgcn_struct_buffer_atomic_swap:
6016       Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
6017       break;
6018     case Intrinsic::amdgcn_struct_buffer_atomic_add:
6019       Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
6020       break;
6021     case Intrinsic::amdgcn_struct_buffer_atomic_sub:
6022       Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
6023       break;
6024     case Intrinsic::amdgcn_struct_buffer_atomic_smin:
6025       Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
6026       break;
6027     case Intrinsic::amdgcn_struct_buffer_atomic_umin:
6028       Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
6029       break;
6030     case Intrinsic::amdgcn_struct_buffer_atomic_smax:
6031       Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
6032       break;
6033     case Intrinsic::amdgcn_struct_buffer_atomic_umax:
6034       Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
6035       break;
6036     case Intrinsic::amdgcn_struct_buffer_atomic_and:
6037       Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
6038       break;
6039     case Intrinsic::amdgcn_struct_buffer_atomic_or:
6040       Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
6041       break;
6042     case Intrinsic::amdgcn_struct_buffer_atomic_xor:
6043       Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
6044       break;
6045     default:
6046       llvm_unreachable("unhandled atomic opcode");
6047     }
6048 
6049     return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
6050                                    M->getMemOperand());
6051   }
6052   case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
6053     unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
6054     unsigned IdxEn = 1;
6055     if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(5)))
6056       IdxEn = Idx->getZExtValue() != 0;
6057     SDValue Ops[] = {
6058       Op.getOperand(0), // Chain
6059       Op.getOperand(2), // src
6060       Op.getOperand(3), // cmp
6061       Op.getOperand(4), // rsrc
6062       Op.getOperand(5), // vindex
6063       SDValue(),        // voffset -- will be set by setBufferOffsets
6064       SDValue(),        // soffset -- will be set by setBufferOffsets
6065       SDValue(),        // offset -- will be set by setBufferOffsets
6066       DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy
6067       DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
6068     };
6069     setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]);
6070     EVT VT = Op.getValueType();
6071     auto *M = cast<MemSDNode>(Op);
6072 
6073     return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
6074                                    Op->getVTList(), Ops, VT, M->getMemOperand());
6075   }
6076   case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: {
6077     auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
6078     SDValue Ops[] = {
6079       Op.getOperand(0), // Chain
6080       Op.getOperand(2), // src
6081       Op.getOperand(3), // cmp
6082       Op.getOperand(4), // rsrc
6083       DAG.getConstant(0, DL, MVT::i32), // vindex
6084       Offsets.first,    // voffset
6085       Op.getOperand(6), // soffset
6086       Offsets.second,   // offset
6087       Op.getOperand(7), // cachepolicy
6088       DAG.getConstant(0, DL, MVT::i1), // idxen
6089     };
6090     EVT VT = Op.getValueType();
6091     auto *M = cast<MemSDNode>(Op);
6092 
6093     return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
6094                                    Op->getVTList(), Ops, VT, M->getMemOperand());
6095   }
6096   case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: {
6097     auto Offsets = splitBufferOffsets(Op.getOperand(6), DAG);
6098     SDValue Ops[] = {
6099       Op.getOperand(0), // Chain
6100       Op.getOperand(2), // src
6101       Op.getOperand(3), // cmp
6102       Op.getOperand(4), // rsrc
6103       Op.getOperand(5), // vindex
6104       Offsets.first,    // voffset
6105       Op.getOperand(7), // soffset
6106       Offsets.second,   // offset
6107       Op.getOperand(8), // cachepolicy
6108       DAG.getConstant(1, DL, MVT::i1), // idxen
6109     };
6110     EVT VT = Op.getValueType();
6111     auto *M = cast<MemSDNode>(Op);
6112 
6113     return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
6114                                    Op->getVTList(), Ops, VT, M->getMemOperand());
6115   }
6116 
6117   default:
6118     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
6119             AMDGPU::getImageDimIntrinsicInfo(IntrID))
6120       return lowerImage(Op, ImageDimIntr, DAG);
6121 
6122     return SDValue();
6123   }
6124 }
6125 
6126 // Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
6127 // dwordx4 if on SI.
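// For example, a v3i32 buffer load on a target without dwordx3 load/stores is
// emitted as a v4i32 load whose result is narrowed back to v3i32 with an
// EXTRACT_SUBVECTOR, while the chain result is passed through unchanged.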
6128 SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
6129                                               SDVTList VTList,
6130                                               ArrayRef<SDValue> Ops, EVT MemVT,
6131                                               MachineMemOperand *MMO,
6132                                               SelectionDAG &DAG) const {
6133   EVT VT = VTList.VTs[0];
6134   EVT WidenedVT = VT;
6135   EVT WidenedMemVT = MemVT;
6136   if (!Subtarget->hasDwordx3LoadStores() &&
6137       (WidenedVT == MVT::v3i32 || WidenedVT == MVT::v3f32)) {
6138     WidenedVT = EVT::getVectorVT(*DAG.getContext(),
6139                                  WidenedVT.getVectorElementType(), 4);
6140     WidenedMemVT = EVT::getVectorVT(*DAG.getContext(),
6141                                     WidenedMemVT.getVectorElementType(), 4);
6142     MMO = DAG.getMachineFunction().getMachineMemOperand(MMO, 0, 16);
6143   }
6144 
6145   assert(VTList.NumVTs == 2);
6146   SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
6147 
6148   auto NewOp = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
6149                                        WidenedMemVT, MMO);
6150   if (WidenedVT != VT) {
6151     auto Extract = DAG.getNode(
6152         ISD::EXTRACT_SUBVECTOR, DL, VT, NewOp,
6153         DAG.getConstant(0, DL, getVectorIdxTy(DAG.getDataLayout())));
6154     NewOp = DAG.getMergeValues({ Extract, SDValue(NewOp.getNode(), 1) }, DL);
6155   }
6156   return NewOp;
6157 }
6158 
6159 SDValue SITargetLowering::handleD16VData(SDValue VData,
6160                                          SelectionDAG &DAG) const {
6161   EVT StoreVT = VData.getValueType();
6162 
6163   // No change for f16 and legal vector D16 types.
6164   if (!StoreVT.isVector())
6165     return VData;
6166 
6167   SDLoc DL(VData);
6168   assert((StoreVT.getVectorNumElements() != 3) && "Handle v3f16");
6169 
6170   if (Subtarget->hasUnpackedD16VMem()) {
6171     // We need to unpack the packed data to store.
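    // e.g. a v2f16 value is bitcast to v2i16 and zero-extended to v2i32, and
    // the extension is then unrolled into per-element operations, so each
    // half occupies a full dword.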
6172     EVT IntStoreVT = StoreVT.changeTypeToInteger();
6173     SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
6174 
6175     EVT EquivStoreVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
6176                                         StoreVT.getVectorNumElements());
6177     SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
6178     return DAG.UnrollVectorOp(ZExt.getNode());
6179   }
6180 
6181   assert(isTypeLegal(StoreVT));
6182   return VData;
6183 }
6184 
6185 SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
6186                                               SelectionDAG &DAG) const {
6187   SDLoc DL(Op);
6188   SDValue Chain = Op.getOperand(0);
6189   unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
6190   MachineFunction &MF = DAG.getMachineFunction();
6191 
6192   switch (IntrinsicID) {
6193   case Intrinsic::amdgcn_exp: {
6194     const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2));
6195     const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3));
6196     const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(8));
6197     const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(9));
6198 
6199     const SDValue Ops[] = {
6200       Chain,
6201       DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt
6202       DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8),  // en
6203       Op.getOperand(4), // src0
6204       Op.getOperand(5), // src1
6205       Op.getOperand(6), // src2
6206       Op.getOperand(7), // src3
6207       DAG.getTargetConstant(0, DL, MVT::i1), // compr
6208       DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1)
6209     };
6210 
6211     unsigned Opc = Done->isNullValue() ?
6212       AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
6213     return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
6214   }
6215   case Intrinsic::amdgcn_exp_compr: {
6216     const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2));
6217     const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3));
6218     SDValue Src0 = Op.getOperand(4);
6219     SDValue Src1 = Op.getOperand(5);
6220     const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
6221     const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(7));
6222 
6223     SDValue Undef = DAG.getUNDEF(MVT::f32);
6224     const SDValue Ops[] = {
6225       Chain,
6226       DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt
6227       DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8),  // en
6228       DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0),
6229       DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1),
6230       Undef, // src2
6231       Undef, // src3
6232       DAG.getTargetConstant(1, DL, MVT::i1), // compr
6233       DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1)
6234     };
6235 
6236     unsigned Opc = Done->isNullValue() ?
6237       AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
6238     return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
6239   }
6240   case Intrinsic::amdgcn_s_sendmsg:
6241   case Intrinsic::amdgcn_s_sendmsghalt: {
6242     unsigned NodeOp = (IntrinsicID == Intrinsic::amdgcn_s_sendmsg) ?
6243       AMDGPUISD::SENDMSG : AMDGPUISD::SENDMSGHALT;
6244     Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3));
6245     SDValue Glue = Chain.getValue(1);
6246     return DAG.getNode(NodeOp, DL, MVT::Other, Chain,
6247                        Op.getOperand(2), Glue);
6248   }
6249   case Intrinsic::amdgcn_init_exec: {
6250     return DAG.getNode(AMDGPUISD::INIT_EXEC, DL, MVT::Other, Chain,
6251                        Op.getOperand(2));
6252   }
6253   case Intrinsic::amdgcn_init_exec_from_input: {
6254     return DAG.getNode(AMDGPUISD::INIT_EXEC_FROM_INPUT, DL, MVT::Other, Chain,
6255                        Op.getOperand(2), Op.getOperand(3));
6256   }
6257   case Intrinsic::amdgcn_s_barrier: {
6258     if (getTargetMachine().getOptLevel() > CodeGenOpt::None) {
6259       const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6260       unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
6261       if (WGSize <= ST.getWavefrontSize())
6262         return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
6263                                           Op.getOperand(0)), 0);
6264     }
6265     return SDValue();
  }
6267   case Intrinsic::amdgcn_tbuffer_store: {
6268     SDValue VData = Op.getOperand(2);
6269     bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
6270     if (IsD16)
6271       VData = handleD16VData(VData, DAG);
6272     unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
6273     unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
6274     unsigned Glc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue();
6275     unsigned Slc = cast<ConstantSDNode>(Op.getOperand(11))->getZExtValue();
6276     unsigned IdxEn = 1;
6277     if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
6278       IdxEn = Idx->getZExtValue() != 0;
6279     SDValue Ops[] = {
6280       Chain,
6281       VData,             // vdata
6282       Op.getOperand(3),  // rsrc
6283       Op.getOperand(4),  // vindex
6284       Op.getOperand(5),  // voffset
6285       Op.getOperand(6),  // soffset
6286       Op.getOperand(7),  // offset
6287       DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
6288       DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
      DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
6290     };
6291     unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
6292                            AMDGPUISD::TBUFFER_STORE_FORMAT;
6293     MemSDNode *M = cast<MemSDNode>(Op);
6294     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
6295                                    M->getMemoryVT(), M->getMemOperand());
6296   }
6297 
6298   case Intrinsic::amdgcn_struct_tbuffer_store: {
6299     SDValue VData = Op.getOperand(2);
6300     bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
6301     if (IsD16)
6302       VData = handleD16VData(VData, DAG);
6303     auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
6304     SDValue Ops[] = {
6305       Chain,
6306       VData,             // vdata
6307       Op.getOperand(3),  // rsrc
6308       Op.getOperand(4),  // vindex
6309       Offsets.first,     // voffset
6310       Op.getOperand(6),  // soffset
6311       Offsets.second,    // offset
6312       Op.getOperand(7),  // format
6313       Op.getOperand(8),  // cachepolicy
      DAG.getConstant(1, DL, MVT::i1), // idxen
6315     };
6316     unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
6317                            AMDGPUISD::TBUFFER_STORE_FORMAT;
6318     MemSDNode *M = cast<MemSDNode>(Op);
6319     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
6320                                    M->getMemoryVT(), M->getMemOperand());
6321   }
6322 
6323   case Intrinsic::amdgcn_raw_tbuffer_store: {
6324     SDValue VData = Op.getOperand(2);
6325     bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
6326     if (IsD16)
6327       VData = handleD16VData(VData, DAG);
6328     auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
6329     SDValue Ops[] = {
6330       Chain,
6331       VData,             // vdata
6332       Op.getOperand(3),  // rsrc
6333       DAG.getConstant(0, DL, MVT::i32), // vindex
6334       Offsets.first,     // voffset
6335       Op.getOperand(5),  // soffset
6336       Offsets.second,    // offset
6337       Op.getOperand(6),  // format
6338       Op.getOperand(7),  // cachepolicy
      DAG.getConstant(0, DL, MVT::i1), // idxen
6340     };
6341     unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
6342                            AMDGPUISD::TBUFFER_STORE_FORMAT;
6343     MemSDNode *M = cast<MemSDNode>(Op);
6344     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
6345                                    M->getMemoryVT(), M->getMemOperand());
6346   }
6347 
6348   case Intrinsic::amdgcn_buffer_store:
6349   case Intrinsic::amdgcn_buffer_store_format: {
6350     SDValue VData = Op.getOperand(2);
6351     bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
6352     if (IsD16)
6353       VData = handleD16VData(VData, DAG);
6354     unsigned Glc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
6355     unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
6356     unsigned IdxEn = 1;
6357     if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4)))
6358       IdxEn = Idx->getZExtValue() != 0;
6359     SDValue Ops[] = {
6360       Chain,
6361       VData,
6362       Op.getOperand(3), // rsrc
6363       Op.getOperand(4), // vindex
6364       SDValue(), // voffset -- will be set by setBufferOffsets
6365       SDValue(), // soffset -- will be set by setBufferOffsets
6366       SDValue(), // offset -- will be set by setBufferOffsets
6367       DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
6368       DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
6369     };
6370     setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
6371     unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
6372                    AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
6373     Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
6374     MemSDNode *M = cast<MemSDNode>(Op);
6375 
6376     // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
6377     EVT VDataType = VData.getValueType().getScalarType();
6378     if (VDataType == MVT::i8 || VDataType == MVT::i16)
6379       return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
6380 
6381     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
6382                                    M->getMemoryVT(), M->getMemOperand());
6383   }
6384 
6385   case Intrinsic::amdgcn_raw_buffer_store:
6386   case Intrinsic::amdgcn_raw_buffer_store_format: {
6387     SDValue VData = Op.getOperand(2);
6388     bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
6389     if (IsD16)
6390       VData = handleD16VData(VData, DAG);
6391     auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
6392     SDValue Ops[] = {
6393       Chain,
6394       VData,
6395       Op.getOperand(3), // rsrc
6396       DAG.getConstant(0, DL, MVT::i32), // vindex
6397       Offsets.first,    // voffset
6398       Op.getOperand(5), // soffset
6399       Offsets.second,   // offset
6400       Op.getOperand(6), // cachepolicy
6401       DAG.getConstant(0, DL, MVT::i1), // idxen
6402     };
6403     unsigned Opc = IntrinsicID == Intrinsic::amdgcn_raw_buffer_store ?
6404                    AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
6405     Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
6406     MemSDNode *M = cast<MemSDNode>(Op);
6407 
6408     // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
6409     EVT VDataType = VData.getValueType().getScalarType();
6410     if (VDataType == MVT::i8 || VDataType == MVT::i16)
6411       return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
6412 
6413     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
6414                                    M->getMemoryVT(), M->getMemOperand());
6415   }
6416 
6417   case Intrinsic::amdgcn_struct_buffer_store:
6418   case Intrinsic::amdgcn_struct_buffer_store_format: {
6419     SDValue VData = Op.getOperand(2);
6420     bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
6421     if (IsD16)
6422       VData = handleD16VData(VData, DAG);
6423     auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
6424     SDValue Ops[] = {
6425       Chain,
6426       VData,
6427       Op.getOperand(3), // rsrc
6428       Op.getOperand(4), // vindex
6429       Offsets.first,    // voffset
6430       Op.getOperand(6), // soffset
6431       Offsets.second,   // offset
6432       Op.getOperand(7), // cachepolicy
6433       DAG.getConstant(1, DL, MVT::i1), // idxen
6434     };
6435     unsigned Opc = IntrinsicID == Intrinsic::amdgcn_struct_buffer_store ?
6436                    AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
6437     Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
6438     MemSDNode *M = cast<MemSDNode>(Op);
6439 
6440     // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
6441     EVT VDataType = VData.getValueType().getScalarType();
6442     if (VDataType == MVT::i8 || VDataType == MVT::i16)
6443       return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
6444 
6445     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
6446                                    M->getMemoryVT(), M->getMemOperand());
6447   }
6448 
6449   default: {
6450     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
6451             AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
6452       return lowerImage(Op, ImageDimIntr, DAG);
6453 
6454     return Op;
6455   }
6456   }
6457 }
6458 
6459 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
6460 // offset (the offset that is included in bounds checking and swizzling, to be
6461 // split between the instruction's voffset and immoffset fields) and soffset
6462 // (the offset that is excluded from bounds checking and swizzling, to go in
6463 // the instruction's soffset field).  This function takes the first kind of
6464 // offset and figures out how to split it between voffset and immoffset.
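// As an illustrative example (values chosen arbitrarily): a combined constant
// offset of 4148 is returned as voffset 4096 and immoffset 52, and a combined
// offset of (x + 4148) becomes voffset (x + 4096) and immoffset 52, keeping
// the voffset addend a multiple of 4096.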
6465 std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
6466     SDValue Offset, SelectionDAG &DAG) const {
6467   SDLoc DL(Offset);
6468   const unsigned MaxImm = 4095;
6469   SDValue N0 = Offset;
6470   ConstantSDNode *C1 = nullptr;
6471 
6472   if ((C1 = dyn_cast<ConstantSDNode>(N0)))
6473     N0 = SDValue();
6474   else if (DAG.isBaseWithConstantOffset(N0)) {
6475     C1 = cast<ConstantSDNode>(N0.getOperand(1));
6476     N0 = N0.getOperand(0);
6477   }
6478 
6479   if (C1) {
6480     unsigned ImmOffset = C1->getZExtValue();
    // If the immediate value is too big for the immoffset field, keep only its
    // low 12 bits (value & 4095) in the immoffset field, so that the value
    // that is copied/added for the voffset field is a multiple of 4096 and
    // stands more chance of being CSEd with the copy/add for another similar
    // load/store.
    // However, do not round down to a multiple of 4096 if that multiple is a
    // negative number, as it appears to be illegal to have a negative offset
    // in the vgpr, even if adding the immediate offset makes it positive.
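    // For example (illustrative only): an offset of 0x80000010 rounds down to
    // 0x80000000, which is negative as an i32, so the whole value goes into
    // the voffset add and the immoffset field is left at zero.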
6488     unsigned Overflow = ImmOffset & ~MaxImm;
6489     ImmOffset -= Overflow;
6490     if ((int32_t)Overflow < 0) {
6491       Overflow += ImmOffset;
6492       ImmOffset = 0;
6493     }
6494     C1 = cast<ConstantSDNode>(DAG.getConstant(ImmOffset, DL, MVT::i32));
6495     if (Overflow) {
6496       auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
6497       if (!N0)
6498         N0 = OverflowVal;
6499       else {
6500         SDValue Ops[] = { N0, OverflowVal };
6501         N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
6502       }
6503     }
6504   }
6505   if (!N0)
6506     N0 = DAG.getConstant(0, DL, MVT::i32);
6507   if (!C1)
6508     C1 = cast<ConstantSDNode>(DAG.getConstant(0, DL, MVT::i32));
6509   return {N0, SDValue(C1, 0)};
6510 }
6511 
6512 // Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
6513 // three offsets (voffset, soffset and instoffset) into the SDValue[3] array
6514 // pointed to by Offsets.
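// If neither the whole combined offset nor its constant addend can be split by
// AMDGPU::splitMUBUFOffset, the entire value is placed in voffset and the
// soffset and instoffset entries are set to zero.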
6515 void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
6516                                         SelectionDAG &DAG, SDValue *Offsets,
6517                                         unsigned Align) const {
6518   SDLoc DL(CombinedOffset);
6519   if (auto C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
6520     uint32_t Imm = C->getZExtValue();
6521     uint32_t SOffset, ImmOffset;
6522     if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget, Align)) {
6523       Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
6524       Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
6525       Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
6526       return;
6527     }
6528   }
6529   if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
6530     SDValue N0 = CombinedOffset.getOperand(0);
6531     SDValue N1 = CombinedOffset.getOperand(1);
6532     uint32_t SOffset, ImmOffset;
6533     int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
6534     if (Offset >= 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
6535                                                 Subtarget, Align)) {
6536       Offsets[0] = N0;
6537       Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
6538       Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
6539       return;
6540     }
6541   }
6542   Offsets[0] = CombinedOffset;
6543   Offsets[1] = DAG.getConstant(0, DL, MVT::i32);
6544   Offsets[2] = DAG.getConstant(0, DL, MVT::i32);
6545 }
6546 
6547 // Handle 8 bit and 16 bit buffer loads
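// The load is emitted as an i32 BUFFER_LOAD_UBYTE/BUFFER_LOAD_USHORT and the
// result is truncated back to the requested 8-bit or 16-bit type, with the
// chain value merged back in.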
6548 SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
6549                                                      EVT LoadVT, SDLoc DL,
6550                                                      ArrayRef<SDValue> Ops,
6551                                                      MemSDNode *M) const {
6552   EVT IntVT = LoadVT.changeTypeToInteger();
6553   unsigned Opc = (LoadVT.getScalarType() == MVT::i8) ?
6554          AMDGPUISD::BUFFER_LOAD_UBYTE : AMDGPUISD::BUFFER_LOAD_USHORT;
6555 
6556   SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
6557   SDValue BufferLoad = DAG.getMemIntrinsicNode(Opc, DL, ResList,
6558                                                Ops, IntVT,
6559                                                M->getMemOperand());
6560   SDValue BufferLoadTrunc = DAG.getNode(ISD::TRUNCATE, DL,
6561                                         LoadVT.getScalarType(), BufferLoad);
6562   return DAG.getMergeValues({BufferLoadTrunc, BufferLoad.getValue(1)}, DL);
6563 }
6564 
6565 // Handle 8 bit and 16 bit buffer stores
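// The 8-bit or 16-bit data operand is any-extended to i32 and stored with
// BUFFER_STORE_BYTE/BUFFER_STORE_SHORT using the first nine operands.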
6566 SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
6567                                                       EVT VDataType, SDLoc DL,
6568                                                       SDValue Ops[],
6569                                                       MemSDNode *M) const {
6570   SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
6571   Ops[1] = BufferStoreExt;
6572   unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE :
6573                                  AMDGPUISD::BUFFER_STORE_SHORT;
6574   ArrayRef<SDValue> OpsRef = makeArrayRef(&Ops[0], 9);
6575   return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
6576                                      M->getMemOperand());
6577 }
6578 
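// Truncate Op if it is wider than the requested type; otherwise extend it
// according to the load extension type (sign, zero or any extend), or return
// it unchanged for a non-extending load.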
6579 static SDValue getLoadExtOrTrunc(SelectionDAG &DAG,
6580                                  ISD::LoadExtType ExtType, SDValue Op,
6581                                  const SDLoc &SL, EVT VT) {
6582   if (VT.bitsLT(Op.getValueType()))
6583     return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
6584 
6585   switch (ExtType) {
6586   case ISD::SEXTLOAD:
6587     return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
6588   case ISD::ZEXTLOAD:
6589     return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
6590   case ISD::EXTLOAD:
6591     return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
6592   case ISD::NON_EXTLOAD:
6593     return Op;
6594   }
6595 
6596   llvm_unreachable("invalid ext type");
6597 }
6598 
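// DAG combine helper: widen a uniform sub-dword load from the constant (or
// invariant global) address space into a full 32-bit load, then truncate or
// extend the loaded value back to the original type.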
6599 SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const {
6600   SelectionDAG &DAG = DCI.DAG;
6601   if (Ld->getAlignment() < 4 || Ld->isDivergent())
6602     return SDValue();
6603 
6604   // FIXME: Constant loads should all be marked invariant.
6605   unsigned AS = Ld->getAddressSpace();
6606   if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
6607       AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
6608       (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
6609     return SDValue();
6610 
6611   // Don't do this early, since it may interfere with adjacent load merging for
6612   // illegal types. We can avoid losing alignment information for exotic types
6613   // pre-legalize.
6614   EVT MemVT = Ld->getMemoryVT();
6615   if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
6616       MemVT.getSizeInBits() >= 32)
6617     return SDValue();
6618 
6619   SDLoc SL(Ld);
6620 
6621   assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
6622          "unexpected vector extload");
6623 
6624   // TODO: Drop only high part of range.
6625   SDValue Ptr = Ld->getBasePtr();
6626   SDValue NewLoad = DAG.getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD,
6627                                 MVT::i32, SL, Ld->getChain(), Ptr,
6628                                 Ld->getOffset(),
6629                                 Ld->getPointerInfo(), MVT::i32,
6630                                 Ld->getAlignment(),
6631                                 Ld->getMemOperand()->getFlags(),
6632                                 Ld->getAAInfo(),
6633                                 nullptr); // Drop ranges
6634 
6635   EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
6636   if (MemVT.isFloatingPoint()) {
6637     assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
6638            "unexpected fp extload");
6639     TruncVT = MemVT.changeTypeToInteger();
6640   }
6641 
6642   SDValue Cvt = NewLoad;
6643   if (Ld->getExtensionType() == ISD::SEXTLOAD) {
6644     Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
6645                       DAG.getValueType(TruncVT));
6646   } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
6647              Ld->getExtensionType() == ISD::NON_EXTLOAD) {
6648     Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
6649   } else {
6650     assert(Ld->getExtensionType() == ISD::EXTLOAD);
6651   }
6652 
6653   EVT VT = Ld->getValueType(0);
6654   EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
6655 
6656   DCI.AddToWorklist(Cvt.getNode());
6657 
6658   // We may need to handle exotic cases, such as i16->i64 extloads, so insert
6659   // the appropriate extension from the 32-bit load.
6660   Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
6661   DCI.AddToWorklist(Cvt.getNode());
6662 
6663   // Handle conversion back to floating point if necessary.
6664   Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
6665 
6666   return DAG.getMergeValues({ Cvt, NewLoad.getValue(1) }, SL);
6667 }
6668 
6669 SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
6670   SDLoc DL(Op);
6671   LoadSDNode *Load = cast<LoadSDNode>(Op);
6672   ISD::LoadExtType ExtType = Load->getExtensionType();
6673   EVT MemVT = Load->getMemoryVT();
6674 
6675   if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
6676     if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
6677       return SDValue();
6678 
6679     // FIXME: Copied from PPC
6680     // First, load into 32 bits, then truncate to 1 bit.
6681 
6682     SDValue Chain = Load->getChain();
6683     SDValue BasePtr = Load->getBasePtr();
6684     MachineMemOperand *MMO = Load->getMemOperand();
6685 
6686     EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
6687 
6688     SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
6689                                    BasePtr, RealMemVT, MMO);
6690 
6691     if (!MemVT.isVector()) {
6692       SDValue Ops[] = {
6693         DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
6694         NewLD.getValue(1)
6695       };
6696 
6697       return DAG.getMergeValues(Ops, DL);
6698     }
6699 
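    // For vector results, each lane is recovered by shifting the widened
    // 32-bit load right by its lane index and truncating to i1.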
6700     SmallVector<SDValue, 3> Elts;
6701     for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
6702       SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
6703                                 DAG.getConstant(I, DL, MVT::i32));
6704 
6705       Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
6706     }
6707 
6708     SDValue Ops[] = {
6709       DAG.getBuildVector(MemVT, DL, Elts),
6710       NewLD.getValue(1)
6711     };
6712 
6713     return DAG.getMergeValues(Ops, DL);
6714   }
6715 
6716   if (!MemVT.isVector())
6717     return SDValue();
6718 
6719   assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
6720          "Custom lowering for non-i32 vectors hasn't been implemented.");
6721 
6722   unsigned Alignment = Load->getAlignment();
6723   unsigned AS = Load->getAddressSpace();
6724   if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
6725                           AS, Alignment)) {
6726     SDValue Ops[2];
6727     std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
6728     return DAG.getMergeValues(Ops, DL);
6729   }
6730   if (Subtarget->hasLDSMisalignedBug() &&
6731       AS == AMDGPUAS::FLAT_ADDRESS &&
6732       Alignment < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
6733     return SplitVectorLoad(Op, DAG);
6734   }
6735 
6736   MachineFunction &MF = DAG.getMachineFunction();
6737   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
6738   // If there is a possibility that flat instructions access scratch memory,
6739   // then we need to use the same legalization rules we use for private.
6740   if (AS == AMDGPUAS::FLAT_ADDRESS)
6741     AS = MFI->hasFlatScratchInit() ?
6742          AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
6743 
6744   unsigned NumElements = MemVT.getVectorNumElements();
6745 
6746   if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
6747       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
6748     if (!Op->isDivergent() && Alignment >= 4 && NumElements < 32) {
6749       if (MemVT.isPow2VectorType())
6750         return SDValue();
6751       if (NumElements == 3)
6752         return WidenVectorLoad(Op, DAG);
6753       return SplitVectorLoad(Op, DAG);
6754     }
6755     // Non-uniform loads will be selected to MUBUF instructions, so they
6756     // have the same legalization requirements as global and private
6757     // loads.
6758     //
6759   }
6760 
6761   if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
6762       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
6763       AS == AMDGPUAS::GLOBAL_ADDRESS) {
6764     if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
6765         !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load) &&
6766         Alignment >= 4 && NumElements < 32) {
6767       if (MemVT.isPow2VectorType())
6768         return SDValue();
6769       if (NumElements == 3)
6770         return WidenVectorLoad(Op, DAG);
6771       return SplitVectorLoad(Op, DAG);
6772     }
6773     // Non-uniform loads will be selected to MUBUF instructions, so they
6774     // have the same legalization requirements as global and private
6775     // loads.
6776     //
6777   }
6778   if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
6779       AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
6780       AS == AMDGPUAS::GLOBAL_ADDRESS ||
6781       AS == AMDGPUAS::FLAT_ADDRESS) {
6782     if (NumElements > 4)
6783       return SplitVectorLoad(Op, DAG);
6784     // v3 loads not supported on SI.
6785     if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
6786       return WidenVectorLoad(Op, DAG);
6787     // v3 and v4 loads are supported for private and global memory.
6788     return SDValue();
6789   }
6790   if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
6791     // Depending on the setting of the private_element_size field in the
6792     // resource descriptor, we can only make private accesses up to a certain
6793     // size.
6794     switch (Subtarget->getMaxPrivateElementSize()) {
6795     case 4:
6796       return scalarizeVectorLoad(Load, DAG);
6797     case 8:
6798       if (NumElements > 2)
6799         return SplitVectorLoad(Op, DAG);
6800       return SDValue();
6801     case 16:
6802       // Same as global/flat
6803       if (NumElements > 4)
6804         return SplitVectorLoad(Op, DAG);
6805       // v3 loads not supported on SI.
6806       if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
6807         return WidenVectorLoad(Op, DAG);
6808       return SDValue();
6809     default:
6810       llvm_unreachable("unsupported private_element_size");
6811     }
6812   } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
6813     // Use ds_read_b128 if possible.
6814     if (Subtarget->useDS128() && Load->getAlignment() >= 16 &&
6815         MemVT.getStoreSize() == 16)
6816       return SDValue();
6817 
6818     if (NumElements > 2)
6819       return SplitVectorLoad(Op, DAG);
6820 
6821     // SI has a hardware bug in the LDS / GDS bounds checking: if the base
6822     // address is negative, then the instruction is incorrectly treated as
6823     // out-of-bounds even if base + offset is in bounds. Split vectorized
6824     // loads here to avoid emitting ds_read2_b32. We may re-combine the
6825     // load later in the SILoadStoreOptimizer.
6826     if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
6827         NumElements == 2 && MemVT.getStoreSize() == 8 &&
6828         Load->getAlignment() < 8) {
6829       return SplitVectorLoad(Op, DAG);
6830     }
6831   }
6832   return SDValue();
6833 }
6834 
6835 SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
6836   EVT VT = Op.getValueType();
6837   assert(VT.getSizeInBits() == 64);
6838 
6839   SDLoc DL(Op);
6840   SDValue Cond = Op.getOperand(0);
6841 
6842   SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
6843   SDValue One = DAG.getConstant(1, DL, MVT::i32);
6844 
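  // Split the 64-bit select into two 32-bit selects: bitcast the operands to
  // v2i32, select the low and high halves separately, then rebuild the result.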
6845   SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
6846   SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
6847 
6848   SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
6849   SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
6850 
6851   SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
6852 
6853   SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
6854   SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
6855 
6856   SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
6857 
6858   SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
6859   return DAG.getNode(ISD::BITCAST, DL, VT, Res);
6860 }
6861 
6862 // Catch division cases where we can use shortcuts with rcp and rsq
6863 // instructions.
6864 SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
6865                                               SelectionDAG &DAG) const {
6866   SDLoc SL(Op);
6867   SDValue LHS = Op.getOperand(0);
6868   SDValue RHS = Op.getOperand(1);
6869   EVT VT = Op.getValueType();
6870   const SDNodeFlags Flags = Op->getFlags();
6871   bool Unsafe = DAG.getTarget().Options.UnsafeFPMath || Flags.hasAllowReciprocal();
6872 
6873   if (!Unsafe && VT == MVT::f32 && Subtarget->hasFP32Denormals())
6874     return SDValue();
6875 
6876   if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
6877     if (Unsafe || VT == MVT::f32 || VT == MVT::f16) {
6878       if (CLHS->isExactlyValue(1.0)) {
6879         // v_rcp_f32 and v_rsq_f32 do not support denormals and, according to
6880         // the CI documentation, have a worst-case error of 1 ulp.
6881         // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
6882         // use it as long as we aren't trying to use denormals.
6883         //
6884         // v_rcp_f16 and v_rsq_f16 DO support denormals.
6885 
6886         // 1.0 / sqrt(x) -> rsq(x)
6887 
6888         // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
6889         // error seems really high at 2^29 ULP.
6890         if (RHS.getOpcode() == ISD::FSQRT)
6891           return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0));
6892 
6893         // 1.0 / x -> rcp(x)
6894         return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
6895       }
6896 
6897       // Same as for 1.0, but expand the sign out of the constant.
6898       if (CLHS->isExactlyValue(-1.0)) {
6899         // -1.0 / x -> rcp (fneg x)
6900         SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
6901         return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
6902       }
6903     }
6904   }
6905 
6906   if (Unsafe) {
6907     // Turn into multiply by the reciprocal.
6908     // x / y -> x * (1.0 / y)
6909     SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
6910     return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
6911   }
6912 
6913   return SDValue();
6914 }
6915 
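// Emit a floating-point binary operation. If GlueChain carries chain and glue
// results, use the chained AMDGPU variant (e.g. FMUL_W_CHAIN) so the operation
// stays ordered relative to the surrounding mode-changing nodes.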
6916 static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
6917                           EVT VT, SDValue A, SDValue B, SDValue GlueChain) {
6918   if (GlueChain->getNumValues() <= 1) {
6919     return DAG.getNode(Opcode, SL, VT, A, B);
6920   }
6921 
6922   assert(GlueChain->getNumValues() == 3);
6923 
6924   SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
6925   switch (Opcode) {
6926   default: llvm_unreachable("no chain equivalent for opcode");
6927   case ISD::FMUL:
6928     Opcode = AMDGPUISD::FMUL_W_CHAIN;
6929     break;
6930   }
6931 
6932   return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B,
6933                      GlueChain.getValue(2));
6934 }
6935 
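// Same as getFPBinOp, but for ternary operations: FMA is rewritten to
// FMA_W_CHAIN when the chain and glue must be threaded through.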
6936 static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
6937                            EVT VT, SDValue A, SDValue B, SDValue C,
6938                            SDValue GlueChain) {
6939   if (GlueChain->getNumValues() <= 1) {
6940     return DAG.getNode(Opcode, SL, VT, A, B, C);
6941   }
6942 
6943   assert(GlueChain->getNumValues() == 3);
6944 
6945   SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
6946   switch (Opcode) {
6947   default: llvm_unreachable("no chain equivalent for opcode");
6948   case ISD::FMA:
6949     Opcode = AMDGPUISD::FMA_W_CHAIN;
6950     break;
6951   }
6952 
6953   return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B, C,
6954                      GlueChain.getValue(2));
6955 }
6956 
6957 SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
6958   if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
6959     return FastLowered;
6960 
6961   SDLoc SL(Op);
6962   SDValue Src0 = Op.getOperand(0);
6963   SDValue Src1 = Op.getOperand(1);
6964 
6965   SDValue CvtSrc0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
6966   SDValue CvtSrc1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
6967 
6968   SDValue RcpSrc1 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, CvtSrc1);
6969   SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1);
6970 
6971   SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32);
6972   SDValue BestQuot = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag);
6973 
6974   return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0);
6975 }
6976 
6977 // Faster 2.5 ULP division that does not support denormals.
6978 SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
6979   SDLoc SL(Op);
6980   SDValue LHS = Op.getOperand(1);
6981   SDValue RHS = Op.getOperand(2);
6982 
6983   SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);
6984 
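  // K0 = 0x6f800000 = 2^96 and K1 = 0x2f800000 = 2^-32. If |RHS| exceeds 2^96,
  // pre-scale it by 2^-32 so the reciprocal does not underflow into denormals,
  // then fold the same scale factor back into the final product.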
6985   const APFloat K0Val(BitsToFloat(0x6f800000));
6986   const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
6987 
6988   const APFloat K1Val(BitsToFloat(0x2f800000));
6989   const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
6990 
6991   const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
6992 
6993   EVT SetCCVT =
6994     getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
6995 
6996   SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
6997 
6998   SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
6999 
7000   // TODO: Should this propagate fast-math-flags?
7001   r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
7002 
7003   // rcp does not support denormals.
7004   SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
7005 
7006   SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);
7007 
7008   return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
7009 }
7010 
7011 SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
7012   if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
7013     return FastLowered;
7014 
7015   SDLoc SL(Op);
7016   SDValue LHS = Op.getOperand(0);
7017   SDValue RHS = Op.getOperand(1);
7018 
7019   const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
7020 
7021   SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
7022 
7023   SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
7024                                           RHS, RHS, LHS);
7025   SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
7026                                         LHS, RHS, LHS);
7027 
7028   // Denominator is scaled to not be denormal, so using rcp is ok.
7029   SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32,
7030                                   DenominatorScaled);
7031   SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32,
7032                                      DenominatorScaled);
7033 
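  // Encoding of the hwreg immediate used by the SETREG nodes below: the MODE
  // register, bit offset 4, width 2, i.e. the FP32 denormal control field.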
7034   const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE |
7035                                (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
7036                                (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
7037 
7038   const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i16);
7039 
7040   if (!Subtarget->hasFP32Denormals()) {
7041     SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
7042     const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
7043                                                       SL, MVT::i32);
7044     SDValue EnableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, BindParamVTs,
7045                                        DAG.getEntryNode(),
7046                                        EnableDenormValue, BitField);
7047     SDValue Ops[3] = {
7048       NegDivScale0,
7049       EnableDenorm.getValue(0),
7050       EnableDenorm.getValue(1)
7051     };
7052 
7053     NegDivScale0 = DAG.getMergeValues(Ops, SL);
7054   }
7055 
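  // Iteratively refine the reciprocal and the quotient with FMAs; the final
  // residual (Fma4) together with Fma1 and Fma3 feeds DIV_FMAS and DIV_FIXUP.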
7056   SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
7057                              ApproxRcp, One, NegDivScale0);
7058 
7059   SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
7060                              ApproxRcp, Fma0);
7061 
7062   SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled,
7063                            Fma1, Fma1);
7064 
7065   SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
7066                              NumeratorScaled, Mul);
7067 
7068   SDValue Fma3 = getFPTernOp(DAG, ISD::FMA,SL, MVT::f32, Fma2, Fma1, Mul, Fma2);
7069 
7070   SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
7071                              NumeratorScaled, Fma3);
7072 
7073   if (!Subtarget->hasFP32Denormals()) {
7074     const SDValue DisableDenormValue =
7075         DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
7076     SDValue DisableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other,
7077                                         Fma4.getValue(1),
7078                                         DisableDenormValue,
7079                                         BitField,
7080                                         Fma4.getValue(2));
7081 
7082     SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
7083                                       DisableDenorm, DAG.getRoot());
7084     DAG.setRoot(OutputChain);
7085   }
7086 
7087   SDValue Scale = NumeratorScaled.getValue(1);
7088   SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
7089                              Fma4, Fma1, Fma3, Scale);
7090 
7091   return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS);
7092 }
7093 
7094 SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
7095   if (DAG.getTarget().Options.UnsafeFPMath)
7096     return lowerFastUnsafeFDIV(Op, DAG);
7097 
7098   SDLoc SL(Op);
7099   SDValue X = Op.getOperand(0);
7100   SDValue Y = Op.getOperand(1);
7101 
7102   const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
7103 
7104   SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
7105 
7106   SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
7107 
7108   SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
7109 
7110   SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
7111 
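  // Refine the reciprocal estimate with two FMA-based correction steps, then
  // form the scaled quotient and its residual for DIV_FMAS and DIV_FIXUP.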
7112   SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
7113 
7114   SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
7115 
7116   SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
7117 
7118   SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
7119 
7120   SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
7121   SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
7122 
7123   SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64,
7124                              NegDivScale0, Mul, DivScale1);
7125 
7126   SDValue Scale;
7127 
7128   if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
7129     // Work around a hardware bug on SI where the condition output from
7130     // div_scale is not usable.
7131 
7132     const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
7133 
7134     // Figure out which scale to use for div_fmas.
7135     SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
7136     SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
7137     SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
7138     SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
7139 
7140     SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
7141     SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
7142 
7143     SDValue Scale0Hi
7144       = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
7145     SDValue Scale1Hi
7146       = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
7147 
7148     SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
7149     SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
7150     Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
7151   } else {
7152     Scale = DivScale1.getValue(1);
7153   }
7154 
7155   SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64,
7156                              Fma4, Fma3, Mul, Scale);
7157 
7158   return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
7159 }
7160 
7161 SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
7162   EVT VT = Op.getValueType();
7163 
7164   if (VT == MVT::f32)
7165     return LowerFDIV32(Op, DAG);
7166 
7167   if (VT == MVT::f64)
7168     return LowerFDIV64(Op, DAG);
7169 
7170   if (VT == MVT::f16)
7171     return LowerFDIV16(Op, DAG);
7172 
7173   llvm_unreachable("Unexpected type for fdiv");
7174 }
7175 
7176 SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
7177   SDLoc DL(Op);
7178   StoreSDNode *Store = cast<StoreSDNode>(Op);
7179   EVT VT = Store->getMemoryVT();
7180 
7181   if (VT == MVT::i1) {
7182     return DAG.getTruncStore(Store->getChain(), DL,
7183        DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
7184        Store->getBasePtr(), MVT::i1, Store->getMemOperand());
7185   }
7186 
7187   assert(VT.isVector() &&
7188          Store->getValue().getValueType().getScalarType() == MVT::i32);
7189 
7190   unsigned AS = Store->getAddressSpace();
7191   if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
7192                           AS, Store->getAlignment())) {
7193     return expandUnalignedStore(Store, DAG);
7194   }
7195 
7196   if (Subtarget->hasLDSMisalignedBug() &&
7197       AS == AMDGPUAS::FLAT_ADDRESS &&
7198       Store->getAlignment() < VT.getStoreSize() && VT.getSizeInBits() > 32) {
7199     return SplitVectorStore(Op, DAG);
7200   }
7201 
7202   MachineFunction &MF = DAG.getMachineFunction();
7203   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
7204   // If there is a possibility that flat instructions access scratch memory,
7205   // then we need to use the same legalization rules we use for private.
7206   if (AS == AMDGPUAS::FLAT_ADDRESS)
7207     AS = MFI->hasFlatScratchInit() ?
7208          AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
7209 
7210   unsigned NumElements = VT.getVectorNumElements();
7211   if (AS == AMDGPUAS::GLOBAL_ADDRESS ||
7212       AS == AMDGPUAS::FLAT_ADDRESS) {
7213     if (NumElements > 4)
7214       return SplitVectorStore(Op, DAG);
7215     // v3 stores not supported on SI.
7216     if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
7217       return SplitVectorStore(Op, DAG);
7218     return SDValue();
7219   } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
7220     switch (Subtarget->getMaxPrivateElementSize()) {
7221     case 4:
7222       return scalarizeVectorStore(Store, DAG);
7223     case 8:
7224       if (NumElements > 2)
7225         return SplitVectorStore(Op, DAG);
7226       return SDValue();
7227     case 16:
7228       if (NumElements > 4 || NumElements == 3)
7229         return SplitVectorStore(Op, DAG);
7230       return SDValue();
7231     default:
7232       llvm_unreachable("unsupported private_element_size");
7233     }
7234   } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
7235     // Use ds_write_b128 if possible.
7236     if (Subtarget->useDS128() && Store->getAlignment() >= 16 &&
7237         VT.getStoreSize() == 16 && NumElements != 3)
7238       return SDValue();
7239 
7240     if (NumElements > 2)
7241       return SplitVectorStore(Op, DAG);
7242 
7243     // SI has a hardware bug in the LDS / GDS bounds checking: if the base
7244     // address is negative, then the instruction is incorrectly treated as
7245     // out-of-bounds even if base + offset is in bounds. Split vectorized
7246     // stores here to avoid emitting ds_write2_b32. We may re-combine the
7247     // store later in the SILoadStoreOptimizer.
7248     if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
7249         NumElements == 2 && VT.getStoreSize() == 8 &&
7250         Store->getAlignment() < 8) {
7251       return SplitVectorStore(Op, DAG);
7252     }
7253 
7254     return SDValue();
7255   } else {
7256     llvm_unreachable("unhandled address space");
7257   }
7258 }
7259 
7260 SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
7261   SDLoc DL(Op);
7262   EVT VT = Op.getValueType();
7263   SDValue Arg = Op.getOperand(0);
7264   SDValue TrigVal;
7265 
7266   // TODO: Should this propagate fast-math-flags?
7267 
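  // The hardware sin/cos expect an argument scaled by 1/(2*pi); on subtargets
  // with a reduced trig input range the scaled value is additionally wrapped
  // into [0, 1) with FRACT.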
7268   SDValue OneOver2Pi = DAG.getConstantFP(0.5 / M_PI, DL, VT);
7269 
7270   if (Subtarget->hasTrigReducedRange()) {
7271     SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi);
7272     TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal);
7273   } else {
7274     TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi);
7275   }
7276 
7277   switch (Op.getOpcode()) {
7278   case ISD::FCOS:
7279     return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal);
7280   case ISD::FSIN:
7281     return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal);
7282   default:
7283     llvm_unreachable("Wrong trig opcode");
7284   }
7285 }
7286 
7287 SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
7288   AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
7289   assert(AtomicNode->isCompareAndSwap());
7290   unsigned AS = AtomicNode->getAddressSpace();
7291 
7292   // No custom lowering required for local address space
7293   if (!isFlatGlobalAddrSpace(AS))
7294     return Op;
7295 
7296   // Non-local address spaces require custom lowering for atomic compare
7297   // and swap; cmp and swap values are packed into a v2i32 (v2i64 for _X2).
7298   SDLoc DL(Op);
7299   SDValue ChainIn = Op.getOperand(0);
7300   SDValue Addr = Op.getOperand(1);
7301   SDValue Old = Op.getOperand(2);
7302   SDValue New = Op.getOperand(3);
7303   EVT VT = Op.getValueType();
7304   MVT SimpleVT = VT.getSimpleVT();
7305   MVT VecType = MVT::getVectorVT(SimpleVT, 2);
7306 
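  // Pack the new and old values into a single vector operand, as expected by
  // the target ATOMIC_CMP_SWAP node.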
7307   SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
7308   SDValue Ops[] = { ChainIn, Addr, NewOld };
7309 
7310   return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, Op->getVTList(),
7311                                  Ops, VT, AtomicNode->getMemOperand());
7312 }
7313 
7314 //===----------------------------------------------------------------------===//
7315 // Custom DAG optimizations
7316 //===----------------------------------------------------------------------===//
7317 
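// Fold a conversion to f32 whose i32 source is known to have only its low
// byte set into CVT_F32_UBYTE0, which converts that byte directly.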
7318 SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
7319                                                      DAGCombinerInfo &DCI) const {
7320   EVT VT = N->getValueType(0);
7321   EVT ScalarVT = VT.getScalarType();
7322   if (ScalarVT != MVT::f32)
7323     return SDValue();
7324 
7325   SelectionDAG &DAG = DCI.DAG;
7326   SDLoc DL(N);
7327 
7328   SDValue Src = N->getOperand(0);
7329   EVT SrcVT = Src.getValueType();
7330 
7331   // TODO: We could try to match extracting the higher bytes, which would be
7332   // easier if i8 vectors weren't promoted to i32 vectors, particularly after
7333   // types are legalized. v4i8 -> v4f32 is probably the only case to worry
7334   // about in practice.
7335   if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
7336     if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
7337       SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src);
7338       DCI.AddToWorklist(Cvt.getNode());
7339       return Cvt;
7340     }
7341   }
7342 
7343   return SDValue();
7344 }
7345 
7346 // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
7347 
7348 // This is a variant of
7349 // (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
7350 //
7351 // The normal DAG combiner will do this, but only if the add has one use since
7352 // that would increase the number of instructions.
7353 //
7354 // This prevents us from seeing a constant offset that can be folded into a
7355 // memory instruction's addressing mode. If we know the resulting add offset of
7356 // a pointer can be folded into an addressing offset, we can replace the pointer
7357 // operand with the add of new constant offset. This eliminates one of the uses,
7358 // and may allow the remaining use to also be simplified.
7359 //
7360 SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
7361                                                unsigned AddrSpace,
7362                                                EVT MemVT,
7363                                                DAGCombinerInfo &DCI) const {
7364   SDValue N0 = N->getOperand(0);
7365   SDValue N1 = N->getOperand(1);
7366 
7367   // We only do this when the add has multiple uses, since that is the case
7368   // where it is profitable; otherwise defer to the standard combine.
7369   if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
7370       N0->hasOneUse())
7371     return SDValue();
7372 
7373   const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
7374   if (!CN1)
7375     return SDValue();
7376 
7377   const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
7378   if (!CAdd)
7379     return SDValue();
7380 
7381   // If the resulting offset is too large, we can't fold it into the addressing
7382   // mode offset.
7383   APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
7384   Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
7385 
7386   AddrMode AM;
7387   AM.HasBaseReg = true;
7388   AM.BaseOffs = Offset.getSExtValue();
7389   if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
7390     return SDValue();
7391 
7392   SelectionDAG &DAG = DCI.DAG;
7393   SDLoc SL(N);
7394   EVT VT = N->getValueType(0);
7395 
7396   SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
7397   SDValue COffset = DAG.getConstant(Offset, SL, MVT::i32);
7398 
7399   SDNodeFlags Flags;
7400   Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap() &&
7401                           (N0.getOpcode() == ISD::OR ||
7402                            N0->getFlags().hasNoUnsignedWrap()));
7403 
7404   return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
7405 }
7406 
7407 SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
7408                                                   DAGCombinerInfo &DCI) const {
7409   SDValue Ptr = N->getBasePtr();
7410   SelectionDAG &DAG = DCI.DAG;
7411   SDLoc SL(N);
7412 
7413   // TODO: We could also do this for multiplies.
7414   if (Ptr.getOpcode() == ISD::SHL) {
7415     SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
7416                                           N->getMemoryVT(), DCI);
7417     if (NewPtr) {
7418       SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());
7419 
7420       NewOps[N->getOpcode() == ISD::STORE ? 2 : 1] = NewPtr;
7421       return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
7422     }
7423   }
7424 
7425   return SDValue();
7426 }
7427 
7428 static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
7429   return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
7430          (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
7431          (Opc == ISD::XOR && Val == 0);
7432 }
7433 
7434 // Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
7435 // will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
7436 // integer combine opportunities since most 64-bit operations are decomposed
7437 // this way.  TODO: We won't want this for SALU especially if it is an inline
7438 // immediate.
7439 SDValue SITargetLowering::splitBinaryBitConstantOp(
7440   DAGCombinerInfo &DCI,
7441   const SDLoc &SL,
7442   unsigned Opc, SDValue LHS,
7443   const ConstantSDNode *CRHS) const {
7444   uint64_t Val = CRHS->getZExtValue();
7445   uint32_t ValLo = Lo_32(Val);
7446   uint32_t ValHi = Hi_32(Val);
7447   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
7448 
7449   if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
7450        bitOpWithConstantIsReducible(Opc, ValHi)) ||
7451       (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
7452     // If we need to materialize a 64-bit immediate, it will be split up later
7453     // anyway. Avoid creating the harder to understand 64-bit immediate
7454     // materialization.
7455     return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
7456   }
7457 
7458   return SDValue();
7459 }
7460 
7461 // Returns true if the argument is a boolean value that is not serialized into
7462 // memory or an argument, and does not require v_cndmask_b32 to be deserialized.
7463 static bool isBoolSGPR(SDValue V) {
7464   if (V.getValueType() != MVT::i1)
7465     return false;
7466   switch (V.getOpcode()) {
7467   default: break;
7468   case ISD::SETCC:
7469   case ISD::AND:
7470   case ISD::OR:
7471   case ISD::XOR:
7472   case AMDGPUISD::FP_CLASS:
7473     return true;
7474   }
7475   return false;
7476 }
7477 
7478 // If a constant has all zeroes or all ones within each byte return it.
7479 // Otherwise return 0.
7480 static uint32_t getConstantPermuteMask(uint32_t C) {
7481   // 0xff for any zero byte in the mask
7482   uint32_t ZeroByteMask = 0;
7483   if (!(C & 0x000000ff)) ZeroByteMask |= 0x000000ff;
7484   if (!(C & 0x0000ff00)) ZeroByteMask |= 0x0000ff00;
7485   if (!(C & 0x00ff0000)) ZeroByteMask |= 0x00ff0000;
7486   if (!(C & 0xff000000)) ZeroByteMask |= 0xff000000;
7487   uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
7488   if ((NonZeroByteMask & C) != NonZeroByteMask)
7489     return 0; // Partial bytes selected.
7490   return C;
7491 }
7492 
7493 // Check if a node selects whole bytes from its operand 0 starting at a byte
7494 // boundary while masking the rest. Returns the select mask as used by
7495 // v_perm_b32, or all-ones (~0) if it does not match.
7496 // Note byte select encoding:
7497 // value 0-3 selects corresponding source byte;
7498 // value 0xc selects zero;
7499 // value 0xff selects 0xff.
7500 static uint32_t getPermuteMask(SelectionDAG &DAG, SDValue V) {
7501   assert(V.getValueSizeInBits() == 32);
7502 
7503   if (V.getNumOperands() != 2)
7504     return ~0;
7505 
7506   ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
7507   if (!N1)
7508     return ~0;
7509 
7510   uint32_t C = N1->getZExtValue();
7511 
7512   switch (V.getOpcode()) {
7513   default:
7514     break;
7515   case ISD::AND:
7516     if (uint32_t ConstMask = getConstantPermuteMask(C)) {
7517       return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
7518     }
7519     break;
7520 
7521   case ISD::OR:
7522     if (uint32_t ConstMask = getConstantPermuteMask(C)) {
7523       return (0x03020100 & ~ConstMask) | ConstMask;
7524     }
7525     break;
7526 
7527   case ISD::SHL:
7528     if (C % 8)
7529       return ~0;
7530 
7531     return uint32_t((0x030201000c0c0c0cull << C) >> 32);
7532 
7533   case ISD::SRL:
7534     if (C % 8)
7535       return ~0;
7536 
7537     return uint32_t(0x0c0c0c0c03020100ull >> C);
7538   }
7539 
7540   return ~0;
7541 }
7542 
7543 SDValue SITargetLowering::performAndCombine(SDNode *N,
7544                                             DAGCombinerInfo &DCI) const {
7545   if (DCI.isBeforeLegalize())
7546     return SDValue();
7547 
7548   SelectionDAG &DAG = DCI.DAG;
7549   EVT VT = N->getValueType(0);
7550   SDValue LHS = N->getOperand(0);
7551   SDValue RHS = N->getOperand(1);
7552 
7553 
7554   const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
7555   if (VT == MVT::i64 && CRHS) {
7556     if (SDValue Split
7557         = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
7558       return Split;
7559   }
7560 
7561   if (CRHS && VT == MVT::i32) {
7562     // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
7563     // nb = number of trailing zeroes in mask
7564     // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
7565     // given that we are selecting 8 or 16 bit fields starting at byte boundary.
7566     uint64_t Mask = CRHS->getZExtValue();
7567     unsigned Bits = countPopulation(Mask);
7568     if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
7569         (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
7570       if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
7571         unsigned Shift = CShift->getZExtValue();
7572         unsigned NB = CRHS->getAPIntValue().countTrailingZeros();
7573         unsigned Offset = NB + Shift;
7574         if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
7575           SDLoc SL(N);
7576           SDValue BFE = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
7577                                     LHS->getOperand(0),
7578                                     DAG.getConstant(Offset, SL, MVT::i32),
7579                                     DAG.getConstant(Bits, SL, MVT::i32));
7580           EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
7581           SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
7582                                     DAG.getValueType(NarrowVT));
7583           SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
7584                                     DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
7585           return Shl;
7586         }
7587       }
7588     }
7589 
7590     // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
7591     if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
7592         isa<ConstantSDNode>(LHS.getOperand(2))) {
7593       uint32_t Sel = getConstantPermuteMask(Mask);
7594       if (!Sel)
7595         return SDValue();
7596 
7597       // Select 0xc for all zero bytes
7598       Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
7599       SDLoc DL(N);
7600       return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
7601                          LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
7602     }
7603   }
7604 
7605   // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
7606   // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
7607   if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
7608     ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
7609     ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
7610 
7611     SDValue X = LHS.getOperand(0);
7612     SDValue Y = RHS.getOperand(0);
7613     if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X)
7614       return SDValue();
7615 
7616     if (LCC == ISD::SETO) {
7617       if (X != LHS.getOperand(1))
7618         return SDValue();
7619 
7620       if (RCC == ISD::SETUNE) {
7621         const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
7622         if (!C1 || !C1->isInfinity() || C1->isNegative())
7623           return SDValue();
7624 
7625         const uint32_t Mask = SIInstrFlags::N_NORMAL |
7626                               SIInstrFlags::N_SUBNORMAL |
7627                               SIInstrFlags::N_ZERO |
7628                               SIInstrFlags::P_ZERO |
7629                               SIInstrFlags::P_SUBNORMAL |
7630                               SIInstrFlags::P_NORMAL;
7631 
7632         static_assert(((~(SIInstrFlags::S_NAN |
7633                           SIInstrFlags::Q_NAN |
7634                           SIInstrFlags::N_INFINITY |
7635                           SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask,
7636                       "mask not equal");
7637 
7638         SDLoc DL(N);
7639         return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
7640                            X, DAG.getConstant(Mask, DL, MVT::i32));
7641       }
7642     }
7643   }
7644 
7645   if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
7646     std::swap(LHS, RHS);
7647 
7648   if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
7649       RHS.hasOneUse()) {
7650     ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
7651     // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan | n_nan)
7652     // and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan | n_nan)
7653     const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
7654     if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
7655         (RHS.getOperand(0) == LHS.getOperand(0) &&
7656          LHS.getOperand(0) == LHS.getOperand(1))) {
7657       const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
7658       unsigned NewMask = LCC == ISD::SETO ?
7659         Mask->getZExtValue() & ~OrdMask :
7660         Mask->getZExtValue() & OrdMask;
7661 
7662       SDLoc DL(N);
7663       return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
7664                          DAG.getConstant(NewMask, DL, MVT::i32));
7665     }
7666   }
7667 
7668   if (VT == MVT::i32 &&
7669       (RHS.getOpcode() == ISD::SIGN_EXTEND || LHS.getOpcode() == ISD::SIGN_EXTEND)) {
7670     // and x, (sext cc from i1) => select cc, x, 0
7671     if (RHS.getOpcode() != ISD::SIGN_EXTEND)
7672       std::swap(LHS, RHS);
7673     if (isBoolSGPR(RHS.getOperand(0)))
7674       return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0),
7675                            LHS, DAG.getConstant(0, SDLoc(N), MVT::i32));
7676   }
7677 
7678   // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
7679   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
7680   if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
7681       N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) {
7682     uint32_t LHSMask = getPermuteMask(DAG, LHS);
7683     uint32_t RHSMask = getPermuteMask(DAG, RHS);
7684     if (LHSMask != ~0u && RHSMask != ~0u) {
7685       // Canonicalize the expression in an attempt to have fewer unique masks
7686       // and therefore fewer registers used to hold the masks.
7687       if (LHSMask > RHSMask) {
7688         std::swap(LHSMask, RHSMask);
7689         std::swap(LHS, RHS);
7690       }
7691 
7692       // Select 0xc for each lane used from the source operand. A zero byte has
7693       // the 0xc mask set, a 0xff byte has 0xff, and actual lanes are in 0-3.
7694       uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
7695       uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
7696 
7697       // Check if we need to combine values from two sources within a byte.
7698       if (!(LHSUsedLanes & RHSUsedLanes) &&
7699           // If we select the high and low words, keep it for SDWA.
7700           // TODO: teach SDWA to work with v_perm_b32 and remove the check.
7701           !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
7702         // Each byte in each mask is either a selector value 0-3, or has higher
7703         // bits set (0xff for a forced 0xff byte, 0x0c for a zero byte). If 0x0c
7704         // is in either mask the result byte must be 0x0c. Otherwise the mask
7705         // that is not 0xff wins. ANDing both masks gives a correct result,
7706         // except that 0x0c bytes must be corrected back to exactly 0x0c below.
7707         uint32_t Mask = LHSMask & RHSMask;
7708         for (unsigned I = 0; I < 32; I += 8) {
7709           uint32_t ByteSel = 0xff << I;
7710           if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
7711             Mask &= (0x0c << I) & 0xffffffff;
7712         }
7713 
7714         // Add 4 to each active LHS lane. It will not affect any existing 0xff
7715         // or 0x0c.
7716         uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
7717         SDLoc DL(N);
7718 
7719         return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
7720                            LHS.getOperand(0), RHS.getOperand(0),
7721                            DAG.getConstant(Sel, DL, MVT::i32));
7722       }
7723     }
7724   }
7725 
7726   return SDValue();
7727 }
7728 
7729 SDValue SITargetLowering::performOrCombine(SDNode *N,
7730                                            DAGCombinerInfo &DCI) const {
7731   SelectionDAG &DAG = DCI.DAG;
7732   SDValue LHS = N->getOperand(0);
7733   SDValue RHS = N->getOperand(1);
7734 
7735   EVT VT = N->getValueType(0);
7736   if (VT == MVT::i1) {
7737     // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
7738     if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
7739         RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
7740       SDValue Src = LHS.getOperand(0);
7741       if (Src != RHS.getOperand(0))
7742         return SDValue();
7743 
7744       const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
7745       const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
7746       if (!CLHS || !CRHS)
7747         return SDValue();
7748 
7749       // Only 10 bits are used.
7750       static const uint32_t MaxMask = 0x3ff;
7751 
7752       uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
7753       SDLoc DL(N);
7754       return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
7755                          Src, DAG.getConstant(NewMask, DL, MVT::i32));
7756     }
7757 
7758     return SDValue();
7759   }
7760 
7761   // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
7762   if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
7763       LHS.getOpcode() == AMDGPUISD::PERM &&
7764       isa<ConstantSDNode>(LHS.getOperand(2))) {
7765     uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
7766     if (!Sel)
7767       return SDValue();
7768 
7769     Sel |= LHS.getConstantOperandVal(2);
7770     SDLoc DL(N);
7771     return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
7772                        LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
7773   }
7774 
7775   // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
7776   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
7777   if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
7778       N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32) != -1) {
7779     uint32_t LHSMask = getPermuteMask(DAG, LHS);
7780     uint32_t RHSMask = getPermuteMask(DAG, RHS);
7781     if (LHSMask != ~0u && RHSMask != ~0u) {
7782       // Canonicalize the expression in an attempt to have fewer unique masks
7783       // and therefore fewer registers used to hold the masks.
7784       if (LHSMask > RHSMask) {
7785         std::swap(LHSMask, RHSMask);
7786         std::swap(LHS, RHS);
7787       }
7788 
7789       // Select 0xc for each lane used from the source operand. A zero byte has
7790       // the 0xc mask set, a 0xff byte has 0xff, and actual lanes are in 0-3.
7791       uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
7792       uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
7793 
7794       // Check if we need to combine values from two sources within a byte.
7795       if (!(LHSUsedLanes & RHSUsedLanes) &&
7796           // If we select the high and low words, keep it for SDWA.
7797           // TODO: teach SDWA to work with v_perm_b32 and remove the check.
7798           !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
7799         // Kill zero bytes selected by other mask. Zero value is 0xc.
7800         LHSMask &= ~RHSUsedLanes;
7801         RHSMask &= ~LHSUsedLanes;
7802         // Add 4 to each active LHS lane
7803         LHSMask |= LHSUsedLanes & 0x04040404;
7804         // Combine masks
7805         uint32_t Sel = LHSMask | RHSMask;
7806         SDLoc DL(N);
7807 
7808         return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
7809                            LHS.getOperand(0), RHS.getOperand(0),
7810                            DAG.getConstant(Sel, DL, MVT::i32));
7811       }
7812     }
7813   }
7814 
7815   if (VT != MVT::i64)
7816     return SDValue();
7817 
7818   // TODO: This could be a generic combine with a predicate for extracting the
7819   // high half of an integer being free.
7820 
7821   // (or i64:x, (zero_extend i32:y)) ->
7822   //   i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
7823   if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
7824       RHS.getOpcode() != ISD::ZERO_EXTEND)
7825     std::swap(LHS, RHS);
7826 
7827   if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
7828     SDValue ExtSrc = RHS.getOperand(0);
7829     EVT SrcVT = ExtSrc.getValueType();
7830     if (SrcVT == MVT::i32) {
7831       SDLoc SL(N);
7832       SDValue LowLHS, HiBits;
7833       std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG);
7834       SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
7835 
7836       DCI.AddToWorklist(LowOr.getNode());
7837       DCI.AddToWorklist(HiBits.getNode());
7838 
7839       SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
7840                                 LowOr, HiBits);
7841       return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
7842     }
7843   }
7844 
7845   const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
7846   if (CRHS) {
7847     if (SDValue Split
7848           = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR, LHS, CRHS))
7849       return Split;
7850   }
7851 
7852   return SDValue();
7853 }
7854 
7855 SDValue SITargetLowering::performXorCombine(SDNode *N,
7856                                             DAGCombinerInfo &DCI) const {
7857   EVT VT = N->getValueType(0);
7858   if (VT != MVT::i64)
7859     return SDValue();
7860 
7861   SDValue LHS = N->getOperand(0);
7862   SDValue RHS = N->getOperand(1);
7863 
7864   const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
7865   if (CRHS) {
7866     if (SDValue Split
7867           = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
7868       return Split;
7869   }
7870 
7871   return SDValue();
7872 }
7873 
7874 // Instructions that will be lowered with a final instruction that zeros the
7875 // high result bits.
7876 // XXX - probably only need to list legal operations.
7877 static bool fp16SrcZerosHighBits(unsigned Opc) {
7878   switch (Opc) {
7879   case ISD::FADD:
7880   case ISD::FSUB:
7881   case ISD::FMUL:
7882   case ISD::FDIV:
7883   case ISD::FREM:
7884   case ISD::FMA:
7885   case ISD::FMAD:
7886   case ISD::FCANONICALIZE:
7887   case ISD::FP_ROUND:
7888   case ISD::UINT_TO_FP:
7889   case ISD::SINT_TO_FP:
7890   case ISD::FABS:
7891     // Fabs is lowered to a bit operation, but it's an and which will clear the
7892     // high bits anyway.
7893   case ISD::FSQRT:
7894   case ISD::FSIN:
7895   case ISD::FCOS:
7896   case ISD::FPOWI:
7897   case ISD::FPOW:
7898   case ISD::FLOG:
7899   case ISD::FLOG2:
7900   case ISD::FLOG10:
7901   case ISD::FEXP:
7902   case ISD::FEXP2:
7903   case ISD::FCEIL:
7904   case ISD::FTRUNC:
7905   case ISD::FRINT:
7906   case ISD::FNEARBYINT:
7907   case ISD::FROUND:
7908   case ISD::FFLOOR:
7909   case ISD::FMINNUM:
7910   case ISD::FMAXNUM:
7911   case AMDGPUISD::FRACT:
7912   case AMDGPUISD::CLAMP:
7913   case AMDGPUISD::COS_HW:
7914   case AMDGPUISD::SIN_HW:
7915   case AMDGPUISD::FMIN3:
7916   case AMDGPUISD::FMAX3:
7917   case AMDGPUISD::FMED3:
7918   case AMDGPUISD::FMAD_FTZ:
7919   case AMDGPUISD::RCP:
7920   case AMDGPUISD::RSQ:
7921   case AMDGPUISD::RCP_IFLAG:
7922   case AMDGPUISD::LDEXP:
7923     return true;
7924   default:
7925     // fcopysign, select and others may be lowered to 32-bit bit operations
7926     // which don't zero the high bits.
7927     return false;
7928   }
7929 }
7930 
7931 SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
7932                                                    DAGCombinerInfo &DCI) const {
7933   if (!Subtarget->has16BitInsts() ||
7934       DCI.getDAGCombineLevel() < AfterLegalizeDAG)
7935     return SDValue();
7936 
7937   EVT VT = N->getValueType(0);
7938   if (VT != MVT::i32)
7939     return SDValue();
7940 
7941   SDValue Src = N->getOperand(0);
7942   if (Src.getValueType() != MVT::i16)
7943     return SDValue();
7944 
7945   // (i32 zext (i16 (bitcast f16:$src))) -> fp16_zext $src
7946   // FIXME: It is not universally true that the high bits are zeroed on gfx9.
7947   if (Src.getOpcode() == ISD::BITCAST) {
7948     SDValue BCSrc = Src.getOperand(0);
7949     if (BCSrc.getValueType() == MVT::f16 &&
7950         fp16SrcZerosHighBits(BCSrc.getOpcode()))
7951       return DCI.DAG.getNode(AMDGPUISD::FP16_ZEXT, SDLoc(N), VT, BCSrc);
7952   }
7953 
7954   return SDValue();
7955 }
7956 
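// Fold a sign_extend_inreg of an unsigned byte/short buffer load into the
// corresponding signed buffer load variant.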
7957 SDValue SITargetLowering::performSignExtendInRegCombine(SDNode *N,
7958                                                         DAGCombinerInfo &DCI)
7959                                                         const {
7960   SDValue Src = N->getOperand(0);
7961   auto *VTSign = cast<VTSDNode>(N->getOperand(1));
7962 
7963   if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
7964       VTSign->getVT() == MVT::i8) ||
7965       (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
7966       VTSign->getVT() == MVT::i16)) &&
7967       Src.hasOneUse()) {
7968     auto *M = cast<MemSDNode>(Src);
7969     SDValue Ops[] = {
7970       Src.getOperand(0), // Chain
7971       Src.getOperand(1), // rsrc
7972       Src.getOperand(2), // vindex
7973       Src.getOperand(3), // voffset
7974       Src.getOperand(4), // soffset
7975       Src.getOperand(5), // offset
7976       Src.getOperand(6),
7977       Src.getOperand(7)
7978     };
7979     // replace with BUFFER_LOAD_BYTE/SHORT
7980     SDVTList ResList = DCI.DAG.getVTList(MVT::i32,
7981                                          Src.getOperand(0).getValueType());
7982     unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE) ?
7983                    AMDGPUISD::BUFFER_LOAD_BYTE : AMDGPUISD::BUFFER_LOAD_SHORT;
7984     SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(Opc, SDLoc(N),
7985                                                           ResList,
7986                                                           Ops, M->getMemoryVT(),
7987                                                           M->getMemOperand());
7988     return DCI.DAG.getMergeValues({BufferLoadSignExt,
7989                                   BufferLoadSignExt.getValue(1)}, SDLoc(N));
7990   }
7991   return SDValue();
7992 }
7993 
7994 SDValue SITargetLowering::performClassCombine(SDNode *N,
7995                                               DAGCombinerInfo &DCI) const {
7996   SelectionDAG &DAG = DCI.DAG;
7997   SDValue Mask = N->getOperand(1);
7998 
7999   // fp_class x, 0 -> false
8000   if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) {
8001     if (CMask->isNullValue())
8002       return DAG.getConstant(0, SDLoc(N), MVT::i1);
8003   }
8004 
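  // The class of an undef input is undefined as well.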
8005   if (N->getOperand(0).isUndef())
8006     return DAG.getUNDEF(MVT::i1);
8007 
8008   return SDValue();
8009 }
8010 
8011 SDValue SITargetLowering::performRcpCombine(SDNode *N,
8012                                             DAGCombinerInfo &DCI) const {
8013   EVT VT = N->getValueType(0);
8014   SDValue N0 = N->getOperand(0);
8015 
8016   if (N0.isUndef())
8017     return N0;
8018 
8019   if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
8020                          N0.getOpcode() == ISD::SINT_TO_FP)) {
8021     return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
8022                            N->getFlags());
8023   }
8024 
8025   return AMDGPUTargetLowering::performRcpCombine(N, DCI);
8026 }
8027 
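// Conservatively returns true if the value produced by Op is guaranteed to
// already be canonical, so an fcanonicalize of it can be folded away.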
8028 bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
8029                                        unsigned MaxDepth) const {
8030   unsigned Opcode = Op.getOpcode();
8031   if (Opcode == ISD::FCANONICALIZE)
8032     return true;
8033 
8034   if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
8035     auto F = CFP->getValueAPF();
8036     if (F.isNaN() && F.isSignaling())
8037       return false;
8038     return !F.isDenormal() || denormalsEnabledForType(Op.getValueType());
8039   }
8040 
8041   // If the source is the result of another standard FP operation, it is
8042   // already in canonical form.
8043   if (MaxDepth == 0)
8044     return false;
8045 
8046   switch (Opcode) {
8047   // These will flush denorms if required.
8048   case ISD::FADD:
8049   case ISD::FSUB:
8050   case ISD::FMUL:
8051   case ISD::FCEIL:
8052   case ISD::FFLOOR:
8053   case ISD::FMA:
8054   case ISD::FMAD:
8055   case ISD::FSQRT:
8056   case ISD::FDIV:
8057   case ISD::FREM:
8058   case ISD::FP_ROUND:
8059   case ISD::FP_EXTEND:
8060   case AMDGPUISD::FMUL_LEGACY:
8061   case AMDGPUISD::FMAD_FTZ:
8062   case AMDGPUISD::RCP:
8063   case AMDGPUISD::RSQ:
8064   case AMDGPUISD::RSQ_CLAMP:
8065   case AMDGPUISD::RCP_LEGACY:
8066   case AMDGPUISD::RSQ_LEGACY:
8067   case AMDGPUISD::RCP_IFLAG:
8068   case AMDGPUISD::TRIG_PREOP:
8069   case AMDGPUISD::DIV_SCALE:
8070   case AMDGPUISD::DIV_FMAS:
8071   case AMDGPUISD::DIV_FIXUP:
8072   case AMDGPUISD::FRACT:
8073   case AMDGPUISD::LDEXP:
8074   case AMDGPUISD::CVT_PKRTZ_F16_F32:
8075   case AMDGPUISD::CVT_F32_UBYTE0:
8076   case AMDGPUISD::CVT_F32_UBYTE1:
8077   case AMDGPUISD::CVT_F32_UBYTE2:
8078   case AMDGPUISD::CVT_F32_UBYTE3:
8079     return true;
8080 
8081   // These can/will be lowered to or combined as bit operations, so their
8082   // inputs need to be checked recursively.
8083   case ISD::FNEG:
8084   case ISD::FABS:
8085   case ISD::FCOPYSIGN:
8086     return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
8087 
8088   case ISD::FSIN:
8089   case ISD::FCOS:
8090   case ISD::FSINCOS:
8091     return Op.getValueType().getScalarType() != MVT::f16;
8092 
8093   case ISD::FMINNUM:
8094   case ISD::FMAXNUM:
8095   case ISD::FMINNUM_IEEE:
8096   case ISD::FMAXNUM_IEEE:
8097   case AMDGPUISD::CLAMP:
8098   case AMDGPUISD::FMED3:
8099   case AMDGPUISD::FMAX3:
8100   case AMDGPUISD::FMIN3: {
8101     // FIXME: Shouldn't treat the generic operations differently based on
8102     // these. However, we aren't really required to flush the result from
8103     // minnum/maxnum.
8104 
8105     // snans will be quieted, so we only need to worry about denormals.
8106     if (Subtarget->supportsMinMaxDenormModes() ||
8107         denormalsEnabledForType(Op.getValueType()))
8108       return true;
8109 
8110     // Flushing may be required.
8111     // On pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
8112     // targets we need to check their inputs recursively.
8113 
8114     // FIXME: Does this apply with clamp? It's implemented with max.
8115     for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
8116       if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
8117         return false;
8118     }
8119 
8120     return true;
8121   }
8122   case ISD::SELECT: {
8123     return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
8124            isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
8125   }
8126   case ISD::BUILD_VECTOR: {
8127     for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
8128       SDValue SrcOp = Op.getOperand(i);
8129       if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
8130         return false;
8131     }
8132 
8133     return true;
8134   }
8135   case ISD::EXTRACT_VECTOR_ELT:
8136   case ISD::EXTRACT_SUBVECTOR: {
8137     return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
8138   }
8139   case ISD::INSERT_VECTOR_ELT: {
8140     return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
8141            isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
8142   }
8143   case ISD::UNDEF:
8144     // Could be anything.
8145     return false;
8146 
8147   case ISD::BITCAST: {
    // Hack around the mess we make when legalizing extract_vector_elt.
8149     SDValue Src = Op.getOperand(0);
8150     if (Src.getValueType() == MVT::i16 &&
8151         Src.getOpcode() == ISD::TRUNCATE) {
8152       SDValue TruncSrc = Src.getOperand(0);
8153       if (TruncSrc.getValueType() == MVT::i32 &&
8154           TruncSrc.getOpcode() == ISD::BITCAST &&
8155           TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
8156         return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
8157       }
8158     }
8159 
8160     return false;
8161   }
8162   case ISD::INTRINSIC_WO_CHAIN: {
8163     unsigned IntrinsicID
8164       = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
8165     // TODO: Handle more intrinsics
8166     switch (IntrinsicID) {
8167     case Intrinsic::amdgcn_cvt_pkrtz:
8168     case Intrinsic::amdgcn_cubeid:
8169     case Intrinsic::amdgcn_frexp_mant:
8170     case Intrinsic::amdgcn_fdot2:
8171       return true;
8172     default:
8173       break;
8174     }
8175 
8176     LLVM_FALLTHROUGH;
8177   }
8178   default:
8179     return denormalsEnabledForType(Op.getValueType()) &&
8180            DAG.isKnownNeverSNaN(Op);
8181   }
8182 
8183   llvm_unreachable("invalid operation");
8184 }
8185 
8186 // Constant fold canonicalize.
8187 SDValue SITargetLowering::getCanonicalConstantFP(
8188   SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const {
8189   // Flush denormals to 0 if not enabled.
8190   if (C.isDenormal() && !denormalsEnabledForType(VT))
8191     return DAG.getConstantFP(0.0, SL, VT);
8192 
8193   if (C.isNaN()) {
8194     APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
8195     if (C.isSignaling()) {
8196       // Quiet a signaling NaN.
8197       // FIXME: Is this supposed to preserve payload bits?
8198       return DAG.getConstantFP(CanonicalQNaN, SL, VT);
8199     }
8200 
8201     // Make sure it is the canonical NaN bitpattern.
8202     //
8203     // TODO: Can we use -1 as the canonical NaN value since it's an inline
8204     // immediate?
8205     if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
8206       return DAG.getConstantFP(CanonicalQNaN, SL, VT);
8207   }
8208 
8209   // Already canonical.
8210   return DAG.getConstantFP(C, SL, VT);
8211 }
8212 
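// An undef or constant FP element will be folded when the canonicalize is
// split per element, so splitting a build_vector containing one is free.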
8213 static bool vectorEltWillFoldAway(SDValue Op) {
8214   return Op.isUndef() || isa<ConstantFPSDNode>(Op);
8215 }
8216 
8217 SDValue SITargetLowering::performFCanonicalizeCombine(
8218   SDNode *N,
8219   DAGCombinerInfo &DCI) const {
8220   SelectionDAG &DAG = DCI.DAG;
8221   SDValue N0 = N->getOperand(0);
8222   EVT VT = N->getValueType(0);
8223 
8224   // fcanonicalize undef -> qnan
8225   if (N0.isUndef()) {
8226     APFloat QNaN = APFloat::getQNaN(SelectionDAG::EVTToAPFloatSemantics(VT));
8227     return DAG.getConstantFP(QNaN, SDLoc(N), VT);
8228   }
8229 
  if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0))
    return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
8234 
8235   // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
8236   //                                                   (fcanonicalize k)
8237   //
8238   // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
8239 
  // TODO: This could be better with wider vectors that will be split to v2f16,
  // and it could consider uses since there aren't that many packed operations.
8242   if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
8243       isTypeLegal(MVT::v2f16)) {
8244     SDLoc SL(N);
8245     SDValue NewElts[2];
8246     SDValue Lo = N0.getOperand(0);
8247     SDValue Hi = N0.getOperand(1);
8248     EVT EltVT = Lo.getValueType();
8249 
8250     if (vectorEltWillFoldAway(Lo) || vectorEltWillFoldAway(Hi)) {
8251       for (unsigned I = 0; I != 2; ++I) {
8252         SDValue Op = N0.getOperand(I);
8253         if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
8254           NewElts[I] = getCanonicalConstantFP(DAG, SL, EltVT,
8255                                               CFP->getValueAPF());
8256         } else if (Op.isUndef()) {
8257           // Handled below based on what the other operand is.
8258           NewElts[I] = Op;
8259         } else {
8260           NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
8261         }
8262       }
8263 
      // If one half is undef, and one is constant, prefer a splat vector rather
      // than the normal qNaN. If it's a register, prefer 0.0 since that's
      // cheaper to use and may be free with a packed operation.
      if (NewElts[0].isUndef()) {
        NewElts[0] = isa<ConstantFPSDNode>(NewElts[1]) ?
          NewElts[1] : DAG.getConstantFP(0.0f, SL, EltVT);
      }
8272 
8273       if (NewElts[1].isUndef()) {
8274         NewElts[1] = isa<ConstantFPSDNode>(NewElts[0]) ?
8275           NewElts[0] : DAG.getConstantFP(0.0f, SL, EltVT);
8276       }
8277 
8278       return DAG.getBuildVector(VT, SL, NewElts);
8279     }
8280   }
8281 
8282   unsigned SrcOpc = N0.getOpcode();
8283 
8284   // If it's free to do so, push canonicalizes further up the source, which may
8285   // find a canonical source.
8286   //
  // TODO: More opcodes. Note this is unsafe for the _ieee minnum/maxnum for
  // sNaNs.
8289   if (SrcOpc == ISD::FMINNUM || SrcOpc == ISD::FMAXNUM) {
8290     auto *CRHS = dyn_cast<ConstantFPSDNode>(N0.getOperand(1));
8291     if (CRHS && N0.hasOneUse()) {
8292       SDLoc SL(N);
8293       SDValue Canon0 = DAG.getNode(ISD::FCANONICALIZE, SL, VT,
8294                                    N0.getOperand(0));
8295       SDValue Canon1 = getCanonicalConstantFP(DAG, SL, VT, CRHS->getValueAPF());
8296       DCI.AddToWorklist(Canon0.getNode());
8297 
8298       return DAG.getNode(N0.getOpcode(), SL, VT, Canon0, Canon1);
8299     }
8300   }
8301 
8302   return isCanonicalized(DAG, N0) ? N0 : SDValue();
8303 }
8304 
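// Map a scalar min/max opcode to the corresponding three-operand AMDGPU node.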
8305 static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
8306   switch (Opc) {
8307   case ISD::FMAXNUM:
8308   case ISD::FMAXNUM_IEEE:
8309     return AMDGPUISD::FMAX3;
8310   case ISD::SMAX:
8311     return AMDGPUISD::SMAX3;
8312   case ISD::UMAX:
8313     return AMDGPUISD::UMAX3;
8314   case ISD::FMINNUM:
8315   case ISD::FMINNUM_IEEE:
8316     return AMDGPUISD::FMIN3;
8317   case ISD::SMIN:
8318     return AMDGPUISD::SMIN3;
8319   case ISD::UMIN:
8320     return AMDGPUISD::UMIN3;
8321   default:
8322     llvm_unreachable("Not a min/max opcode");
8323   }
8324 }
8325 
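// Match min(max(x, K0), K1), with constants K0 < K1, to [su]med3 x, K0, K1.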
8326 SDValue SITargetLowering::performIntMed3ImmCombine(
8327   SelectionDAG &DAG, const SDLoc &SL,
8328   SDValue Op0, SDValue Op1, bool Signed) const {
8329   ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1);
8330   if (!K1)
8331     return SDValue();
8332 
8333   ConstantSDNode *K0 = dyn_cast<ConstantSDNode>(Op0.getOperand(1));
8334   if (!K0)
8335     return SDValue();
8336 
8337   if (Signed) {
8338     if (K0->getAPIntValue().sge(K1->getAPIntValue()))
8339       return SDValue();
8340   } else {
8341     if (K0->getAPIntValue().uge(K1->getAPIntValue()))
8342       return SDValue();
8343   }
8344 
8345   EVT VT = K0->getValueType(0);
8346   unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
8347   if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16())) {
8348     return DAG.getNode(Med3Opc, SL, VT,
8349                        Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
8350   }
8351 
8352   // If there isn't a 16-bit med3 operation, convert to 32-bit.
8353   MVT NVT = MVT::i32;
8354   unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
8355 
8356   SDValue Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0));
8357   SDValue Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1));
8358   SDValue Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1);
8359 
8360   SDValue Med3 = DAG.getNode(Med3Opc, SL, NVT, Tmp1, Tmp2, Tmp3);
8361   return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3);
8362 }
8363 
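// Return the FP constant if Op is a scalar constant or a splat build_vector of
// one; otherwise return null.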
8364 static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
8365   if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
8366     return C;
8367 
8368   if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
8369     if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
8370       return C;
8371   }
8372 
8373   return nullptr;
8374 }
8375 
8376 SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
8377                                                   const SDLoc &SL,
8378                                                   SDValue Op0,
8379                                                   SDValue Op1) const {
8380   ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
8381   if (!K1)
8382     return SDValue();
8383 
8384   ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
8385   if (!K0)
8386     return SDValue();
8387 
8388   // Ordered >= (although NaN inputs should have folded away by now).
8389   APFloat::cmpResult Cmp = K0->getValueAPF().compare(K1->getValueAPF());
8390   if (Cmp == APFloat::cmpGreaterThan)
8391     return SDValue();
8392 
8393   const MachineFunction &MF = DAG.getMachineFunction();
8394   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8395 
8396   // TODO: Check IEEE bit enabled?
8397   EVT VT = Op0.getValueType();
8398   if (Info->getMode().DX10Clamp) {
8399     // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
8400     // hardware fmed3 behavior converting to a min.
8401     // FIXME: Should this be allowing -0.0?
8402     if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
8403       return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
8404   }
8405 
8406   // med3 for f16 is only available on gfx9+, and not available for v2f16.
8407   if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
8408     // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
8409     // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
8410     // then give the other result, which is different from med3 with a NaN
8411     // input.
8412     SDValue Var = Op0.getOperand(0);
8413     if (!DAG.isKnownNeverSNaN(Var))
8414       return SDValue();
8415 
8416     const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
8417 
8418     if ((!K0->hasOneUse() ||
8419          TII->isInlineConstant(K0->getValueAPF().bitcastToAPInt())) &&
8420         (!K1->hasOneUse() ||
8421          TII->isInlineConstant(K1->getValueAPF().bitcastToAPInt()))) {
8422       return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
8423                          Var, SDValue(K0, 0), SDValue(K1, 0));
8424     }
8425   }
8426 
8427   return SDValue();
8428 }
8429 
8430 SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
8431                                                DAGCombinerInfo &DCI) const {
8432   SelectionDAG &DAG = DCI.DAG;
8433 
8434   EVT VT = N->getValueType(0);
8435   unsigned Opc = N->getOpcode();
8436   SDValue Op0 = N->getOperand(0);
8437   SDValue Op1 = N->getOperand(1);
8438 
  // Only do this if the inner op has one use since this will just increase
  // register pressure for no benefit.
8441 
8442   if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY &&
8443       !VT.isVector() &&
8444       (VT == MVT::i32 || VT == MVT::f32 ||
8445        ((VT == MVT::f16 || VT == MVT::i16) && Subtarget->hasMin3Max3_16()))) {
8446     // max(max(a, b), c) -> max3(a, b, c)
8447     // min(min(a, b), c) -> min3(a, b, c)
8448     if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
8449       SDLoc DL(N);
8450       return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
8451                          DL,
8452                          N->getValueType(0),
8453                          Op0.getOperand(0),
8454                          Op0.getOperand(1),
8455                          Op1);
8456     }
8457 
8458     // Try commuted.
8459     // max(a, max(b, c)) -> max3(a, b, c)
8460     // min(a, min(b, c)) -> min3(a, b, c)
8461     if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
8462       SDLoc DL(N);
8463       return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
8464                          DL,
8465                          N->getValueType(0),
8466                          Op0,
8467                          Op1.getOperand(0),
8468                          Op1.getOperand(1));
8469     }
8470   }
8471 
8472   // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
8473   if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
8474     if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, true))
8475       return Med3;
8476   }
8477 
8478   if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
8479     if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, false))
8480       return Med3;
8481   }
8482 
8483   // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
8484   if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
8485        (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
8486        (Opc == AMDGPUISD::FMIN_LEGACY &&
8487         Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
8488       (VT == MVT::f32 || VT == MVT::f64 ||
8489        (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
8490        (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
8491       Op0.hasOneUse()) {
8492     if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
8493       return Res;
8494   }
8495 
8496   return SDValue();
8497 }
8498 
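// Return true if A and B are the constants 0.0 and 1.0, in either order, i.e.
// the bounds of a clamp to [0.0, 1.0].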
8499 static bool isClampZeroToOne(SDValue A, SDValue B) {
8500   if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
8501     if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
8502       // FIXME: Should this be allowing -0.0?
8503       return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
8504              (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
8505     }
8506   }
8507 
8508   return false;
8509 }
8510 
8511 // FIXME: Should only worry about snans for version with chain.
8512 SDValue SITargetLowering::performFMed3Combine(SDNode *N,
8513                                               DAGCombinerInfo &DCI) const {
8514   EVT VT = N->getValueType(0);
8515   // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
8516   // NaNs. With a NaN input, the order of the operands may change the result.
8517 
8518   SelectionDAG &DAG = DCI.DAG;
8519   SDLoc SL(N);
8520 
8521   SDValue Src0 = N->getOperand(0);
8522   SDValue Src1 = N->getOperand(1);
8523   SDValue Src2 = N->getOperand(2);
8524 
8525   if (isClampZeroToOne(Src0, Src1)) {
8526     // const_a, const_b, x -> clamp is safe in all cases including signaling
8527     // nans.
8528     // FIXME: Should this be allowing -0.0?
8529     return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
8530   }
8531 
8532   const MachineFunction &MF = DAG.getMachineFunction();
8533   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8534 
8535   // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
8536   // handling no dx10-clamp?
8537   if (Info->getMode().DX10Clamp) {
    // If NaNs are clamped to 0, we are free to reorder the inputs.
8539 
8540     if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
8541       std::swap(Src0, Src1);
8542 
8543     if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
8544       std::swap(Src1, Src2);
8545 
8546     if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
8547       std::swap(Src0, Src1);
8548 
8549     if (isClampZeroToOne(Src1, Src2))
8550       return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
8551   }
8552 
8553   return SDValue();
8554 }
8555 
8556 SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
8557                                                  DAGCombinerInfo &DCI) const {
8558   SDValue Src0 = N->getOperand(0);
8559   SDValue Src1 = N->getOperand(1);
8560   if (Src0.isUndef() && Src1.isUndef())
8561     return DCI.DAG.getUNDEF(N->getValueType(0));
8562   return SDValue();
8563 }
8564 
8565 SDValue SITargetLowering::performExtractVectorEltCombine(
8566   SDNode *N, DAGCombinerInfo &DCI) const {
8567   SDValue Vec = N->getOperand(0);
8568   SelectionDAG &DAG = DCI.DAG;
8569 
8570   EVT VecVT = Vec.getValueType();
8571   EVT EltVT = VecVT.getVectorElementType();
8572 
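  // extract_vector_elt (fneg/fabs v), i -> fneg/fabs (extract_vector_elt v, i)
  // if every use of the extract can absorb the source modifier.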
8573   if ((Vec.getOpcode() == ISD::FNEG ||
8574        Vec.getOpcode() == ISD::FABS) && allUsesHaveSourceMods(N)) {
8575     SDLoc SL(N);
8576     EVT EltVT = N->getValueType(0);
8577     SDValue Idx = N->getOperand(1);
8578     SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
8579                               Vec.getOperand(0), Idx);
8580     return DAG.getNode(Vec.getOpcode(), SL, EltVT, Elt);
8581   }
8582 
8583   // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
8584   //    =>
8585   // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
8586   // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
8587   // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
8588   if (Vec.hasOneUse() && DCI.isBeforeLegalize()) {
8589     SDLoc SL(N);
8590     EVT EltVT = N->getValueType(0);
8591     SDValue Idx = N->getOperand(1);
8592     unsigned Opc = Vec.getOpcode();
8593 
8594     switch(Opc) {
8595     default:
8596       break;
8597       // TODO: Support other binary operations.
8598     case ISD::FADD:
8599     case ISD::FSUB:
8600     case ISD::FMUL:
8601     case ISD::ADD:
8602     case ISD::UMIN:
8603     case ISD::UMAX:
8604     case ISD::SMIN:
8605     case ISD::SMAX:
8606     case ISD::FMAXNUM:
8607     case ISD::FMINNUM:
8608     case ISD::FMAXNUM_IEEE:
8609     case ISD::FMINNUM_IEEE: {
8610       SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
8611                                  Vec.getOperand(0), Idx);
8612       SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
8613                                  Vec.getOperand(1), Idx);
8614 
8615       DCI.AddToWorklist(Elt0.getNode());
8616       DCI.AddToWorklist(Elt1.getNode());
8617       return DAG.getNode(Opc, SL, EltVT, Elt0, Elt1, Vec->getFlags());
8618     }
8619     }
8620   }
8621 
8622   unsigned VecSize = VecVT.getSizeInBits();
8623   unsigned EltSize = EltVT.getSizeInBits();
8624 
  // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
  // This eliminates a non-constant index and the subsequent movrel or scratch
  // access. Sub-dword vectors of 2 dwords or less have a better implementation.
  // Vectors bigger than 8 dwords would yield too many v_cndmask_b32
  // instructions.
8630   if (VecSize <= 256 && (VecSize > 64 || EltSize >= 32) &&
8631       !isa<ConstantSDNode>(N->getOperand(1))) {
8632     SDLoc SL(N);
8633     SDValue Idx = N->getOperand(1);
8634     EVT IdxVT = Idx.getValueType();
8635     SDValue V;
8636     for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
8637       SDValue IC = DAG.getConstant(I, SL, IdxVT);
8638       SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
8639       if (I == 0)
8640         V = Elt;
8641       else
8642         V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
8643     }
8644     return V;
8645   }
8646 
8647   if (!DCI.isBeforeLegalize())
8648     return SDValue();
8649 
8650   // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
8651   // elements. This exposes more load reduction opportunities by replacing
8652   // multiple small extract_vector_elements with a single 32-bit extract.
8653   auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
8654   if (isa<MemSDNode>(Vec) &&
8655       EltSize <= 16 &&
8656       EltVT.isByteSized() &&
8657       VecSize > 32 &&
8658       VecSize % 32 == 0 &&
8659       Idx) {
8660     EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
8661 
8662     unsigned BitIndex = Idx->getZExtValue() * EltSize;
8663     unsigned EltIdx = BitIndex / 32;
8664     unsigned LeftoverBitIdx = BitIndex % 32;
8665     SDLoc SL(N);
8666 
8667     SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
8668     DCI.AddToWorklist(Cast.getNode());
8669 
8670     SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
8671                               DAG.getConstant(EltIdx, SL, MVT::i32));
8672     DCI.AddToWorklist(Elt.getNode());
8673     SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
8674                               DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
8675     DCI.AddToWorklist(Srl.getNode());
8676 
8677     SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, EltVT.changeTypeToInteger(), Srl);
8678     DCI.AddToWorklist(Trunc.getNode());
8679     return DAG.getNode(ISD::BITCAST, SL, EltVT, Trunc);
8680   }
8681 
8682   return SDValue();
8683 }
8684 
8685 SDValue
8686 SITargetLowering::performInsertVectorEltCombine(SDNode *N,
8687                                                 DAGCombinerInfo &DCI) const {
8688   SDValue Vec = N->getOperand(0);
8689   SDValue Idx = N->getOperand(2);
8690   EVT VecVT = Vec.getValueType();
8691   EVT EltVT = VecVT.getVectorElementType();
8692   unsigned VecSize = VecVT.getSizeInBits();
8693   unsigned EltSize = EltVT.getSizeInBits();
8694 
  // INSERT_VECTOR_ELT (<n x e>, var-idx)
  // => BUILD_VECTOR n x select (e, const-idx)
  // This eliminates a non-constant index and the subsequent movrel or scratch
  // access. Sub-dword vectors of 2 dwords or less have a better implementation.
  // Vectors bigger than 8 dwords would yield too many v_cndmask_b32
  // instructions.
8701   if (isa<ConstantSDNode>(Idx) ||
8702       VecSize > 256 || (VecSize <= 64 && EltSize < 32))
8703     return SDValue();
8704 
8705   SelectionDAG &DAG = DCI.DAG;
8706   SDLoc SL(N);
8707   SDValue Ins = N->getOperand(1);
8708   EVT IdxVT = Idx.getValueType();
8709 
8710   SmallVector<SDValue, 16> Ops;
8711   for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
8712     SDValue IC = DAG.getConstant(I, SL, IdxVT);
8713     SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
8714     SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
8715     Ops.push_back(V);
8716   }
8717 
8718   return DAG.getBuildVector(VecVT, SL, Ops);
8719 }
8720 
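// Return the opcode to use when fusing a multiply with an add/sub: FMAD when
// denormals don't need to be supported and it is legal, FMA when fast FP
// fusion is allowed and profitable, or 0 if no fusion should be done.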
8721 unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
8722                                           const SDNode *N0,
8723                                           const SDNode *N1) const {
8724   EVT VT = N0->getValueType(0);
8725 
8726   // Only do this if we are not trying to support denormals. v_mad_f32 does not
8727   // support denormals ever.
8728   if (((VT == MVT::f32 && !Subtarget->hasFP32Denormals()) ||
8729        (VT == MVT::f16 && !Subtarget->hasFP16Denormals() &&
8730         getSubtarget()->hasMadF16())) &&
8731        isOperationLegal(ISD::FMAD, VT))
8732     return ISD::FMAD;
8733 
8734   const TargetOptions &Options = DAG.getTarget().Options;
8735   if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
8736        (N0->getFlags().hasAllowContract() &&
8737         N1->getFlags().hasAllowContract())) &&
8738       isFMAFasterThanFMulAndFAdd(VT)) {
8739     return ISD::FMA;
8740   }
8741 
8742   return 0;
8743 }
8744 
8745 // For a reassociatable opcode perform:
8746 // op x, (op y, z) -> op (op x, z), y, if x and z are uniform
8747 SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
8748                                                SelectionDAG &DAG) const {
8749   EVT VT = N->getValueType(0);
8750   if (VT != MVT::i32 && VT != MVT::i64)
8751     return SDValue();
8752 
8753   unsigned Opc = N->getOpcode();
8754   SDValue Op0 = N->getOperand(0);
8755   SDValue Op1 = N->getOperand(1);
8756 
8757   if (!(Op0->isDivergent() ^ Op1->isDivergent()))
8758     return SDValue();
8759 
8760   if (Op0->isDivergent())
8761     std::swap(Op0, Op1);
8762 
8763   if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
8764     return SDValue();
8765 
8766   SDValue Op2 = Op1.getOperand(1);
8767   Op1 = Op1.getOperand(0);
8768   if (!(Op1->isDivergent() ^ Op2->isDivergent()))
8769     return SDValue();
8770 
8771   if (Op1->isDivergent())
8772     std::swap(Op1, Op2);
8773 
8774   // If either operand is constant this will conflict with
8775   // DAGCombiner::ReassociateOps().
8776   if (DAG.isConstantIntBuildVectorOrConstantInt(Op0) ||
8777       DAG.isConstantIntBuildVectorOrConstantInt(Op1))
8778     return SDValue();
8779 
8780   SDLoc SL(N);
8781   SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
8782   return DAG.getNode(Opc, SL, VT, Add1, Op2);
8783 }
8784 
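// Build a MAD_I64_I32/MAD_U64_U32 node with an i64 result (plus an i1 result
// that is not used here) and truncate the i64 value back to VT.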
8785 static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL,
8786                            EVT VT,
8787                            SDValue N0, SDValue N1, SDValue N2,
8788                            bool Signed) {
8789   unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
8790   SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
8791   SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
8792   return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
8793 }
8794 
8795 SDValue SITargetLowering::performAddCombine(SDNode *N,
8796                                             DAGCombinerInfo &DCI) const {
8797   SelectionDAG &DAG = DCI.DAG;
8798   EVT VT = N->getValueType(0);
8799   SDLoc SL(N);
8800   SDValue LHS = N->getOperand(0);
8801   SDValue RHS = N->getOperand(1);
8802 
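  // add (mul x, y), z -> mad_[iu]64_[iu]32 x, y, z when the multiplied values
  // fit in 32 bits and the target has v_mad_[iu]64_[iu]32.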
8803   if ((LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL)
8804       && Subtarget->hasMad64_32() &&
8805       !VT.isVector() && VT.getScalarSizeInBits() > 32 &&
8806       VT.getScalarSizeInBits() <= 64) {
8807     if (LHS.getOpcode() != ISD::MUL)
8808       std::swap(LHS, RHS);
8809 
8810     SDValue MulLHS = LHS.getOperand(0);
8811     SDValue MulRHS = LHS.getOperand(1);
8812     SDValue AddRHS = RHS;
8813 
8814     // TODO: Maybe restrict if SGPR inputs.
8815     if (numBitsUnsigned(MulLHS, DAG) <= 32 &&
8816         numBitsUnsigned(MulRHS, DAG) <= 32) {
8817       MulLHS = DAG.getZExtOrTrunc(MulLHS, SL, MVT::i32);
8818       MulRHS = DAG.getZExtOrTrunc(MulRHS, SL, MVT::i32);
8819       AddRHS = DAG.getZExtOrTrunc(AddRHS, SL, MVT::i64);
8820       return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, false);
8821     }
8822 
8823     if (numBitsSigned(MulLHS, DAG) < 32 && numBitsSigned(MulRHS, DAG) < 32) {
8824       MulLHS = DAG.getSExtOrTrunc(MulLHS, SL, MVT::i32);
8825       MulRHS = DAG.getSExtOrTrunc(MulRHS, SL, MVT::i32);
8826       AddRHS = DAG.getSExtOrTrunc(AddRHS, SL, MVT::i64);
8827       return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, true);
8828     }
8829 
8830     return SDValue();
8831   }
8832 
8833   if (SDValue V = reassociateScalarOps(N, DAG)) {
8834     return V;
8835   }
8836 
8837   if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
8838     return SDValue();
8839 
8840   // add x, zext (setcc) => addcarry x, 0, setcc
8841   // add x, sext (setcc) => subcarry x, 0, setcc
8842   unsigned Opc = LHS.getOpcode();
8843   if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
8844       Opc == ISD::ANY_EXTEND || Opc == ISD::ADDCARRY)
8845     std::swap(RHS, LHS);
8846 
8847   Opc = RHS.getOpcode();
8848   switch (Opc) {
8849   default: break;
8850   case ISD::ZERO_EXTEND:
8851   case ISD::SIGN_EXTEND:
8852   case ISD::ANY_EXTEND: {
8853     auto Cond = RHS.getOperand(0);
8854     if (!isBoolSGPR(Cond))
8855       break;
8856     SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
8857     SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
8858     Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::SUBCARRY : ISD::ADDCARRY;
8859     return DAG.getNode(Opc, SL, VTList, Args);
8860   }
8861   case ISD::ADDCARRY: {
8862     // add x, (addcarry y, 0, cc) => addcarry x, y, cc
8863     auto C = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
8864     if (!C || C->getZExtValue() != 0) break;
8865     SDValue Args[] = { LHS, RHS.getOperand(0), RHS.getOperand(2) };
8866     return DAG.getNode(ISD::ADDCARRY, SDLoc(N), RHS->getVTList(), Args);
8867   }
8868   }
8869   return SDValue();
8870 }
8871 
8872 SDValue SITargetLowering::performSubCombine(SDNode *N,
8873                                             DAGCombinerInfo &DCI) const {
8874   SelectionDAG &DAG = DCI.DAG;
8875   EVT VT = N->getValueType(0);
8876 
8877   if (VT != MVT::i32)
8878     return SDValue();
8879 
8880   SDLoc SL(N);
8881   SDValue LHS = N->getOperand(0);
8882   SDValue RHS = N->getOperand(1);
8883 
8884   if (LHS.getOpcode() == ISD::SUBCARRY) {
8885     // sub (subcarry x, 0, cc), y => subcarry x, y, cc
8886     auto C = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
8887     if (!C || !C->isNullValue())
8888       return SDValue();
8889     SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) };
8890     return DAG.getNode(ISD::SUBCARRY, SDLoc(N), LHS->getVTList(), Args);
8891   }
8892   return SDValue();
8893 }
8894 
8895 SDValue SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
8896   DAGCombinerInfo &DCI) const {
8897 
8898   if (N->getValueType(0) != MVT::i32)
8899     return SDValue();
8900 
8901   auto C = dyn_cast<ConstantSDNode>(N->getOperand(1));
8902   if (!C || C->getZExtValue() != 0)
8903     return SDValue();
8904 
8905   SelectionDAG &DAG = DCI.DAG;
8906   SDValue LHS = N->getOperand(0);
8907 
8908   // addcarry (add x, y), 0, cc => addcarry x, y, cc
8909   // subcarry (sub x, y), 0, cc => subcarry x, y, cc
8910   unsigned LHSOpc = LHS.getOpcode();
8911   unsigned Opc = N->getOpcode();
8912   if ((LHSOpc == ISD::ADD && Opc == ISD::ADDCARRY) ||
8913       (LHSOpc == ISD::SUB && Opc == ISD::SUBCARRY)) {
8914     SDValue Args[] = { LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2) };
8915     return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
8916   }
8917   return SDValue();
8918 }
8919 
8920 SDValue SITargetLowering::performFAddCombine(SDNode *N,
8921                                              DAGCombinerInfo &DCI) const {
8922   if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
8923     return SDValue();
8924 
8925   SelectionDAG &DAG = DCI.DAG;
8926   EVT VT = N->getValueType(0);
8927 
8928   SDLoc SL(N);
8929   SDValue LHS = N->getOperand(0);
8930   SDValue RHS = N->getOperand(1);
8931 
  // These should really be instruction patterns, but writing patterns with
  // source modifiers is a pain.
8934 
8935   // fadd (fadd (a, a), b) -> mad 2.0, a, b
8936   if (LHS.getOpcode() == ISD::FADD) {
8937     SDValue A = LHS.getOperand(0);
8938     if (A == LHS.getOperand(1)) {
8939       unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
8940       if (FusedOp != 0) {
8941         const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
8942         return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
8943       }
8944     }
8945   }
8946 
8947   // fadd (b, fadd (a, a)) -> mad 2.0, a, b
8948   if (RHS.getOpcode() == ISD::FADD) {
8949     SDValue A = RHS.getOperand(0);
8950     if (A == RHS.getOperand(1)) {
8951       unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
8952       if (FusedOp != 0) {
8953         const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
8954         return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
8955       }
8956     }
8957   }
8958 
8959   return SDValue();
8960 }
8961 
8962 SDValue SITargetLowering::performFSubCombine(SDNode *N,
8963                                              DAGCombinerInfo &DCI) const {
8964   if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
8965     return SDValue();
8966 
8967   SelectionDAG &DAG = DCI.DAG;
8968   SDLoc SL(N);
8969   EVT VT = N->getValueType(0);
8970   assert(!VT.isVector());
8971 
8972   // Try to get the fneg to fold into the source modifier. This undoes generic
8973   // DAG combines and folds them into the mad.
8974   //
8975   // Only do this if we are not trying to support denormals. v_mad_f32 does
8976   // not support denormals ever.
8977   SDValue LHS = N->getOperand(0);
8978   SDValue RHS = N->getOperand(1);
8979   if (LHS.getOpcode() == ISD::FADD) {
8980     // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
8981     SDValue A = LHS.getOperand(0);
8982     if (A == LHS.getOperand(1)) {
8983       unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
8984       if (FusedOp != 0){
8985         const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
8986         SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
8987 
8988         return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
8989       }
8990     }
8991   }
8992 
8993   if (RHS.getOpcode() == ISD::FADD) {
8994     // (fsub c, (fadd a, a)) -> mad -2.0, a, c
8995 
8996     SDValue A = RHS.getOperand(0);
8997     if (A == RHS.getOperand(1)) {
8998       unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
8999       if (FusedOp != 0){
9000         const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
9001         return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
9002       }
9003     }
9004   }
9005 
9006   return SDValue();
9007 }
9008 
9009 SDValue SITargetLowering::performFMACombine(SDNode *N,
9010                                             DAGCombinerInfo &DCI) const {
9011   SelectionDAG &DAG = DCI.DAG;
9012   EVT VT = N->getValueType(0);
9013   SDLoc SL(N);
9014 
9015   if (!Subtarget->hasDot2Insts() || VT != MVT::f32)
9016     return SDValue();
9017 
  // FMA((F32)S0.x, (F32)S1.x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
  //   FDOT2((V2F16)S0, (V2F16)S1, (F32)z)
9020   SDValue Op1 = N->getOperand(0);
9021   SDValue Op2 = N->getOperand(1);
9022   SDValue FMA = N->getOperand(2);
9023 
9024   if (FMA.getOpcode() != ISD::FMA ||
9025       Op1.getOpcode() != ISD::FP_EXTEND ||
9026       Op2.getOpcode() != ISD::FP_EXTEND)
9027     return SDValue();
9028 
  // fdot2_f32_f16 always flushes fp32 denormal operands and the output to zero,
  // regardless of the denorm mode setting. Therefore, unsafe-fp-math/fp-contract
  // is sufficient to allow generating fdot2.
9032   const TargetOptions &Options = DAG.getTarget().Options;
9033   if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
9034       (N->getFlags().hasAllowContract() &&
9035        FMA->getFlags().hasAllowContract())) {
9036     Op1 = Op1.getOperand(0);
9037     Op2 = Op2.getOperand(0);
9038     if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9039         Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9040       return SDValue();
9041 
9042     SDValue Vec1 = Op1.getOperand(0);
9043     SDValue Idx1 = Op1.getOperand(1);
9044     SDValue Vec2 = Op2.getOperand(0);
9045 
9046     SDValue FMAOp1 = FMA.getOperand(0);
9047     SDValue FMAOp2 = FMA.getOperand(1);
9048     SDValue FMAAcc = FMA.getOperand(2);
9049 
9050     if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
9051         FMAOp2.getOpcode() != ISD::FP_EXTEND)
9052       return SDValue();
9053 
9054     FMAOp1 = FMAOp1.getOperand(0);
9055     FMAOp2 = FMAOp2.getOperand(0);
9056     if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9057         FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
9058       return SDValue();
9059 
9060     SDValue Vec3 = FMAOp1.getOperand(0);
9061     SDValue Vec4 = FMAOp2.getOperand(0);
9062     SDValue Idx2 = FMAOp1.getOperand(1);
9063 
9064     if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
9065         // Idx1 and Idx2 cannot be the same.
9066         Idx1 == Idx2)
9067       return SDValue();
9068 
9069     if (Vec1 == Vec2 || Vec3 == Vec4)
9070       return SDValue();
9071 
9072     if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
9073       return SDValue();
9074 
9075     if ((Vec1 == Vec3 && Vec2 == Vec4) ||
9076         (Vec1 == Vec4 && Vec2 == Vec3)) {
9077       return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
9078                          DAG.getTargetConstant(0, SL, MVT::i1));
9079     }
9080   }
9081   return SDValue();
9082 }
9083 
9084 SDValue SITargetLowering::performSetCCCombine(SDNode *N,
9085                                               DAGCombinerInfo &DCI) const {
9086   SelectionDAG &DAG = DCI.DAG;
9087   SDLoc SL(N);
9088 
9089   SDValue LHS = N->getOperand(0);
9090   SDValue RHS = N->getOperand(1);
9091   EVT VT = LHS.getValueType();
9092   ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
9093 
9094   auto CRHS = dyn_cast<ConstantSDNode>(RHS);
9095   if (!CRHS) {
9096     CRHS = dyn_cast<ConstantSDNode>(LHS);
9097     if (CRHS) {
9098       std::swap(LHS, RHS);
9099       CC = getSetCCSwappedOperands(CC);
9100     }
9101   }
9102 
9103   if (CRHS) {
9104     if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
9105         isBoolSGPR(LHS.getOperand(0))) {
9106       // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
9107       // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
9108       // setcc (sext from i1 cc),  0, eq|sge|ule) => not cc => xor cc, -1
9109       // setcc (sext from i1 cc),  0, ne|ugt|slt) => cc
9110       if ((CRHS->isAllOnesValue() &&
9111            (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
9112           (CRHS->isNullValue() &&
9113            (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
9114         return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
9115                            DAG.getConstant(-1, SL, MVT::i1));
9116       if ((CRHS->isAllOnesValue() &&
9117            (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
9118           (CRHS->isNullValue() &&
9119            (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
9120         return LHS.getOperand(0);
9121     }
9122 
9123     uint64_t CRHSVal = CRHS->getZExtValue();
9124     if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
9125         LHS.getOpcode() == ISD::SELECT &&
9126         isa<ConstantSDNode>(LHS.getOperand(1)) &&
9127         isa<ConstantSDNode>(LHS.getOperand(2)) &&
9128         LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
9129         isBoolSGPR(LHS.getOperand(0))) {
9130       // Given CT != FT:
9131       // setcc (select cc, CT, CF), CF, eq => xor cc, -1
9132       // setcc (select cc, CT, CF), CF, ne => cc
9133       // setcc (select cc, CT, CF), CT, ne => xor cc, -1
9134       // setcc (select cc, CT, CF), CT, eq => cc
9135       uint64_t CT = LHS.getConstantOperandVal(1);
9136       uint64_t CF = LHS.getConstantOperandVal(2);
9137 
9138       if ((CF == CRHSVal && CC == ISD::SETEQ) ||
9139           (CT == CRHSVal && CC == ISD::SETNE))
9140         return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
9141                            DAG.getConstant(-1, SL, MVT::i1));
9142       if ((CF == CRHSVal && CC == ISD::SETNE) ||
9143           (CT == CRHSVal && CC == ISD::SETEQ))
9144         return LHS.getOperand(0);
9145     }
9146   }
9147 
9148   if (VT != MVT::f32 && VT != MVT::f64 && (Subtarget->has16BitInsts() &&
9149                                            VT != MVT::f16))
9150     return SDValue();
9151 
9152   // Match isinf/isfinite pattern
9153   // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
  // (fcmp one (fabs x), inf) -> (fp_class x,
  //   (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero))
9156   if ((CC == ISD::SETOEQ || CC == ISD::SETONE) && LHS.getOpcode() == ISD::FABS) {
9157     const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
9158     if (!CRHS)
9159       return SDValue();
9160 
9161     const APFloat &APF = CRHS->getValueAPF();
9162     if (APF.isInfinity() && !APF.isNegative()) {
9163       const unsigned IsInfMask = SIInstrFlags::P_INFINITY |
9164                                  SIInstrFlags::N_INFINITY;
9165       const unsigned IsFiniteMask = SIInstrFlags::N_ZERO |
9166                                     SIInstrFlags::P_ZERO |
9167                                     SIInstrFlags::N_NORMAL |
9168                                     SIInstrFlags::P_NORMAL |
9169                                     SIInstrFlags::N_SUBNORMAL |
9170                                     SIInstrFlags::P_SUBNORMAL;
9171       unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
9172       return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
9173                          DAG.getConstant(Mask, SL, MVT::i32));
9174     }
9175   }
9176 
9177   return SDValue();
9178 }
9179 
9180 SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
9181                                                      DAGCombinerInfo &DCI) const {
9182   SelectionDAG &DAG = DCI.DAG;
9183   SDLoc SL(N);
9184   unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
9185 
9186   SDValue Src = N->getOperand(0);
9187   SDValue Srl = N->getOperand(0);
9188   if (Srl.getOpcode() == ISD::ZERO_EXTEND)
9189     Srl = Srl.getOperand(0);
9190 
9191   // TODO: Handle (or x, (srl y, 8)) pattern when known bits are zero.
9192   if (Srl.getOpcode() == ISD::SRL) {
9193     // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
9194     // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
9195     // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
9196 
9197     if (const ConstantSDNode *C =
9198         dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
9199       Srl = DAG.getZExtOrTrunc(Srl.getOperand(0), SDLoc(Srl.getOperand(0)),
9200                                EVT(MVT::i32));
9201 
9202       unsigned SrcOffset = C->getZExtValue() + 8 * Offset;
9203       if (SrcOffset < 32 && SrcOffset % 8 == 0) {
9204         return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + SrcOffset / 8, SL,
9205                            MVT::f32, Srl);
9206       }
9207     }
9208   }
9209 
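  // Only the byte selected by this cvt_f32_ubyteN is demanded from the source;
  // try to simplify the other source bits away.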
9210   APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
9211 
9212   KnownBits Known;
9213   TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
9214                                         !DCI.isBeforeLegalizeOps());
9215   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
9216   if (TLI.SimplifyDemandedBits(Src, Demanded, Known, TLO)) {
9217     DCI.CommitTargetLoweringOpt(TLO);
9218   }
9219 
9220   return SDValue();
9221 }
9222 
9223 SDValue SITargetLowering::performClampCombine(SDNode *N,
9224                                               DAGCombinerInfo &DCI) const {
9225   ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
9226   if (!CSrc)
9227     return SDValue();
9228 
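  // Constant fold the clamp: fold to 0.0 if the source is below 0.0 (or is a
  // NaN with DX10 clamping), to 1.0 if it is above 1.0, and otherwise leave the
  // constant unchanged.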
9229   const MachineFunction &MF = DCI.DAG.getMachineFunction();
9230   const APFloat &F = CSrc->getValueAPF();
9231   APFloat Zero = APFloat::getZero(F.getSemantics());
9232   APFloat::cmpResult Cmp0 = F.compare(Zero);
9233   if (Cmp0 == APFloat::cmpLessThan ||
9234       (Cmp0 == APFloat::cmpUnordered &&
9235        MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
9236     return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
9237   }
9238 
9239   APFloat One(F.getSemantics(), "1.0");
9240   APFloat::cmpResult Cmp1 = F.compare(One);
9241   if (Cmp1 == APFloat::cmpGreaterThan)
9242     return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
9243 
9244   return SDValue(CSrc, 0);
9245 }
9246 
9247 
9248 SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
9249                                             DAGCombinerInfo &DCI) const {
9250   if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
9251     return SDValue();
9252   switch (N->getOpcode()) {
9253   default:
9254     return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
9255   case ISD::ADD:
9256     return performAddCombine(N, DCI);
9257   case ISD::SUB:
9258     return performSubCombine(N, DCI);
9259   case ISD::ADDCARRY:
9260   case ISD::SUBCARRY:
9261     return performAddCarrySubCarryCombine(N, DCI);
9262   case ISD::FADD:
9263     return performFAddCombine(N, DCI);
9264   case ISD::FSUB:
9265     return performFSubCombine(N, DCI);
9266   case ISD::SETCC:
9267     return performSetCCCombine(N, DCI);
9268   case ISD::FMAXNUM:
9269   case ISD::FMINNUM:
9270   case ISD::FMAXNUM_IEEE:
9271   case ISD::FMINNUM_IEEE:
9272   case ISD::SMAX:
9273   case ISD::SMIN:
9274   case ISD::UMAX:
9275   case ISD::UMIN:
9276   case AMDGPUISD::FMIN_LEGACY:
9277   case AMDGPUISD::FMAX_LEGACY:
9278     return performMinMaxCombine(N, DCI);
9279   case ISD::FMA:
9280     return performFMACombine(N, DCI);
9281   case ISD::LOAD: {
    if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
      return Widened;
9284     LLVM_FALLTHROUGH;
9285   }
9286   case ISD::STORE:
9287   case ISD::ATOMIC_LOAD:
9288   case ISD::ATOMIC_STORE:
9289   case ISD::ATOMIC_CMP_SWAP:
9290   case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
9291   case ISD::ATOMIC_SWAP:
9292   case ISD::ATOMIC_LOAD_ADD:
9293   case ISD::ATOMIC_LOAD_SUB:
9294   case ISD::ATOMIC_LOAD_AND:
9295   case ISD::ATOMIC_LOAD_OR:
9296   case ISD::ATOMIC_LOAD_XOR:
9297   case ISD::ATOMIC_LOAD_NAND:
9298   case ISD::ATOMIC_LOAD_MIN:
9299   case ISD::ATOMIC_LOAD_MAX:
9300   case ISD::ATOMIC_LOAD_UMIN:
9301   case ISD::ATOMIC_LOAD_UMAX:
9302   case ISD::ATOMIC_LOAD_FADD:
9303   case AMDGPUISD::ATOMIC_INC:
9304   case AMDGPUISD::ATOMIC_DEC:
9305   case AMDGPUISD::ATOMIC_LOAD_FMIN:
9306   case AMDGPUISD::ATOMIC_LOAD_FMAX: // TODO: Target mem intrinsics.
9307     if (DCI.isBeforeLegalize())
9308       break;
9309     return performMemSDNodeCombine(cast<MemSDNode>(N), DCI);
9310   case ISD::AND:
9311     return performAndCombine(N, DCI);
9312   case ISD::OR:
9313     return performOrCombine(N, DCI);
9314   case ISD::XOR:
9315     return performXorCombine(N, DCI);
9316   case ISD::ZERO_EXTEND:
9317     return performZeroExtendCombine(N, DCI);
9318   case ISD::SIGN_EXTEND_INREG:
9319     return performSignExtendInRegCombine(N , DCI);
9320   case AMDGPUISD::FP_CLASS:
9321     return performClassCombine(N, DCI);
9322   case ISD::FCANONICALIZE:
9323     return performFCanonicalizeCombine(N, DCI);
9324   case AMDGPUISD::RCP:
9325     return performRcpCombine(N, DCI);
9326   case AMDGPUISD::FRACT:
9327   case AMDGPUISD::RSQ:
9328   case AMDGPUISD::RCP_LEGACY:
9329   case AMDGPUISD::RSQ_LEGACY:
9330   case AMDGPUISD::RCP_IFLAG:
9331   case AMDGPUISD::RSQ_CLAMP:
9332   case AMDGPUISD::LDEXP: {
9333     SDValue Src = N->getOperand(0);
9334     if (Src.isUndef())
9335       return Src;
9336     break;
9337   }
9338   case ISD::SINT_TO_FP:
9339   case ISD::UINT_TO_FP:
9340     return performUCharToFloatCombine(N, DCI);
9341   case AMDGPUISD::CVT_F32_UBYTE0:
9342   case AMDGPUISD::CVT_F32_UBYTE1:
9343   case AMDGPUISD::CVT_F32_UBYTE2:
9344   case AMDGPUISD::CVT_F32_UBYTE3:
9345     return performCvtF32UByteNCombine(N, DCI);
9346   case AMDGPUISD::FMED3:
9347     return performFMed3Combine(N, DCI);
9348   case AMDGPUISD::CVT_PKRTZ_F16_F32:
9349     return performCvtPkRTZCombine(N, DCI);
9350   case AMDGPUISD::CLAMP:
9351     return performClampCombine(N, DCI);
9352   case ISD::SCALAR_TO_VECTOR: {
9353     SelectionDAG &DAG = DCI.DAG;
9354     EVT VT = N->getValueType(0);
9355 
9356     // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
9357     if (VT == MVT::v2i16 || VT == MVT::v2f16) {
9358       SDLoc SL(N);
9359       SDValue Src = N->getOperand(0);
9360       EVT EltVT = Src.getValueType();
9361       if (EltVT == MVT::f16)
9362         Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
9363 
9364       SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
9365       return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
9366     }
9367 
9368     break;
9369   }
9370   case ISD::EXTRACT_VECTOR_ELT:
9371     return performExtractVectorEltCombine(N, DCI);
9372   case ISD::INSERT_VECTOR_ELT:
9373     return performInsertVectorEltCombine(N, DCI);
9374   }
9375   return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
9376 }
9377 
9378 /// Helper function for adjustWritemask
9379 static unsigned SubIdx2Lane(unsigned Idx) {
9380   switch (Idx) {
9381   default: return 0;
9382   case AMDGPU::sub0: return 0;
9383   case AMDGPU::sub1: return 1;
9384   case AMDGPU::sub2: return 2;
9385   case AMDGPU::sub3: return 3;
9386   case AMDGPU::sub4: return 4; // Possible with TFE/LWE
9387   }
9388 }
9389 
9390 /// Adjust the writemask of MIMG instructions
9391 SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
9392                                           SelectionDAG &DAG) const {
9393   unsigned Opcode = Node->getMachineOpcode();
9394 
9395   // Subtract 1 because the vdata output is not a MachineSDNode operand.
9396   int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
9397   if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
9398     return Node; // not implemented for D16
9399 
9400   SDNode *Users[5] = { nullptr };
9401   unsigned Lane = 0;
9402   unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
9403   unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
9404   unsigned NewDmask = 0;
9405   unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
9406   unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
  bool UsesTFC = Node->getConstantOperandVal(TFEIdx) ||
                 Node->getConstantOperandVal(LWEIdx);
9409   unsigned TFCLane = 0;
9410   bool HasChain = Node->getNumValues() > 1;
9411 
9412   if (OldDmask == 0) {
9413     // These are folded out, but on the chance it happens don't assert.
9414     return Node;
9415   }
9416 
9417   unsigned OldBitsSet = countPopulation(OldDmask);
9418   // Work out which is the TFE/LWE lane if that is enabled.
9419   if (UsesTFC) {
9420     TFCLane = OldBitsSet;
9421   }
9422 
9423   // Try to figure out the used register components
9424   for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
9425        I != E; ++I) {
9426 
9427     // Don't look at users of the chain.
9428     if (I.getUse().getResNo() != 0)
9429       continue;
9430 
9431     // Abort if we can't understand the usage
9432     if (!I->isMachineOpcode() ||
9433         I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
9434       return Node;
9435 
9436     // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
9437     // Note that subregs are packed, i.e. Lane==0 is the first bit set
9438     // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
9439     // set, etc.
9440     Lane = SubIdx2Lane(I->getConstantOperandVal(1));
9441 
9442     // Check if the use is for the TFE/LWE generated result at VGPRn+1.
9443     if (UsesTFC && Lane == TFCLane) {
9444       Users[Lane] = *I;
9445     } else {
9446       // Set which texture component corresponds to the lane.
9447       unsigned Comp;
9448       for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
9449         Comp = countTrailingZeros(Dmask);
9450         Dmask &= ~(1 << Comp);
9451       }
9452 
9453       // Abort if we have more than one user per component.
9454       if (Users[Lane])
9455         return Node;
9456 
9457       Users[Lane] = *I;
9458       NewDmask |= 1 << Comp;
9459     }
9460   }
9461 
9462   // Don't allow 0 dmask, as hardware assumes one channel enabled.
9463   bool NoChannels = !NewDmask;
9464   if (NoChannels) {
9465     if (!UsesTFC) {
      // No uses of the result and not using TFC, so do nothing.
9467       return Node;
9468     }
9469     // If the original dmask has one channel - then nothing to do
9470     if (OldBitsSet == 1)
9471       return Node;
9472     // Use an arbitrary dmask - required for the instruction to work
9473     NewDmask = 1;
9474   }
9475   // Abort if there's no change
9476   if (NewDmask == OldDmask)
9477     return Node;
9478 
9479   unsigned BitsSet = countPopulation(NewDmask);
9480 
9481   // Check for TFE or LWE - increase the number of channels by one to account
9482   // for the extra return value
  // This will need adjustment for D16 if this is also included in
  // adjustWritemask (this function) but at present D16 is excluded.
9485   unsigned NewChannels = BitsSet + UsesTFC;
9486 
9487   int NewOpcode =
9488       AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
9489   assert(NewOpcode != -1 &&
9490          NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
9491          "failed to find equivalent MIMG op");
9492 
9493   // Adjust the writemask in the node
9494   SmallVector<SDValue, 12> Ops;
9495   Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
9496   Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
9497   Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
9498 
9499   MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
9500 
9501   MVT ResultVT = NewChannels == 1 ?
9502     SVT : MVT::getVectorVT(SVT, NewChannels == 3 ? 4 :
9503                            NewChannels == 5 ? 8 : NewChannels);
9504   SDVTList NewVTList = HasChain ?
9505     DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
9506 
9507 
9508   MachineSDNode *NewNode = DAG.getMachineNode(NewOpcode, SDLoc(Node),
9509                                               NewVTList, Ops);
9510 
9511   if (HasChain) {
9512     // Update chain.
9513     DAG.setNodeMemRefs(NewNode, Node->memoperands());
9514     DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
9515   }
9516 
9517   if (NewChannels == 1) {
9518     assert(Node->hasNUsesOfValue(1, 0));
9519     SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY,
9520                                       SDLoc(Node), Users[Lane]->getValueType(0),
9521                                       SDValue(NewNode, 0));
9522     DAG.ReplaceAllUsesWith(Users[Lane], Copy);
9523     return nullptr;
9524   }
9525 
9526   // Update the users of the node with the new indices
9527   for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
9528     SDNode *User = Users[i];
9529     if (!User) {
9530       // Handle the special case of NoChannels. We set NewDmask to 1 above, but
9531       // Users[0] is still nullptr because channel 0 doesn't really have a use.
9532       if (i || !NoChannels)
9533         continue;
9534     } else {
9535       SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
9536       DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
9537     }
9538 
9539     switch (Idx) {
9540     default: break;
9541     case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
9542     case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
9543     case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
9544     case AMDGPU::sub3: Idx = AMDGPU::sub4; break;
9545     }
9546   }
9547 
9548   DAG.RemoveDeadNode(Node);
9549   return nullptr;
9550 }
9551 
9552 static bool isFrameIndexOp(SDValue Op) {
9553   if (Op.getOpcode() == ISD::AssertZext)
9554     Op = Op.getOperand(0);
9555 
9556   return isa<FrameIndexSDNode>(Op);
9557 }
9558 
9559 /// Legalize target independent instructions (e.g. INSERT_SUBREG)
9560 /// with frame index operands.
/// LLVM assumes that inputs to these instructions are registers.
9562 SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
9563                                                         SelectionDAG &DAG) const {
9564   if (Node->getOpcode() == ISD::CopyToReg) {
9565     RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
9566     SDValue SrcVal = Node->getOperand(2);
9567 
9568     // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
9569     // to try understanding copies to physical registers.
9570     if (SrcVal.getValueType() == MVT::i1 &&
9571         TargetRegisterInfo::isPhysicalRegister(DestReg->getReg())) {
9572       SDLoc SL(Node);
9573       MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
9574       SDValue VReg = DAG.getRegister(
9575         MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
9576 
9577       SDNode *Glued = Node->getGluedNode();
9578       SDValue ToVReg
9579         = DAG.getCopyToReg(Node->getOperand(0), SL, VReg, SrcVal,
9580                          SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
9581       SDValue ToResultReg
9582         = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
9583                            VReg, ToVReg.getValue(1));
9584       DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
9585       DAG.RemoveDeadNode(Node);
9586       return ToResultReg.getNode();
9587     }
9588   }
9589 
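       // Replace each frame index operand with the result of an S_MOV_B32 of that
       // frame index, so this target-independent node only sees register operands.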
9590   SmallVector<SDValue, 8> Ops;
9591   for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
9592     if (!isFrameIndexOp(Node->getOperand(i))) {
9593       Ops.push_back(Node->getOperand(i));
9594       continue;
9595     }
9596 
9597     SDLoc DL(Node);
9598     Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
9599                                      Node->getOperand(i).getValueType(),
9600                                      Node->getOperand(i)), 0));
9601   }
9602 
9603   return DAG.UpdateNodeOperands(Node, Ops);
9604 }
9605 
9606 /// Fold the instructions after selecting them.
9607 /// Returns null if users were already updated.
9608 SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
9609                                           SelectionDAG &DAG) const {
9610   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
9611   unsigned Opcode = Node->getMachineOpcode();
9612 
9613   if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() &&
9614       !TII->isGather4(Opcode)) {
9615     return adjustWritemask(Node, DAG);
9616   }
9617 
9618   if (Opcode == AMDGPU::INSERT_SUBREG ||
9619       Opcode == AMDGPU::REG_SEQUENCE) {
9620     legalizeTargetIndependentNode(Node, DAG);
9621     return Node;
9622   }
9623 
9624   switch (Opcode) {
9625   case AMDGPU::V_DIV_SCALE_F32:
9626   case AMDGPU::V_DIV_SCALE_F64: {
9627     // Satisfy the operand register constraint when one of the inputs is
9628     // undefined. Ordinarily each undef value will have its own implicit_def of
9629     // a vreg, so force these to use a single register.
9630     SDValue Src0 = Node->getOperand(0);
9631     SDValue Src1 = Node->getOperand(1);
9632     SDValue Src2 = Node->getOperand(2);
9633 
9634     if ((Src0.isMachineOpcode() &&
9635          Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
9636         (Src0 == Src1 || Src0 == Src2))
9637       break;
9638 
9639     MVT VT = Src0.getValueType().getSimpleVT();
9640     const TargetRegisterClass *RC =
9641         getRegClassFor(VT, Src0.getNode()->isDivergent());
9642 
9643     MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
9644     SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
9645 
9646     SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node),
9647                                       UndefReg, Src0, SDValue());
9648 
9649     // src0 must be the same register as src1 or src2, even if the value is
9650     // undefined, so make sure we don't violate this constraint.
9651     if (Src0.isMachineOpcode() &&
9652         Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
9653       if (Src1.isMachineOpcode() &&
9654           Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
9655         Src0 = Src1;
9656       else if (Src2.isMachineOpcode() &&
9657                Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
9658         Src0 = Src2;
9659       else {
9660         assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
9661         Src0 = UndefReg;
9662         Src1 = UndefReg;
9663       }
9664     } else
9665       break;
9666 
9667     SmallVector<SDValue, 4> Ops = { Src0, Src1, Src2 };
9668     for (unsigned I = 3, N = Node->getNumOperands(); I != N; ++I)
9669       Ops.push_back(Node->getOperand(I));
9670 
9671     Ops.push_back(ImpDef.getValue(1));
9672     return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
9673   }
9674   default:
9675     break;
9676   }
9677 
9678   return Node;
9679 }
9680 
9681 /// Fix up instructions after selection: legalize VOP3 operands for the
9682 /// constant bus and replace unused atomics with their no-return variants.
9683 void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
9684                                                      SDNode *Node) const {
9685   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
9686 
9687   MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
9688 
9689   if (TII->isVOP3(MI.getOpcode())) {
9690     // Make sure constant bus requirements are respected.
9691     TII->legalizeOperandsVOP3(MRI, MI);
9692     return;
9693   }
9694 
9695   // Replace unused atomics with the no return version.
9696   int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI.getOpcode());
9697   if (NoRetAtomicOp != -1) {
9698     if (!Node->hasAnyUseOfValue(0)) {
9699       MI.setDesc(TII->get(NoRetAtomicOp));
9700       MI.RemoveOperand(0);
9701       return;
9702     }
9703 
9704     // For mubuf_atomic_cmpswap, we need to have tablegen use an extract_subreg
9705     // instruction, because the return type of these instructions is a vec2 of
9706     // the memory type, so it can be tied to the input operand.
9707     // This means these instructions always have a use, so we need to add a
9708     // special case to check if the atomic has only one extract_subreg use,
9709     // which itself has no uses.
9710     if ((Node->hasNUsesOfValue(1, 0) &&
9711          Node->use_begin()->isMachineOpcode() &&
9712          Node->use_begin()->getMachineOpcode() == AMDGPU::EXTRACT_SUBREG &&
9713          !Node->use_begin()->hasAnyUseOfValue(0))) {
9714       unsigned Def = MI.getOperand(0).getReg();
9715 
9716       // Change this into a noret atomic.
9717       MI.setDesc(TII->get(NoRetAtomicOp));
9718       MI.RemoveOperand(0);
9719 
9720       // If we only remove the def operand from the atomic instruction, the
9721       // extract_subreg will be left with a use of a vreg without a def.
9722       // So we need to insert an implicit_def to avoid machine verifier
9723       // errors.
9724       BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
9725               TII->get(AMDGPU::IMPLICIT_DEF), Def);
9726     }
9727     return;
9728   }
9729 }
9730 
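     // Helper to materialize a 32-bit immediate into an SGPR with S_MOV_B32 and
     // return it as an SDValue.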
9731 static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
9732                               uint64_t Val) {
9733   SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
9734   return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
9735 }
9736 
9737 MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
9738                                                 const SDLoc &DL,
9739                                                 SDValue Ptr) const {
9740   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
9741 
9742   // Build the constant half of the descriptor as its own 64-bit subregister
9743   // before building the full 128-bit register. If we are building multiple
9744   // resource descriptors, this allows CSEing of the 2-component register.
9745   const SDValue Ops0[] = {
9746     DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
9747     buildSMovImm32(DAG, DL, 0),
9748     DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
9749     buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
9750     DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
9751   };
9752 
9753   SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
9754                                                 MVT::v2i32, Ops0), 0);
9755 
9756   // Combine the constants and the pointer.
9757   const SDValue Ops1[] = {
9758     DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
9759     Ptr,
9760     DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
9761     SubRegHi,
9762     DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
9763   };
9764 
9765   return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
9766 }
9767 
9768 /// Return a resource descriptor with the 'Add TID' bit enabled.
9769 /// The TID (Thread ID) is multiplied by the stride value (bits [61:48] of the
9770 /// resource descriptor) to create an offset, which is added to the resource
9771 /// pointer.
9772 MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
9773                                            SDValue Ptr, uint32_t RsrcDword1,
9774                                            uint64_t RsrcDword2And3) const {
9775   SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
9776   SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
9777   if (RsrcDword1) {
9778     PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
9779                                      DAG.getConstant(RsrcDword1, DL, MVT::i32)),
9780                     0);
9781   }
9782 
9783   SDValue DataLo = buildSMovImm32(DAG, DL,
9784                                   RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
9785   SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
9786 
9787   const SDValue Ops[] = {
9788     DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
9789     PtrLo,
9790     DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
9791     PtrHi,
9792     DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
9793     DataLo,
9794     DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
9795     DataHi,
9796     DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)
9797   };
9798 
9799   return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
9800 }
9801 
9802 //===----------------------------------------------------------------------===//
9803 //                         SI Inline Assembly Support
9804 //===----------------------------------------------------------------------===//
9805 
9806 std::pair<unsigned, const TargetRegisterClass *>
9807 SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
9808                                                StringRef Constraint,
9809                                                MVT VT) const {
9810   const TargetRegisterClass *RC = nullptr;
9811   if (Constraint.size() == 1) {
9812     switch (Constraint[0]) {
9813     default:
9814       return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
9815     case 's':
9816     case 'r':
9817       switch (VT.getSizeInBits()) {
9818       default:
9819         return std::make_pair(0U, nullptr);
9820       case 32:
9821       case 16:
9822         RC = &AMDGPU::SReg_32_XM0RegClass;
9823         break;
9824       case 64:
9825         RC = &AMDGPU::SGPR_64RegClass;
9826         break;
9827       case 96:
9828         RC = &AMDGPU::SReg_96RegClass;
9829         break;
9830       case 128:
9831         RC = &AMDGPU::SReg_128RegClass;
9832         break;
9833       case 160:
9834         RC = &AMDGPU::SReg_160RegClass;
9835         break;
9836       case 256:
9837         RC = &AMDGPU::SReg_256RegClass;
9838         break;
9839       case 512:
9840         RC = &AMDGPU::SReg_512RegClass;
9841         break;
9842       }
9843       break;
9844     case 'v':
9845       switch (VT.getSizeInBits()) {
9846       default:
9847         return std::make_pair(0U, nullptr);
9848       case 32:
9849       case 16:
9850         RC = &AMDGPU::VGPR_32RegClass;
9851         break;
9852       case 64:
9853         RC = &AMDGPU::VReg_64RegClass;
9854         break;
9855       case 96:
9856         RC = &AMDGPU::VReg_96RegClass;
9857         break;
9858       case 128:
9859         RC = &AMDGPU::VReg_128RegClass;
9860         break;
9861       case 160:
9862         RC = &AMDGPU::VReg_160RegClass;
9863         break;
9864       case 256:
9865         RC = &AMDGPU::VReg_256RegClass;
9866         break;
9867       case 512:
9868         RC = &AMDGPU::VReg_512RegClass;
9869         break;
9870       }
9871       break;
9872     }
9873     // We actually support i128, i16 and f16 as inline parameters even if they
9874     // are not reported as legal.
9875     if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
9876                VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
9877       return std::make_pair(0U, RC);
9878   }
9879 
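       // Multi-character constraints are treated as a reference to a single VGPR
       // or SGPR by index; anything that does not parse falls through to the
       // generic handling below.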
9880   if (Constraint.size() > 1) {
9881     if (Constraint[1] == 'v') {
9882       RC = &AMDGPU::VGPR_32RegClass;
9883     } else if (Constraint[1] == 's') {
9884       RC = &AMDGPU::SGPR_32RegClass;
9885     }
9886 
9887     if (RC) {
9888       uint32_t Idx;
9889       bool Failed = Constraint.substr(2).getAsInteger(10, Idx);
9890       if (!Failed && Idx < RC->getNumRegs())
9891         return std::make_pair(RC->getRegister(Idx), RC);
9892     }
9893   }
9894   return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
9895 }
9896 
9897 SITargetLowering::ConstraintType
9898 SITargetLowering::getConstraintType(StringRef Constraint) const {
9899   if (Constraint.size() == 1) {
9900     switch (Constraint[0]) {
9901     default: break;
9902     case 's':
9903     case 'v':
9904       return C_RegisterClass;
9905     }
9906   }
9907   return TargetLowering::getConstraintType(Constraint);
9908 }
9909 
9910 // Figure out which registers should be reserved for stack access. Only after
9911 // the function is legalized do we know all of the non-spill stack objects or if
9912 // calls are present.
9913 void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
9914   MachineRegisterInfo &MRI = MF.getRegInfo();
9915   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
9916   const MachineFrameInfo &MFI = MF.getFrameInfo();
9917   const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
9918 
9919   if (Info->isEntryFunction()) {
9920     // Callable functions use fixed registers for stack access, so only entry
         // functions need to reserve the private memory registers here.
9921     reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
9922   }
9923 
9924   // We have to assume the SP is needed in case there are calls in the function
9925   // during lowering. Calls are only detected after the function is
9926   // lowered. Since we are about to reserve registers, only set up the SP if it
9927   // will really be used.
9928   bool NeedSP = !Info->isEntryFunction() ||
9929     MFI.hasVarSizedObjects() ||
9930     MFI.hasCalls();
9931 
9932   if (NeedSP) {
9933     unsigned ReservedStackPtrOffsetReg = TRI->reservedStackPtrOffsetReg(MF);
9934     Info->setStackPtrOffsetReg(ReservedStackPtrOffsetReg);
9935 
9936     assert(Info->getStackPtrOffsetReg() != Info->getFrameOffsetReg());
9937     assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
9938                                Info->getStackPtrOffsetReg()));
9939     if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
9940       MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
9941   }
9942 
9943   // We need to worry about replacing the default register with itself in case
9944   // of MIR testcases missing the MFI.
9945   if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
9946     MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
9947 
9948   if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
9949     MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
9950 
9951   if (Info->getScratchWaveOffsetReg() != AMDGPU::SCRATCH_WAVE_OFFSET_REG) {
9952     MRI.replaceRegWith(AMDGPU::SCRATCH_WAVE_OFFSET_REG,
9953                        Info->getScratchWaveOffsetReg());
9954   }
9955 
9956   Info->limitOccupancy(MF);
9957 
9958   TargetLoweringBase::finalizeLowering(MF);
9959 }
9960 
9961 void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op,
9962                                                      KnownBits &Known,
9963                                                      const APInt &DemandedElts,
9964                                                      const SelectionDAG &DAG,
9965                                                      unsigned Depth) const {
9966   TargetLowering::computeKnownBitsForFrameIndex(Op, Known, DemandedElts,
9967                                                 DAG, Depth);
9968 
9969   // Set the high bits to zero based on the maximum allowed scratch size per
9970   // wave. We can't use vaddr in MUBUF instructions if we don't know the address
9971   // calculation won't overflow, so assume the sign bit is never set.
9972   Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
9973 }
9974 
9975 unsigned SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
9976   const unsigned PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
9977   const unsigned CacheLineAlign = 6; // log2(64)
9978 
9979   // Pre-GFX10 targets do not benefit from loop alignment.
9980   if (!ML || DisableLoopAlignment ||
9981       (getSubtarget()->getGeneration() < AMDGPUSubtarget::GFX10) ||
9982       getSubtarget()->hasInstFwdPrefetchBug())
9983     return PrefAlign;
9984 
9985   // On GFX10 the I$ has 4 x 64-byte cache lines.
9986   // By default the prefetcher keeps one cache line behind and reads two ahead.
9987   // We can modify this with S_INST_PREFETCH so that larger loops keep two
9988   // lines behind and one ahead.
9989   // Therefore aligning loop headers pays off when the loop fits in 192 bytes:
9990   // a loop within 64 bytes never spans more than two cache lines and needs no
9991   // alignment; a loop within 128 bytes needs no change to the prefetch
9992   // settings; a loop within 192 bytes needs two lines kept behind, which we
9993   // arrange with S_INST_PREFETCH around the loop.
9994 
9995   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
9996   const MachineBasicBlock *Header = ML->getHeader();
9997   if (Header->getAlignment() != PrefAlign)
9998     return Header->getAlignment(); // Already processed.
9999 
10000   unsigned LoopSize = 0;
10001   for (const MachineBasicBlock *MBB : ML->blocks()) {
10002     // If an inner loop block is aligned, assume that on average half of the
10003     // alignment size is added as nops.
10004     if (MBB != Header)
10005       LoopSize += (1 << MBB->getAlignment()) / 2;
10006 
10007     for (const MachineInstr &MI : *MBB) {
10008       LoopSize += TII->getInstSizeInBytes(MI);
10009       if (LoopSize > 192)
10010         return PrefAlign;
10011     }
10012   }
10013 
10014   if (LoopSize <= 64)
10015     return PrefAlign;
10016 
10017   if (LoopSize <= 128)
10018     return CacheLineAlign;
10019 
10020   // If any of the parent loops is surrounded by prefetch instructions, do not
10021   // insert new ones for the inner loop; that would reset the parent's settings.
10022   for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
10023     if (MachineBasicBlock *Exit = P->getExitBlock()) {
10024       auto I = Exit->getFirstNonDebugInstr();
10025       if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
10026         return CacheLineAlign;
10027     }
10028   }
10029 
10030   MachineBasicBlock *Pre = ML->getLoopPreheader();
10031   MachineBasicBlock *Exit = ML->getExitBlock();
10032 
10033   if (Pre && Exit) {
10034     BuildMI(*Pre, Pre->getFirstTerminator(), DebugLoc(),
10035             TII->get(AMDGPU::S_INST_PREFETCH))
10036       .addImm(1); // prefetch 2 lines behind PC
10037 
10038     BuildMI(*Exit, Exit->getFirstNonDebugInstr(), DebugLoc(),
10039             TII->get(AMDGPU::S_INST_PREFETCH))
10040       .addImm(2); // prefetch 1 line behind PC
10041   }
10042 
10043   return CacheLineAlign;
10044 }
10045 
10046 LLVM_ATTRIBUTE_UNUSED
10047 static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
10048   assert(N->getOpcode() == ISD::CopyFromReg);
10049   do {
10050     // Follow the chain until we find an INLINEASM node.
10051     N = N->getOperand(0).getNode();
10052     if (N->getOpcode() == ISD::INLINEASM ||
10053         N->getOpcode() == ISD::INLINEASM_BR)
10054       return true;
10055   } while (N->getOpcode() == ISD::CopyFromReg);
10056   return false;
10057 }
10058 
10059 bool SITargetLowering::isSDNodeSourceOfDivergence(
10060     const SDNode *N, FunctionLoweringInfo *FLI,
10061     LegacyDivergenceAnalysis *KDA) const {
10062   switch (N->getOpcode()) {
10063     case ISD::CopyFromReg: {
10065       const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
10066       const MachineFunction * MF = FLI->MF;
10067       const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
10068       const MachineRegisterInfo &MRI = MF->getRegInfo();
10069       const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo();
10070       unsigned Reg = R->getReg();
10071       if (TRI.isPhysicalRegister(Reg))
10072         return !TRI.isSGPRReg(MRI, Reg);
10073 
10074       if (MRI.isLiveIn(Reg)) {
10075         // workitem.id.x, workitem.id.y, workitem.id.z and any other VGPR
10076         // formal argument are considered divergent.
10077         if (!TRI.isSGPRReg(MRI, Reg))
10078           return true;
10079         // Formal arguments of non-entry functions
10080         // are conservatively considered divergent
10081         else if (!AMDGPU::isEntryFunctionCC(FLI->Fn->getCallingConv()))
10082           return true;
10083         return false;
10084       }
10085       const Value *V = FLI->getValueFromVirtualReg(Reg);
10086       if (V)
10087         return KDA->isDivergent(V);
10088       assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
10089       return !TRI.isSGPRReg(MRI, Reg);
10090     }
10092     case ISD::LOAD: {
10093       const LoadSDNode *L = cast<LoadSDNode>(N);
10094       unsigned AS = L->getAddressSpace();
10095       // A flat load may access private memory.
10096       return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
10097     }
10098     case ISD::CALLSEQ_END:
10099       return true;
10101     case ISD::INTRINSIC_WO_CHAIN:
10105       return AMDGPU::isIntrinsicSourceOfDivergence(
10106           cast<ConstantSDNode>(N->getOperand(0))->getZExtValue());
10107     case ISD::INTRINSIC_W_CHAIN:
10108       return AMDGPU::isIntrinsicSourceOfDivergence(
10109           cast<ConstantSDNode>(N->getOperand(1))->getZExtValue());
10110     // In some cases intrinsics that are a source of divergence have already
10111     // been lowered to AMDGPUISD nodes, so we need to check those as well.
10112     case AMDGPUISD::INTERP_MOV:
10113     case AMDGPUISD::INTERP_P1:
10114     case AMDGPUISD::INTERP_P2:
10115       return true;
10116   }
10117   return false;
10118 }
10119 
10120 bool SITargetLowering::denormalsEnabledForType(EVT VT) const {
10121   switch (VT.getScalarType().getSimpleVT().SimpleTy) {
10122   case MVT::f32:
10123     return Subtarget->hasFP32Denormals();
10124   case MVT::f64:
10125     return Subtarget->hasFP64Denormals();
10126   case MVT::f16:
10127     return Subtarget->hasFP16Denormals();
10128   default:
10129     return false;
10130   }
10131 }
10132 
10133 bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
10134                                                     const SelectionDAG &DAG,
10135                                                     bool SNaN,
10136                                                     unsigned Depth) const {
10137   if (Op.getOpcode() == AMDGPUISD::CLAMP) {
10138     const MachineFunction &MF = DAG.getMachineFunction();
10139     const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
10140 
10141     if (Info->getMode().DX10Clamp)
10142       return true; // Clamped to 0.
10143     return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
10144   }
10145 
10146   return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DAG,
10147                                                             SNaN, Depth);
10148 }
10149 
10150 TargetLowering::AtomicExpansionKind
10151 SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
10152   switch (RMW->getOperation()) {
10153   case AtomicRMWInst::FAdd: {
10154     Type *Ty = RMW->getType();
10155 
10156     // We don't have a way to support 16-bit atomics now, so just leave them
10157     // as-is.
10158     if (Ty->isHalfTy())
10159       return AtomicExpansionKind::None;
10160 
10161     if (!Ty->isFloatTy())
10162       return AtomicExpansionKind::CmpXChg;
10163 
10164     // TODO: We do have these for flat; older targets also had them for buffers.
10165     unsigned AS = RMW->getPointerAddressSpace();
10166     return (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomics()) ?
10167       AtomicExpansionKind::None : AtomicExpansionKind::CmpXChg;
10168   }
10169   default:
10170     break;
10171   }
10172 
10173   return AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(RMW);
10174 }
10175 
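      // Choose between the SGPR and VGPR forms of the default register class
      // based on divergence; uniform i1 values are kept in SReg_64 rather than
      // VReg_1.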
10176 const TargetRegisterClass *
10177 SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
10178   const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, false);
10179   const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
10180   if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
10181     return &AMDGPU::SReg_64RegClass;
10182   if (!TRI->isSGPRClass(RC) && !isDivergent)
10183     return TRI->getEquivalentSGPRClass(RC);
10184   else if (TRI->isSGPRClass(RC) && isDivergent)
10185     return TRI->getEquivalentVGPRClass(RC);
10186 
10187   return RC;
10188 }
10189 
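      // Return true if V, or a value transitively derived from it, is used as the
      // second operand of an llvm.amdgcn.if.break intrinsic.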
10190 static bool hasIfBreakUser(const Value *V, SetVector<const Value *> &Visited) {
10191   if (Visited.count(V))
10192     return false;
10193   Visited.insert(V);
10194   bool Result = false;
10195   for (auto U : V->users()) {
10196     if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
10197       if ((Intrinsic->getIntrinsicID() == Intrinsic::amdgcn_if_break) &&
10198           (V == U->getOperand(1)))
10199         Result = true;
10200     } else {
10201       Result = hasIfBreakUser(U, Visited);
10202     }
10203     if (Result)
10204       break;
10205   }
10206   return Result;
10207 }
10208 
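      // Values feeding the lane-mask machinery must stay uniform: results of
      // amdgcn.if.break, the mask result of amdgcn.if/else, inline-asm outputs
      // constrained to SGPRs, and anything consumed by an if.break.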
10209 bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
10210                                                const Value *V) const {
10211   if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
10212     switch (Intrinsic->getIntrinsicID()) {
10213     default:
10214       return false;
10215     case Intrinsic::amdgcn_if_break:
10216       return true;
10217     }
10218   }
10219   if (const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V)) {
10220     if (const IntrinsicInst *Intrinsic =
10221             dyn_cast<IntrinsicInst>(ExtValue->getOperand(0))) {
10222       switch (Intrinsic->getIntrinsicID()) {
10223       default:
10224         return false;
10225       case Intrinsic::amdgcn_if:
10226       case Intrinsic::amdgcn_else: {
10227         ArrayRef<unsigned> Indices = ExtValue->getIndices();
10228         if (Indices.size() == 1 && Indices[0] == 1) {
10229           return true;
10230         }
10231       }
10232       }
10233     }
10234   }
10235   if (const CallInst *CI = dyn_cast<CallInst>(V)) {
10236     if (isa<InlineAsm>(CI->getCalledValue())) {
10237       const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
10238       ImmutableCallSite CS(CI);
10239       TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints(
10240           MF.getDataLayout(), Subtarget->getRegisterInfo(), CS);
10241       for (auto &TC : TargetConstraints) {
10242         if (TC.Type == InlineAsm::isOutput) {
10243           ComputeConstraintToUse(TC, SDValue());
10244           unsigned AssignedReg;
10245           const TargetRegisterClass *RC;
10246           std::tie(AssignedReg, RC) = getRegForInlineAsmConstraint(
10247               SIRI, TC.ConstraintCode,
10248               getSimpleValueType(MF.getDataLayout(), CS.getType()));
10249           if (RC) {
10250             MachineRegisterInfo &MRI = MF.getRegInfo();
10251             if (AssignedReg != 0 && SIRI->isSGPRReg(MRI, AssignedReg))
10252               return true;
10253             else if (SIRI->isSGPRClass(RC))
10254               return true;
10255           }
10256         }
10257       }
10258     }
10259   }
10260   SetVector<const Value *> Visited;
10261   return hasIfBreakUser(V, Visited);
10262 }
10263