1 //===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Custom DAG lowering for SI
11 //
12 //===----------------------------------------------------------------------===//
13
14 #include "SIISelLowering.h"
15 #include "AMDGPU.h"
16 #include "AMDGPUInstrInfo.h"
17 #include "AMDGPUTargetMachine.h"
18 #include "SIMachineFunctionInfo.h"
19 #include "SIRegisterInfo.h"
20 #include "llvm/ADT/FloatingPointMode.h"
21 #include "llvm/ADT/Statistic.h"
22 #include "llvm/Analysis/LegacyDivergenceAnalysis.h"
23 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
24 #include "llvm/BinaryFormat/ELF.h"
25 #include "llvm/CodeGen/Analysis.h"
26 #include "llvm/CodeGen/FunctionLoweringInfo.h"
27 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
28 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
29 #include "llvm/CodeGen/MachineFrameInfo.h"
30 #include "llvm/CodeGen/MachineFunction.h"
31 #include "llvm/CodeGen/MachineLoopInfo.h"
32 #include "llvm/IR/DiagnosticInfo.h"
33 #include "llvm/IR/IntrinsicInst.h"
34 #include "llvm/IR/IntrinsicsAMDGPU.h"
35 #include "llvm/IR/IntrinsicsR600.h"
36 #include "llvm/Support/CommandLine.h"
37 #include "llvm/Support/KnownBits.h"
38
39 using namespace llvm;
40
41 #define DEBUG_TYPE "si-lower"
42
43 STATISTIC(NumTailCalls, "Number of tail calls");
44
45 static cl::opt<bool> DisableLoopAlignment(
46 "amdgpu-disable-loop-alignment",
47 cl::desc("Do not align and prefetch loops"),
48 cl::init(false));
49
50 static cl::opt<bool> UseDivergentRegisterIndexing(
51 "amdgpu-use-divergent-register-indexing",
52 cl::Hidden,
53 cl::desc("Use indirect register addressing for divergent indexes"),
54 cl::init(false));
55
56 static bool hasFP32Denormals(const MachineFunction &MF) {
57 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
58 return Info->getMode().allFP32Denormals();
59 }
60
61 static bool hasFP64FP16Denormals(const MachineFunction &MF) {
62 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
63 return Info->getMode().allFP64FP16Denormals();
64 }
65
66 static unsigned findFirstFreeSGPR(CCState &CCInfo) {
67 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
68 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
69 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
70 return AMDGPU::SGPR0 + Reg;
71 }
72 }
73 llvm_unreachable("Cannot allocate sgpr");
74 }
75
76 SITargetLowering::SITargetLowering(const TargetMachine &TM,
77 const GCNSubtarget &STI)
78 : AMDGPUTargetLowering(TM, STI),
79 Subtarget(&STI) {
80 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
81 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
82
83 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
84 addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
85
86 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
87
88 const SIRegisterInfo *TRI = STI.getRegisterInfo();
89 const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
90
91 addRegisterClass(MVT::f64, V64RegClass);
92 addRegisterClass(MVT::v2f32, V64RegClass);
93
94 addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
95 addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));
96
97 addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
98 addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
99
100 addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
101 addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));
102
103 addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
104 addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
105
106 addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
107 addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
108
109 addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
110 addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
111
112 addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
113 addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
114
115 addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
116 addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
117
118 addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
119 addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
120
121 addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
122 addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
123
124 addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
125 addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
126
127 addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
128 addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
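  // Roughly, each register class above is sized in bits: an SGPR_N or N-bit
  // VGPR class covers N / 32 consecutive 32-bit registers, so e.g. v16f64
  // lives in a 1024-bit (32-register) tuple.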
129
130 if (Subtarget->has16BitInsts()) {
131 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
132 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
133
134     // Unless there are also VOP3P operations, no operations are really legal.
135 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
136 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
137 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
138 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
139 addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
140 addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
141 addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
142 addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
143 }
144
145 addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
146 addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
147
148 computeRegisterProperties(Subtarget->getRegisterInfo());
149
150 // The boolean content concept here is too inflexible. Compares only ever
151 // really produce a 1-bit result. Any copy/extend from these will turn into a
152 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
153 // it's what most targets use.
154 setBooleanContents(ZeroOrOneBooleanContent);
155 setBooleanVectorContents(ZeroOrOneBooleanContent);
156
157 // We need to custom lower vector stores from local memory
158 setOperationAction(ISD::LOAD,
159 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
160 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v16i32, MVT::i1,
161 MVT::v32i32},
162 Custom);
163
164 setOperationAction(ISD::STORE,
165 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
166 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v16i32, MVT::i1,
167 MVT::v32i32},
168 Custom);
169
170 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
171 setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
172 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
173 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
174 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
175 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
176 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
177 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
178 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
179 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
180 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
181 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
182 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
183 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
184 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
185 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
186
187 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
188 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
189 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
190 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
191 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
192 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
193 setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
194
195 setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
196
197 setOperationAction(ISD::SELECT, MVT::i1, Promote);
198 setOperationAction(ISD::SELECT, MVT::i64, Custom);
199 setOperationAction(ISD::SELECT, MVT::f64, Promote);
200 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
201
202 setOperationAction(ISD::SELECT_CC,
203 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
204
205 setOperationAction(ISD::SETCC, MVT::i1, Promote);
206 setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
207 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
208
209 setOperationAction(ISD::TRUNCATE,
210 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
211 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v16i32},
212 Expand);
213 setOperationAction(ISD::FP_ROUND,
214 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
215 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32},
216 Expand);
217
218 setOperationAction(ISD::SIGN_EXTEND_INREG,
219 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
220 MVT::v3i16, MVT::v4i16, MVT::Other},
221 Custom);
222
223 setOperationAction(ISD::BRCOND, MVT::Other, Custom);
224 setOperationAction(ISD::BR_CC,
225 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
226
227 setOperationAction({ISD::UADDO, ISD::USUBO}, MVT::i32, Legal);
228
229 setOperationAction({ISD::ADDCARRY, ISD::SUBCARRY}, MVT::i32, Legal);
230
231 setOperationAction({ISD::SHL_PARTS, ISD::SRA_PARTS, ISD::SRL_PARTS}, MVT::i64,
232 Expand);
233
234 #if 0
235 setOperationAction({ISD::ADDCARRY, ISD::SUBCARRY}, MVT::i64, Legal);
236 #endif
237
238 // We only support LOAD/STORE and vector manipulation ops for vectors
239 // with > 4 elements.
240 for (MVT VT :
241 {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, MVT::v2i64,
242 MVT::v2f64, MVT::v4i16, MVT::v4f16, MVT::v3i64, MVT::v3f64,
243 MVT::v6i32, MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64,
244 MVT::v8f64, MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16,
245 MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32}) {
246 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
247 switch (Op) {
248 case ISD::LOAD:
249 case ISD::STORE:
250 case ISD::BUILD_VECTOR:
251 case ISD::BITCAST:
252 case ISD::UNDEF:
253 case ISD::EXTRACT_VECTOR_ELT:
254 case ISD::INSERT_VECTOR_ELT:
255 case ISD::EXTRACT_SUBVECTOR:
256 case ISD::SCALAR_TO_VECTOR:
257 break;
258 case ISD::INSERT_SUBVECTOR:
259 case ISD::CONCAT_VECTORS:
260 setOperationAction(Op, VT, Custom);
261 break;
262 default:
263 setOperationAction(Op, VT, Expand);
264 break;
265 }
266 }
267 }
268
269 setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Expand);
270
271 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
272 // is expanded to avoid having two separate loops in case the index is a VGPR.
273
274 // Most operations are naturally 32-bit vector operations. We only support
275 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
276 for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
277 setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
278 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
279
280 setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
281 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
282
283 setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
284 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
285
286 setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
287 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
288 }
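  // Promotion here means the node is rewritten in terms of the same-sized
  // 32-bit vector type (e.g. a v2i64 value is treated as v4i32, i.e. as its
  // four 32-bit halves); the loops below do the same for the wider 64-bit
  // element vectors.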
289
290 for (MVT Vec64 : { MVT::v3i64, MVT::v3f64 }) {
291 setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
292 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
293
294 setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
295 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
296
297 setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
298 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
299
300 setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
301 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
302 }
303
304 for (MVT Vec64 : { MVT::v4i64, MVT::v4f64 }) {
305 setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
306 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
307
308 setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
309 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
310
311 setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
312 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
313
314 setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
315 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
316 }
317
318 for (MVT Vec64 : { MVT::v8i64, MVT::v8f64 }) {
319 setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
320 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
321
322 setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
323 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
324
325 setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
326 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
327
328 setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
329 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
330 }
331
332 for (MVT Vec64 : { MVT::v16i64, MVT::v16f64 }) {
333 setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
334 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
335
336 setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
337 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
338
339 setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
340 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
341
342 setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
343 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
344 }
345
346 setOperationAction(ISD::VECTOR_SHUFFLE,
347 {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32},
348 Expand);
349
350 setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16}, Custom);
351
352 // Avoid stack access for these.
353 // TODO: Generalize to more vector types.
354 setOperationAction({ISD::EXTRACT_VECTOR_ELT, ISD::INSERT_VECTOR_ELT},
355 {MVT::v2i16, MVT::v2f16, MVT::v2i8, MVT::v4i8, MVT::v8i8,
356 MVT::v4i16, MVT::v4f16},
357 Custom);
358
359 // Deal with vec3 vector operations when widened to vec4.
360 setOperationAction(ISD::INSERT_SUBVECTOR,
361 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
362
363 // Deal with vec5/6/7 vector operations when widened to vec8.
364 setOperationAction(ISD::INSERT_SUBVECTOR,
365 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
366 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32},
367 Custom);
368
369 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
370 // and output demarshalling
371 setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
372
373 // We can't return success/failure, only the old value,
374 // let LLVM add the comparison
375 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, {MVT::i32, MVT::i64},
376 Expand);
377
378 if (Subtarget->hasFlatAddressSpace())
379 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
380
381 setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
382
383 // FIXME: This should be narrowed to i32, but that only happens if i64 is
384 // illegal.
385 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
386 setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
387
388   // This is s_memtime on SI and s_memrealtime on VI.
389 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
390 setOperationAction({ISD::TRAP, ISD::DEBUGTRAP}, MVT::Other, Custom);
391
392 if (Subtarget->has16BitInsts()) {
393 setOperationAction({ISD::FPOW, ISD::FPOWI}, MVT::f16, Promote);
394 setOperationAction({ISD::FLOG, ISD::FEXP, ISD::FLOG10}, MVT::f16, Custom);
395 }
396
397 if (Subtarget->hasMadMacF32Insts())
398 setOperationAction(ISD::FMAD, MVT::f32, Legal);
399
400 if (!Subtarget->hasBFI())
401 // fcopysign can be done in a single instruction with BFI.
402 setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand);
403
404 if (!Subtarget->hasBCNT(32))
405 setOperationAction(ISD::CTPOP, MVT::i32, Expand);
406
407 if (!Subtarget->hasBCNT(64))
408 setOperationAction(ISD::CTPOP, MVT::i64, Expand);
409
410 if (Subtarget->hasFFBH())
411 setOperationAction({ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, MVT::i32, Custom);
412
413 if (Subtarget->hasFFBL())
414 setOperationAction({ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF}, MVT::i32, Custom);
415
416 // We only really have 32-bit BFE instructions (and 16-bit on VI).
417 //
418 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
419 // effort to match them now. We want this to be false for i64 cases when the
420 // extraction isn't restricted to the upper or lower half. Ideally we would
421 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
422 // span the midpoint are probably relatively rare, so don't worry about them
423 // for now.
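  // For example, an extract like ((x >> 16) & 0xff) can map onto a single
  // 32-bit BFE with offset 16 and width 8, which is why advertising
  // extract-bits instructions here is profitable.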
424 if (Subtarget->hasBFE())
425 setHasExtractBitsInsn(true);
426
427 // Clamp modifier on add/sub
428 if (Subtarget->hasIntClamp())
429 setOperationAction({ISD::UADDSAT, ISD::USUBSAT}, MVT::i32, Legal);
430
431 if (Subtarget->hasAddNoCarry())
432 setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
433 Legal);
434
435 setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, {MVT::f32, MVT::f64},
436 Custom);
437
438 // These are really only legal for ieee_mode functions. We should be avoiding
439 // them for functions that don't have ieee_mode enabled, so just say they are
440 // legal.
441 setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE},
442 {MVT::f32, MVT::f64}, Legal);
443
444 if (Subtarget->haveRoundOpsF64())
445 setOperationAction({ISD::FTRUNC, ISD::FCEIL, ISD::FRINT}, MVT::f64, Legal);
446 else
447 setOperationAction({ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FFLOOR},
448 MVT::f64, Custom);
449
450 setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
451
452 setOperationAction({ISD::FSIN, ISD::FCOS, ISD::FDIV}, MVT::f32, Custom);
453 setOperationAction(ISD::FDIV, MVT::f64, Custom);
454
455 if (Subtarget->has16BitInsts()) {
456 setOperationAction({ISD::Constant, ISD::SMIN, ISD::SMAX, ISD::UMIN,
457 ISD::UMAX, ISD::UADDSAT, ISD::USUBSAT},
458 MVT::i16, Legal);
459
460 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
461
462 setOperationAction({ISD::ROTR, ISD::ROTL, ISD::SELECT_CC, ISD::BR_CC},
463 MVT::i16, Expand);
464
465 setOperationAction({ISD::SIGN_EXTEND, ISD::SDIV, ISD::UDIV, ISD::SREM,
466 ISD::UREM, ISD::BITREVERSE, ISD::CTTZ,
467 ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF,
468 ISD::CTPOP},
469 MVT::i16, Promote);
470
471 setOperationAction(ISD::LOAD, MVT::i16, Custom);
472
473 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
474
475 setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
476 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
477 setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
478 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
479
480 setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::i16, Custom);
481
482 // F16 - Constant Actions.
483 setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
484
485 // F16 - Load/Store Actions.
486 setOperationAction(ISD::LOAD, MVT::f16, Promote);
487 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
488 setOperationAction(ISD::STORE, MVT::f16, Promote);
489 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
490
491 // F16 - VOP1 Actions.
492 setOperationAction(
493 {ISD::FP_ROUND, ISD::FCOS, ISD::FSIN, ISD::FROUND, ISD::FPTRUNC_ROUND},
494 MVT::f16, Custom);
495
496 setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i16, Custom);
497
498 setOperationAction(
499 {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::SINT_TO_FP, ISD::UINT_TO_FP},
500 MVT::f16, Promote);
501
502 // F16 - VOP2 Actions.
503 setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, MVT::f16, Expand);
504
505 setOperationAction(ISD::FDIV, MVT::f16, Custom);
506
507 // F16 - VOP3 Actions.
508 setOperationAction(ISD::FMA, MVT::f16, Legal);
509 if (STI.hasMadF16())
510 setOperationAction(ISD::FMAD, MVT::f16, Legal);
511
512 for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16, MVT::v8i16,
513 MVT::v8f16, MVT::v16i16, MVT::v16f16}) {
514 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
515 switch (Op) {
516 case ISD::LOAD:
517 case ISD::STORE:
518 case ISD::BUILD_VECTOR:
519 case ISD::BITCAST:
520 case ISD::UNDEF:
521 case ISD::EXTRACT_VECTOR_ELT:
522 case ISD::INSERT_VECTOR_ELT:
523 case ISD::INSERT_SUBVECTOR:
524 case ISD::EXTRACT_SUBVECTOR:
525 case ISD::SCALAR_TO_VECTOR:
526 break;
527 case ISD::CONCAT_VECTORS:
528 setOperationAction(Op, VT, Custom);
529 break;
530 default:
531 setOperationAction(Op, VT, Expand);
532 break;
533 }
534 }
535 }
536
537 // v_perm_b32 can handle either of these.
538 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
539 setOperationAction(ISD::BSWAP, MVT::v4i16, Custom);
540
541 // XXX - Do these do anything? Vector constants turn into build_vector.
542 setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
543
544 setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16}, Legal);
545
546 setOperationAction(ISD::STORE, MVT::v2i16, Promote);
547 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
548 setOperationAction(ISD::STORE, MVT::v2f16, Promote);
549 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
550
551 setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
552 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
553 setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
554 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
555
556 setOperationAction(ISD::AND, MVT::v2i16, Promote);
557 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
558 setOperationAction(ISD::OR, MVT::v2i16, Promote);
559 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
560 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
561 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
562
563 setOperationAction(ISD::LOAD, MVT::v4i16, Promote);
564 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
565 setOperationAction(ISD::LOAD, MVT::v4f16, Promote);
566 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
567
568 setOperationAction(ISD::STORE, MVT::v4i16, Promote);
569 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
570 setOperationAction(ISD::STORE, MVT::v4f16, Promote);
571 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
572
573 setOperationAction(ISD::LOAD, MVT::v8i16, Promote);
574 AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
575 setOperationAction(ISD::LOAD, MVT::v8f16, Promote);
576 AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
577
578 setOperationAction(ISD::STORE, MVT::v4i16, Promote);
579 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
580 setOperationAction(ISD::STORE, MVT::v4f16, Promote);
581 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
582
583 setOperationAction(ISD::STORE, MVT::v8i16, Promote);
584 AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
585 setOperationAction(ISD::STORE, MVT::v8f16, Promote);
586 AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
587
588 setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
589 AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
590 setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
591 AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
592
593 setOperationAction(ISD::STORE, MVT::v16i16, Promote);
594 AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
595 setOperationAction(ISD::STORE, MVT::v16f16, Promote);
596 AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
597
598 setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
599 MVT::v2i32, Expand);
600 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
601
602 setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
603 MVT::v4i32, Expand);
604
605 setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
606 MVT::v8i32, Expand);
607
608 if (!Subtarget->hasVOP3PInsts())
609 setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16}, Custom);
610
611 setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
612 // This isn't really legal, but this avoids the legalizer unrolling it (and
613 // allows matching fneg (fabs x) patterns)
614 setOperationAction(ISD::FABS, MVT::v2f16, Legal);
615
616 setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, MVT::f16, Custom);
617 setOperationAction({ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, MVT::f16, Legal);
618
619 setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE},
620 {MVT::v4f16, MVT::v8f16, MVT::v16f16}, Custom);
621
622 setOperationAction({ISD::FMINNUM, ISD::FMAXNUM},
623 {MVT::v4f16, MVT::v8f16, MVT::v16f16}, Expand);
624
625 for (MVT Vec16 : {MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16}) {
626 setOperationAction(
627 {ISD::BUILD_VECTOR, ISD::EXTRACT_VECTOR_ELT, ISD::SCALAR_TO_VECTOR},
628 Vec16, Custom);
629 setOperationAction(ISD::INSERT_VECTOR_ELT, Vec16, Expand);
630 }
631 }
632
633 if (Subtarget->hasVOP3PInsts()) {
634 setOperationAction({ISD::ADD, ISD::SUB, ISD::MUL, ISD::SHL, ISD::SRL,
635 ISD::SRA, ISD::SMIN, ISD::UMIN, ISD::SMAX, ISD::UMAX,
636 ISD::UADDSAT, ISD::USUBSAT, ISD::SADDSAT, ISD::SSUBSAT},
637 MVT::v2i16, Legal);
638
639 setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FMINNUM_IEEE,
640 ISD::FMAXNUM_IEEE, ISD::FCANONICALIZE},
641 MVT::v2f16, Legal);
642
643 setOperationAction(ISD::EXTRACT_VECTOR_ELT, {MVT::v2i16, MVT::v2f16},
644 Custom);
645
646 setOperationAction(ISD::VECTOR_SHUFFLE,
647 {MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16,
648 MVT::v16f16, MVT::v16i16},
649 Custom);
650
651 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16})
652 // Split vector operations.
653 setOperationAction({ISD::SHL, ISD::SRA, ISD::SRL, ISD::ADD, ISD::SUB,
654 ISD::MUL, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX,
655 ISD::UADDSAT, ISD::SADDSAT, ISD::USUBSAT,
656 ISD::SSUBSAT},
657 VT, Custom);
658
659 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16})
660 // Split vector operations.
661 setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FCANONICALIZE},
662 VT, Custom);
663
664 setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, {MVT::v2f16, MVT::v4f16},
665 Custom);
666
667 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
668 setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16}, Custom);
669
670 if (Subtarget->hasPackedFP32Ops()) {
671 setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FNEG},
672 MVT::v2f32, Legal);
673 setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA},
674 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
675 Custom);
676 }
677 }
678
679 setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v4f16, Custom);
680
681 if (Subtarget->has16BitInsts()) {
682 setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
683 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
684 setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
685 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
686 } else {
687 // Legalization hack.
688 setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
689
690 setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v2f16, Custom);
691 }
692
693 setOperationAction(ISD::SELECT,
694 {MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8,
695 MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16},
696 Custom);
697
698 setOperationAction({ISD::SMULO, ISD::UMULO}, MVT::i64, Custom);
699
700 if (Subtarget->hasMad64_32())
701 setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, MVT::i32, Custom);
702
703 setOperationAction(ISD::INTRINSIC_WO_CHAIN,
704 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
705 MVT::v2i16, MVT::v2f16},
706 Custom);
707
708 setOperationAction(ISD::INTRINSIC_W_CHAIN,
709 {MVT::v2f16, MVT::v2i16, MVT::v3f16, MVT::v3i16,
710 MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::Other, MVT::f16,
711 MVT::i16, MVT::i8},
712 Custom);
713
714 setOperationAction(ISD::INTRINSIC_VOID,
715 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v3i16,
716 MVT::v3f16, MVT::v4f16, MVT::v4i16, MVT::f16, MVT::i16,
717 MVT::i8},
718 Custom);
719
720 setTargetDAGCombine({ISD::ADD,
721 ISD::ADDCARRY,
722 ISD::SUB,
723 ISD::SUBCARRY,
724 ISD::FADD,
725 ISD::FSUB,
726 ISD::FMINNUM,
727 ISD::FMAXNUM,
728 ISD::FMINNUM_IEEE,
729 ISD::FMAXNUM_IEEE,
730 ISD::FMA,
731 ISD::SMIN,
732 ISD::SMAX,
733 ISD::UMIN,
734 ISD::UMAX,
735 ISD::SETCC,
736 ISD::AND,
737 ISD::OR,
738 ISD::XOR,
739 ISD::SINT_TO_FP,
740 ISD::UINT_TO_FP,
741 ISD::FCANONICALIZE,
742 ISD::SCALAR_TO_VECTOR,
743 ISD::ZERO_EXTEND,
744 ISD::SIGN_EXTEND_INREG,
745 ISD::EXTRACT_VECTOR_ELT,
746 ISD::INSERT_VECTOR_ELT});
747
748 // All memory operations. Some folding on the pointer operand is done to help
749 // matching the constant offsets in the addressing modes.
750 setTargetDAGCombine({ISD::LOAD,
751 ISD::STORE,
752 ISD::ATOMIC_LOAD,
753 ISD::ATOMIC_STORE,
754 ISD::ATOMIC_CMP_SWAP,
755 ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS,
756 ISD::ATOMIC_SWAP,
757 ISD::ATOMIC_LOAD_ADD,
758 ISD::ATOMIC_LOAD_SUB,
759 ISD::ATOMIC_LOAD_AND,
760 ISD::ATOMIC_LOAD_OR,
761 ISD::ATOMIC_LOAD_XOR,
762 ISD::ATOMIC_LOAD_NAND,
763 ISD::ATOMIC_LOAD_MIN,
764 ISD::ATOMIC_LOAD_MAX,
765 ISD::ATOMIC_LOAD_UMIN,
766 ISD::ATOMIC_LOAD_UMAX,
767 ISD::ATOMIC_LOAD_FADD,
768 ISD::INTRINSIC_VOID,
769 ISD::INTRINSIC_W_CHAIN});
770
771 // FIXME: In other contexts we pretend this is a per-function property.
772 setStackPointerRegisterToSaveRestore(AMDGPU::SGPR32);
773
774 setSchedulingPreference(Sched::RegPressure);
775 }
776
777 const GCNSubtarget *SITargetLowering::getSubtarget() const {
778 return Subtarget;
779 }
780
781 //===----------------------------------------------------------------------===//
782 // TargetLowering queries
783 //===----------------------------------------------------------------------===//
784
785 // v_mad_mix* support a conversion from f16 to f32.
786 //
787 // There is only one special case, when denormals are enabled, where this
788 // would still be OK to use, but we don't currently handle it.
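// For example, (fma (fpext f16:a), (fpext f16:b), f32:c) can be matched to a
// single mixed-precision mad/fma when the subtarget has the mix instructions
// and FP32 denormals are flushed.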
789 bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
790 EVT DestVT, EVT SrcVT) const {
791 return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
792 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
793 DestVT.getScalarType() == MVT::f32 &&
794 SrcVT.getScalarType() == MVT::f16 &&
795 // TODO: This probably only requires no input flushing?
796 !hasFP32Denormals(DAG.getMachineFunction());
797 }
798
799 bool SITargetLowering::isFPExtFoldable(const MachineInstr &MI, unsigned Opcode,
800 LLT DestTy, LLT SrcTy) const {
801 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
802 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
803 DestTy.getScalarSizeInBits() == 32 &&
804 SrcTy.getScalarSizeInBits() == 16 &&
805 // TODO: This probably only requires no input flushing?
806 !hasFP32Denormals(*MI.getMF());
807 }
808
809 bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
810 // SI has some legal vector types, but no legal vector operations. Say no
811 // shuffles are legal in order to prefer scalarizing some vector operations.
812 return false;
813 }
814
815 MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
816 CallingConv::ID CC,
817 EVT VT) const {
818 if (CC == CallingConv::AMDGPU_KERNEL)
819 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
820
821 if (VT.isVector()) {
822 EVT ScalarVT = VT.getScalarType();
823 unsigned Size = ScalarVT.getSizeInBits();
824 if (Size == 16) {
825 if (Subtarget->has16BitInsts())
826 return VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
827 return VT.isInteger() ? MVT::i32 : MVT::f32;
828 }
829
830 if (Size < 16)
831 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
832 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
833 }
834
835 if (VT.getSizeInBits() > 32)
836 return MVT::i32;
837
838 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
839 }
840
841 unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
842 CallingConv::ID CC,
843 EVT VT) const {
844 if (CC == CallingConv::AMDGPU_KERNEL)
845 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
846
847 if (VT.isVector()) {
848 unsigned NumElts = VT.getVectorNumElements();
849 EVT ScalarVT = VT.getScalarType();
850 unsigned Size = ScalarVT.getSizeInBits();
851
852 // FIXME: Should probably promote 8-bit vectors to i16.
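    // For example, a v3f16 argument occupies (3 + 1) / 2 == 2 packed registers
    // here.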
853 if (Size == 16 && Subtarget->has16BitInsts())
854 return (NumElts + 1) / 2;
855
856 if (Size <= 32)
857 return NumElts;
858
859 if (Size > 32)
860 return NumElts * ((Size + 31) / 32);
861 } else if (VT.getSizeInBits() > 32)
862 return (VT.getSizeInBits() + 31) / 32;
863
864 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
865 }
866
867 unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
868 LLVMContext &Context, CallingConv::ID CC,
869 EVT VT, EVT &IntermediateVT,
870 unsigned &NumIntermediates, MVT &RegisterVT) const {
871 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
872 unsigned NumElts = VT.getVectorNumElements();
873 EVT ScalarVT = VT.getScalarType();
874 unsigned Size = ScalarVT.getSizeInBits();
875 // FIXME: We should fix the ABI to be the same on targets without 16-bit
876     // support, but unless we can properly handle 3-vectors, it will still be
877 // inconsistent.
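    // For example, with 16-bit support a v5f16 breaks down into
    // (5 + 1) / 2 == 3 v2f16 intermediates.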
878 if (Size == 16 && Subtarget->has16BitInsts()) {
879 RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
880 IntermediateVT = RegisterVT;
881 NumIntermediates = (NumElts + 1) / 2;
882 return NumIntermediates;
883 }
884
885 if (Size == 32) {
886 RegisterVT = ScalarVT.getSimpleVT();
887 IntermediateVT = RegisterVT;
888 NumIntermediates = NumElts;
889 return NumIntermediates;
890 }
891
892 if (Size < 16 && Subtarget->has16BitInsts()) {
893 // FIXME: Should probably form v2i16 pieces
894 RegisterVT = MVT::i16;
895 IntermediateVT = ScalarVT;
896 NumIntermediates = NumElts;
897 return NumIntermediates;
898 }
899
900
901 if (Size != 16 && Size <= 32) {
902 RegisterVT = MVT::i32;
903 IntermediateVT = ScalarVT;
904 NumIntermediates = NumElts;
905 return NumIntermediates;
906 }
907
908 if (Size > 32) {
909 RegisterVT = MVT::i32;
910 IntermediateVT = RegisterVT;
911 NumIntermediates = NumElts * ((Size + 31) / 32);
912 return NumIntermediates;
913 }
914 }
915
916 return TargetLowering::getVectorTypeBreakdownForCallingConv(
917 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
918 }
919
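// Compute the memory VT actually accessed by an image intrinsic from its IR
// data type and the number of active dmask lanes. For example, an intrinsic
// declared to return <4 x float> but executed with only two dmask bits set
// only touches v2f32 worth of memory.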
920 static EVT memVTFromImageData(Type *Ty, unsigned DMaskLanes) {
921 assert(DMaskLanes != 0);
922
923 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
924 unsigned NumElts = std::min(DMaskLanes, VT->getNumElements());
925 return EVT::getVectorVT(Ty->getContext(),
926 EVT::getEVT(VT->getElementType()),
927 NumElts);
928 }
929
930 return EVT::getEVT(Ty);
931 }
932
933 // Peek through TFE struct returns to only use the data size.
934 static EVT memVTFromImageReturn(Type *Ty, unsigned DMaskLanes) {
935 auto *ST = dyn_cast<StructType>(Ty);
936 if (!ST)
937 return memVTFromImageData(Ty, DMaskLanes);
938
939 // Some intrinsics return an aggregate type - special case to work out the
940 // correct memVT.
941 //
942 // Only limited forms of aggregate type currently expected.
943 if (ST->getNumContainedTypes() != 2 ||
944 !ST->getContainedType(1)->isIntegerTy(32))
945 return EVT();
946 return memVTFromImageData(ST->getContainedType(0), DMaskLanes);
947 }
948
949 bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
950 const CallInst &CI,
951 MachineFunction &MF,
952 unsigned IntrID) const {
953 Info.flags = MachineMemOperand::MONone;
954 if (CI.hasMetadata(LLVMContext::MD_invariant_load))
955 Info.flags |= MachineMemOperand::MOInvariant;
956
957 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
958 AMDGPU::lookupRsrcIntrinsic(IntrID)) {
959 AttributeList Attr = Intrinsic::getAttributes(CI.getContext(),
960 (Intrinsic::ID)IntrID);
961 if (Attr.hasFnAttr(Attribute::ReadNone))
962 return false;
963
964 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
965
966 const GCNTargetMachine &TM =
967 static_cast<const GCNTargetMachine &>(getTargetMachine());
968
969 if (RsrcIntr->IsImage) {
970 Info.ptrVal = MFI->getImagePSV(TM);
971 Info.align.reset();
972 } else {
973 Info.ptrVal = MFI->getBufferPSV(TM);
974 }
975
976 Info.flags |= MachineMemOperand::MODereferenceable;
977 if (Attr.hasFnAttr(Attribute::ReadOnly)) {
978 unsigned DMaskLanes = 4;
979
980 if (RsrcIntr->IsImage) {
981 const AMDGPU::ImageDimIntrinsicInfo *Intr
982 = AMDGPU::getImageDimIntrinsicInfo(IntrID);
983 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
984 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
985
986 if (!BaseOpcode->Gather4) {
987 // If this isn't a gather, we may have excess loaded elements in the
988 // IR type. Check the dmask for the real number of elements loaded.
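          // e.g. a dmask of 0b0101 means only two lanes are loaded.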
989 unsigned DMask
990 = cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
991 DMaskLanes = DMask == 0 ? 1 : countPopulation(DMask);
992 }
993
994 Info.memVT = memVTFromImageReturn(CI.getType(), DMaskLanes);
995 } else
996 Info.memVT = EVT::getEVT(CI.getType());
997
998 // FIXME: What does alignment mean for an image?
999 Info.opc = ISD::INTRINSIC_W_CHAIN;
1000 Info.flags |= MachineMemOperand::MOLoad;
1001 } else if (Attr.hasFnAttr(Attribute::WriteOnly)) {
1002 Info.opc = ISD::INTRINSIC_VOID;
1003
1004 Type *DataTy = CI.getArgOperand(0)->getType();
1005 if (RsrcIntr->IsImage) {
1006 unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1007 unsigned DMaskLanes = DMask == 0 ? 1 : countPopulation(DMask);
1008 Info.memVT = memVTFromImageData(DataTy, DMaskLanes);
1009 } else
1010 Info.memVT = EVT::getEVT(DataTy);
1011
1012 Info.flags |= MachineMemOperand::MOStore;
1013 } else {
1014 // Atomic
1015 Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID :
1016 ISD::INTRINSIC_W_CHAIN;
1017 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1018 Info.flags |= MachineMemOperand::MOLoad |
1019 MachineMemOperand::MOStore |
1020 MachineMemOperand::MODereferenceable;
1021
1022 // XXX - Should this be volatile without known ordering?
1023 Info.flags |= MachineMemOperand::MOVolatile;
1024
1025 switch (IntrID) {
1026 default:
1027 break;
1028 case Intrinsic::amdgcn_raw_buffer_load_lds:
1029 case Intrinsic::amdgcn_struct_buffer_load_lds: {
1030 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1031 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1032 return true;
1033 }
1034 }
1035 }
1036 return true;
1037 }
1038
1039 switch (IntrID) {
1040 case Intrinsic::amdgcn_atomic_inc:
1041 case Intrinsic::amdgcn_atomic_dec:
1042 case Intrinsic::amdgcn_ds_ordered_add:
1043 case Intrinsic::amdgcn_ds_ordered_swap:
1044 case Intrinsic::amdgcn_ds_fadd:
1045 case Intrinsic::amdgcn_ds_fmin:
1046 case Intrinsic::amdgcn_ds_fmax: {
1047 Info.opc = ISD::INTRINSIC_W_CHAIN;
1048 Info.memVT = MVT::getVT(CI.getType());
1049 Info.ptrVal = CI.getOperand(0);
1050 Info.align.reset();
1051 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1052
1053 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1054 if (!Vol->isZero())
1055 Info.flags |= MachineMemOperand::MOVolatile;
1056
1057 return true;
1058 }
1059 case Intrinsic::amdgcn_buffer_atomic_fadd: {
1060 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1061
1062 const GCNTargetMachine &TM =
1063 static_cast<const GCNTargetMachine &>(getTargetMachine());
1064
1065 Info.opc = ISD::INTRINSIC_W_CHAIN;
1066 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1067 Info.ptrVal = MFI->getBufferPSV(TM);
1068 Info.align.reset();
1069 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1070
1071 const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4));
1072 if (!Vol || !Vol->isZero())
1073 Info.flags |= MachineMemOperand::MOVolatile;
1074
1075 return true;
1076 }
1077 case Intrinsic::amdgcn_ds_append:
1078 case Intrinsic::amdgcn_ds_consume: {
1079 Info.opc = ISD::INTRINSIC_W_CHAIN;
1080 Info.memVT = MVT::getVT(CI.getType());
1081 Info.ptrVal = CI.getOperand(0);
1082 Info.align.reset();
1083 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
1084
1085 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1086 if (!Vol->isZero())
1087 Info.flags |= MachineMemOperand::MOVolatile;
1088
1089 return true;
1090 }
1091 case Intrinsic::amdgcn_global_atomic_csub: {
1092 Info.opc = ISD::INTRINSIC_W_CHAIN;
1093 Info.memVT = MVT::getVT(CI.getType());
1094 Info.ptrVal = CI.getOperand(0);
1095 Info.align.reset();
1096 Info.flags |= MachineMemOperand::MOLoad |
1097 MachineMemOperand::MOStore |
1098 MachineMemOperand::MOVolatile;
1099 return true;
1100 }
1101 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
1102 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1103 Info.opc = ISD::INTRINSIC_W_CHAIN;
1104 Info.memVT = MVT::getVT(CI.getType()); // XXX: what is correct VT?
1105
1106 const GCNTargetMachine &TM =
1107 static_cast<const GCNTargetMachine &>(getTargetMachine());
1108
1109 Info.ptrVal = MFI->getImagePSV(TM);
1110 Info.align.reset();
1111 Info.flags |= MachineMemOperand::MOLoad |
1112 MachineMemOperand::MODereferenceable;
1113 return true;
1114 }
1115 case Intrinsic::amdgcn_global_atomic_fadd:
1116 case Intrinsic::amdgcn_global_atomic_fmin:
1117 case Intrinsic::amdgcn_global_atomic_fmax:
1118 case Intrinsic::amdgcn_flat_atomic_fadd:
1119 case Intrinsic::amdgcn_flat_atomic_fmin:
1120 case Intrinsic::amdgcn_flat_atomic_fmax:
1121 case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
1122 case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: {
1123 Info.opc = ISD::INTRINSIC_W_CHAIN;
1124 Info.memVT = MVT::getVT(CI.getType());
1125 Info.ptrVal = CI.getOperand(0);
1126 Info.align.reset();
1127 Info.flags |= MachineMemOperand::MOLoad |
1128 MachineMemOperand::MOStore |
1129 MachineMemOperand::MODereferenceable |
1130 MachineMemOperand::MOVolatile;
1131 return true;
1132 }
1133 case Intrinsic::amdgcn_ds_gws_init:
1134 case Intrinsic::amdgcn_ds_gws_barrier:
1135 case Intrinsic::amdgcn_ds_gws_sema_v:
1136 case Intrinsic::amdgcn_ds_gws_sema_br:
1137 case Intrinsic::amdgcn_ds_gws_sema_p:
1138 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1139 Info.opc = ISD::INTRINSIC_VOID;
1140
1141 const GCNTargetMachine &TM =
1142 static_cast<const GCNTargetMachine &>(getTargetMachine());
1143
1144 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1145 Info.ptrVal = MFI->getGWSPSV(TM);
1146
1147 // This is an abstract access, but we need to specify a type and size.
1148 Info.memVT = MVT::i32;
1149 Info.size = 4;
1150 Info.align = Align(4);
1151
1152 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1153 Info.flags |= MachineMemOperand::MOLoad;
1154 else
1155 Info.flags |= MachineMemOperand::MOStore;
1156 return true;
1157 }
1158 case Intrinsic::amdgcn_global_load_lds: {
1159 Info.opc = ISD::INTRINSIC_VOID;
1160 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1161 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1162 Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
1163 MachineMemOperand::MOVolatile;
1164 return true;
1165 }
1166 default:
1167 return false;
1168 }
1169 }
1170
1171 bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
1172 SmallVectorImpl<Value*> &Ops,
1173 Type *&AccessTy) const {
1174 switch (II->getIntrinsicID()) {
1175 case Intrinsic::amdgcn_atomic_inc:
1176 case Intrinsic::amdgcn_atomic_dec:
1177 case Intrinsic::amdgcn_ds_ordered_add:
1178 case Intrinsic::amdgcn_ds_ordered_swap:
1179 case Intrinsic::amdgcn_ds_append:
1180 case Intrinsic::amdgcn_ds_consume:
1181 case Intrinsic::amdgcn_ds_fadd:
1182 case Intrinsic::amdgcn_ds_fmin:
1183 case Intrinsic::amdgcn_ds_fmax:
1184 case Intrinsic::amdgcn_global_atomic_fadd:
1185 case Intrinsic::amdgcn_flat_atomic_fadd:
1186 case Intrinsic::amdgcn_flat_atomic_fmin:
1187 case Intrinsic::amdgcn_flat_atomic_fmax:
1188 case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
1189 case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
1190 case Intrinsic::amdgcn_global_atomic_csub: {
1191 Value *Ptr = II->getArgOperand(0);
1192 AccessTy = II->getType();
1193 Ops.push_back(Ptr);
1194 return true;
1195 }
1196 default:
1197 return false;
1198 }
1199 }
1200
1201 bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
1202 if (!Subtarget->hasFlatInstOffsets()) {
1203 // Flat instructions do not have offsets, and only have the register
1204 // address.
1205 return AM.BaseOffs == 0 && AM.Scale == 0;
1206 }
1207
1208 return AM.Scale == 0 &&
1209 (AM.BaseOffs == 0 ||
1210 Subtarget->getInstrInfo()->isLegalFLATOffset(
1211 AM.BaseOffs, AMDGPUAS::FLAT_ADDRESS, SIInstrFlags::FLAT));
1212 }
1213
1214 bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
1215 if (Subtarget->hasFlatGlobalInsts())
1216 return AM.Scale == 0 &&
1217 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1218 AM.BaseOffs, AMDGPUAS::GLOBAL_ADDRESS,
1219 SIInstrFlags::FlatGlobal));
1220
1221 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1222     // Assume we will use FLAT for all global memory accesses
1223 // on VI.
1224 // FIXME: This assumption is currently wrong. On VI we still use
1225 // MUBUF instructions for the r + i addressing mode. As currently
1226 // implemented, the MUBUF instructions only work on buffer < 4GB.
1227 // It may be possible to support > 4GB buffers with MUBUF instructions,
1228 // by setting the stride value in the resource descriptor which would
1229 // increase the size limit to (stride * 4GB). However, this is risky,
1230 // because it has never been validated.
1231 return isLegalFlatAddressingMode(AM);
1232 }
1233
1234 return isLegalMUBUFAddressingMode(AM);
1235 }
1236
1237 bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1238 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1239 // additionally can do r + r + i with addr64. 32-bit has more addressing
1240 // mode options. Depending on the resource constant, it can also do
1241 // (i64 r0) + (i32 r1) * (i14 i).
1242 //
1243 // Private arrays end up using a scratch buffer most of the time, so also
1244   // assume those use MUBUF instructions. Scratch loads / stores are currently
1245   // implemented as MUBUF instructions with the offen bit set, so they are
1246   // slightly different from the normal addr64 form.
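  // Illustratively, an addressing mode of {BaseReg, BaseOffs = 16, Scale = 0}
  // corresponds to a MUBUF access with a 16-byte immediate offset, while
  // Scale == 1 roughly corresponds to using the extra register via
  // addr64 / offen.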
1247 if (!SIInstrInfo::isLegalMUBUFImmOffset(AM.BaseOffs))
1248 return false;
1249
1250 // FIXME: Since we can split immediate into soffset and immediate offset,
1251 // would it make sense to allow any immediate?
1252
1253 switch (AM.Scale) {
1254 case 0: // r + i or just i, depending on HasBaseReg.
1255 return true;
1256 case 1:
1257 return true; // We have r + r or r + i.
1258 case 2:
1259 if (AM.HasBaseReg) {
1260 // Reject 2 * r + r.
1261 return false;
1262 }
1263
1264 // Allow 2 * r as r + r
1265 // Or 2 * r + i is allowed as r + r + i.
1266 return true;
1267 default: // Don't allow n * r
1268 return false;
1269 }
1270 }
1271
1272 bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
1273 const AddrMode &AM, Type *Ty,
1274 unsigned AS, Instruction *I) const {
1275 // No global is ever allowed as a base.
1276 if (AM.BaseGV)
1277 return false;
1278
1279 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1280 return isLegalGlobalAddressingMode(AM);
1281
1282 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1283 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
1284 AS == AMDGPUAS::BUFFER_FAT_POINTER) {
1285 // If the offset isn't a multiple of 4, it probably isn't going to be
1286 // correctly aligned.
1287 // FIXME: Can we get the real alignment here?
1288 if (AM.BaseOffs % 4 != 0)
1289 return isLegalMUBUFAddressingMode(AM);
1290
1291 // There are no SMRD extloads, so if we have to do a small type access we
1292 // will use a MUBUF load.
1293 // FIXME?: We also need to do this if unaligned, but we don't know the
1294 // alignment here.
1295 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1296 return isLegalGlobalAddressingMode(AM);
1297
1298 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
1299 // SMRD instructions have an 8-bit, dword offset on SI.
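      // (That is, dword offsets up to 255, i.e. byte offsets up to 1020.)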
1300 if (!isUInt<8>(AM.BaseOffs / 4))
1301 return false;
1302 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1303 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1304 // in 8-bits, it can use a smaller encoding.
1305 if (!isUInt<32>(AM.BaseOffs / 4))
1306 return false;
1307 } else if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
1308 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1309 if (!isUInt<20>(AM.BaseOffs))
1310 return false;
1311 } else
1312 llvm_unreachable("unhandled generation");
1313
1314 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1315 return true;
1316
1317 if (AM.Scale == 1 && AM.HasBaseReg)
1318 return true;
1319
1320 return false;
1321
1322 } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1323 return isLegalMUBUFAddressingMode(AM);
1324 } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1325 AS == AMDGPUAS::REGION_ADDRESS) {
1326 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1327 // field.
1328 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1329 // an 8-bit dword offset but we don't know the alignment here.
1330 if (!isUInt<16>(AM.BaseOffs))
1331 return false;
1332
1333 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1334 return true;
1335
1336 if (AM.Scale == 1 && AM.HasBaseReg)
1337 return true;
1338
1339 return false;
1340 } else if (AS == AMDGPUAS::FLAT_ADDRESS ||
1341 AS == AMDGPUAS::UNKNOWN_ADDRESS_SPACE) {
1342 // For an unknown address space, this usually means that this is for some
1343 // reason being used for pure arithmetic, and not based on some addressing
1344 // computation. We don't have instructions that compute pointers with any
1345 // addressing modes, so treat them as having no offset like flat
1346 // instructions.
1347 return isLegalFlatAddressingMode(AM);
1348 }
1349
1350 // Assume a user alias of global for unknown address spaces.
1351 return isLegalGlobalAddressingMode(AM);
1352 }
1353
1354 bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
1355 const MachineFunction &MF) const {
1356 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
1357 return (MemVT.getSizeInBits() <= 4 * 32);
1358 } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1359 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1360 return (MemVT.getSizeInBits() <= MaxPrivateBits);
1361 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1362 return (MemVT.getSizeInBits() <= 2 * 32);
1363 }
1364 return true;
1365 }
1366
1367 bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
1368 unsigned Size, unsigned AddrSpace, Align Alignment,
1369 MachineMemOperand::Flags Flags, bool *IsFast) const {
1370 if (IsFast)
1371 *IsFast = false;
1372
1373 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1374 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1375 // Check if alignment requirements for ds_read/write instructions are
1376 // disabled.
1377 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1378 return false;
1379
1380 Align RequiredAlignment(PowerOf2Ceil(Size/8)); // Natural alignment.
1381 if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1382 Alignment < RequiredAlignment)
1383 return false;
1384
1385     // Either the alignment requirements are "enabled", or there is an
1386     // unaligned-LDS-access-related hardware bug even though the alignment
1387     // requirements are "disabled". In either case, we need to check for proper
1388     // alignment requirements.
1389 //
1390 switch (Size) {
1391 case 64:
1392 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
1393 // address is negative, then the instruction is incorrectly treated as
1394 // out-of-bounds even if base + offsets is in bounds. Split vectorized
1395 // loads here to avoid emitting ds_read2_b32. We may re-combine the
1396 // load later in the SILoadStoreOptimizer.
1397 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
1398 return false;
1399
1400       // 8-byte accesses via ds_read/write_b64 require 8-byte alignment, but we
1401       // can do a 4-byte aligned, 8-byte access in a single operation using
1402       // ds_read2/write2_b32 with adjacent offsets.
1403 RequiredAlignment = Align(4);
1404
1405 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1406 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
1407 // ds_write2_b32 depending on the alignment. In either case with either
1408 // alignment there is no faster way of doing this.
1409 if (IsFast)
1410 *IsFast = true;
1411 return true;
1412 }
1413
1414 break;
1415 case 96:
1416 if (!Subtarget->hasDS96AndDS128())
1417 return false;
1418
1419       // 12-byte accesses via ds_read/write_b96 require 16-byte alignment on
1420       // gfx8 and older.
1421
1422 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1423 // Naturally aligned access is fastest. However, also report it is Fast
1424         // if memory is aligned less than DWORD. A narrow load or store will be
1425         // equally slow as a single ds_read_b96/ds_write_b96, but there will
1426 // be more of them, so overall we will pay less penalty issuing a single
1427 // instruction.
1428 if (IsFast)
1429 *IsFast = Alignment >= RequiredAlignment || Alignment < Align(4);
1430 return true;
1431 }
1432
1433 break;
1434 case 128:
1435 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
1436 return false;
1437
1438       // 16-byte accesses via ds_read/write_b128 require 16-byte alignment on
1439       // gfx8 and older, but we can do an 8-byte aligned, 16-byte access in a
1440       // single operation using ds_read2/write2_b64.
1441 RequiredAlignment = Align(8);
1442
1443 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1444 // Naturally aligned access is fastest. However, also report it is Fast
1445         // if memory is aligned less than DWORD. A narrow load or store will be
1446         // equally slow as a single ds_read_b128/ds_write_b128, but there
1447 // will be more of them, so overall we will pay less penalty issuing a
1448 // single instruction.
1449 if (IsFast)
1450 *IsFast = Alignment >= RequiredAlignment || Alignment < Align(4);
1451 return true;
1452 }
1453
1454 break;
1455 default:
1456 if (Size > 32)
1457 return false;
1458
1459 break;
1460 }
1461
1462 if (IsFast)
1463 *IsFast = Alignment >= RequiredAlignment;
1464
1465 return Alignment >= RequiredAlignment ||
1466 Subtarget->hasUnalignedDSAccessEnabled();
1467 }
1468
1469 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
1470 bool AlignedBy4 = Alignment >= Align(4);
1471 if (IsFast)
1472 *IsFast = AlignedBy4;
1473
1474 return AlignedBy4 ||
1475 Subtarget->enableFlatScratch() ||
1476 Subtarget->hasUnalignedScratchAccess();
1477 }
1478
1479 // FIXME: We have to be conservative here and assume that flat operations
1480 // will access scratch. If we had access to the IR function, then we
1481 // could determine if any private memory was used in the function.
1482 if (AddrSpace == AMDGPUAS::FLAT_ADDRESS &&
1483 !Subtarget->hasUnalignedScratchAccess()) {
1484 bool AlignedBy4 = Alignment >= Align(4);
1485 if (IsFast)
1486 *IsFast = AlignedBy4;
1487
1488 return AlignedBy4;
1489 }
1490
1491 if (Subtarget->hasUnalignedBufferAccessEnabled()) {
1492 // If we have a uniform constant load, it still requires using a slow
1493 // buffer instruction if unaligned.
1494 if (IsFast) {
1495 // Accesses can really be issued as 1-byte aligned or 4-byte aligned, so
1496 // 2-byte alignment is worse than 1 unless doing a 2-byte access.
1497 *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
1498 AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ?
1499 Alignment >= Align(4) : Alignment != Align(2);
1500 }
1501
1502 return true;
1503 }
1504
1505 // Values smaller than a dword must be aligned.
1506 if (Size < 32)
1507 return false;
1508
1509 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1510 // byte-address are ignored, thus forcing Dword alignment.
1511 // This applies to private, global, and constant memory.
1512 if (IsFast)
1513 *IsFast = true;
1514
1515 return Size >= 32 && Alignment >= Align(4);
1516 }
1517
1518 bool SITargetLowering::allowsMisalignedMemoryAccesses(
1519 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1520 bool *IsFast) const {
1521 bool Allow = allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace,
1522 Alignment, Flags, IsFast);
1523
1524 if (Allow && IsFast && Subtarget->hasUnalignedDSAccessEnabled() &&
1525 (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1526 AddrSpace == AMDGPUAS::REGION_ADDRESS)) {
1527 // Report it as fast (a lie) if +unaligned-access-mode is passed so that DS
1528 // accesses get vectorized. We could use ds_read2_b*/ds_write2_b* instructions
1529 // on misaligned data, which is faster than a pair of ds_read_b*/ds_write_b*
1530 // which would be equally misaligned.
1531 // This is only used by the common passes, selection always calls the
1532 // allowsMisalignedMemoryAccessesImpl version.
1533 *IsFast = true;
1534 }
1535
1536 return Allow;
1537 }
1538
1539 EVT SITargetLowering::getOptimalMemOpType(
1540 const MemOp &Op, const AttributeList &FuncAttributes) const {
1541 // FIXME: Should account for address space here.
1542
1543 // The default fallback uses the private pointer size as a guess for a type to
1544 // use. Make sure we switch these to 64-bit accesses.
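// A sketch of the resulting expansion (illustrative sizes, not from a test): a
// 32-byte memcpy whose destination is only 4-byte aligned would be emitted as
// two v4i32 (16-byte) load/store pairs rather than eight i32 operations.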
1545
1546 if (Op.size() >= 16 &&
1547 Op.isDstAligned(Align(4))) // XXX: Should only do for global
1548 return MVT::v4i32;
1549
1550 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
1551 return MVT::v2i32;
1552
1553 // Use the default.
1554 return MVT::Other;
1555 }
1556
1557 bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
1558 const MemSDNode *MemNode = cast<MemSDNode>(N);
1559 return MemNode->getMemOperand()->getFlags() & MONoClobber;
1560 }
1561
1562 bool SITargetLowering::isNonGlobalAddrSpace(unsigned AS) {
1563 return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS ||
1564 AS == AMDGPUAS::PRIVATE_ADDRESS;
1565 }
1566
1567 bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS,
1568 unsigned DestAS) const {
1569 // Flat -> private/local is a simple truncate.
1570 // Flat -> global is a no-op.
1571 if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
1572 return true;
1573
1574 const GCNTargetMachine &TM =
1575 static_cast<const GCNTargetMachine &>(getTargetMachine());
1576 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
1577 }
1578
1579 bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
1580 const MemSDNode *MemNode = cast<MemSDNode>(N);
1581
1582 return AMDGPUInstrInfo::isUniformMMO(MemNode->getMemOperand());
1583 }
1584
1585 TargetLoweringBase::LegalizeTypeAction
1586 SITargetLowering::getPreferredVectorAction(MVT VT) const {
1587 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
1588 VT.getScalarType().bitsLE(MVT::i16))
1589 return VT.isPow2VectorType() ? TypeSplitVector : TypeWidenVector;
1590 return TargetLoweringBase::getPreferredVectorAction(VT);
1591 }
1592
1593 bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
1594 Type *Ty) const {
1595 // FIXME: Could be smarter if called for vector constants.
1596 return true;
1597 }
1598
1599 bool SITargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
1600 unsigned Index) const {
1601 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
1602 return false;
1603
1604 // TODO: Add more cases that are cheap.
1605 return Index == 0;
1606 }
1607
1608 bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
1609 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
1610 switch (Op) {
1611 case ISD::LOAD:
1612 case ISD::STORE:
1613
1614 // These operations are done with 32-bit instructions anyway.
1615 case ISD::AND:
1616 case ISD::OR:
1617 case ISD::XOR:
1618 case ISD::SELECT:
1619 // TODO: Extensions?
1620 return true;
1621 default:
1622 return false;
1623 }
1624 }
1625
1626 // SimplifySetCC uses this function to determine whether or not it should
1627 // create setcc with i1 operands. We don't have instructions for i1 setcc.
1628 if (VT == MVT::i1 && Op == ISD::SETCC)
1629 return false;
1630
1631 return TargetLowering::isTypeDesirableForOp(Op, VT);
1632 }
1633
1634 SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
1635 const SDLoc &SL,
1636 SDValue Chain,
1637 uint64_t Offset) const {
1638 const DataLayout &DL = DAG.getDataLayout();
1639 MachineFunction &MF = DAG.getMachineFunction();
1640 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
1641
1642 const ArgDescriptor *InputPtrReg;
1643 const TargetRegisterClass *RC;
1644 LLT ArgTy;
1645 MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
1646
1647 std::tie(InputPtrReg, RC, ArgTy) =
1648 Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
1649
1650 // We may not have the kernarg segment argument if we have no kernel
1651 // arguments.
1652 if (!InputPtrReg)
1653 return DAG.getConstant(0, SL, PtrVT);
1654
1655 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
1656 SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
1657 MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
1658
1659 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::Fixed(Offset));
1660 }
1661
1662 SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
1663 const SDLoc &SL) const {
1664 uint64_t Offset = getImplicitParameterOffset(DAG.getMachineFunction(),
1665 FIRST_IMPLICIT);
1666 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
1667 }
1668
1669 SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
1670 const SDLoc &SL) const {
1671
1672 Function &F = DAG.getMachineFunction().getFunction();
1673 Optional<uint32_t> KnownSize =
1674 AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
1675 if (KnownSize.has_value())
1676 return DAG.getConstant(KnownSize.value(), SL, MVT::i32);
1677 return SDValue();
1678 }
1679
1680 SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
1681 const SDLoc &SL, SDValue Val,
1682 bool Signed,
1683 const ISD::InputArg *Arg) const {
1684 // First, if it is a widened vector, narrow it.
1685 if (VT.isVector() &&
1686 VT.getVectorNumElements() != MemVT.getVectorNumElements()) {
1687 EVT NarrowedVT =
1688 EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(),
1689 VT.getVectorNumElements());
1690 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
1691 DAG.getConstant(0, SL, MVT::i32));
1692 }
1693
1694 // Then convert the vector elements or scalar value.
1695 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
1696 VT.bitsLT(MemVT)) {
1697 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
1698 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
1699 }
1700
1701 if (MemVT.isFloatingPoint())
1702 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
1703 else if (Signed)
1704 Val = DAG.getSExtOrTrunc(Val, SL, VT);
1705 else
1706 Val = DAG.getZExtOrTrunc(Val, SL, VT);
1707
1708 return Val;
1709 }
1710
1711 SDValue SITargetLowering::lowerKernargMemParameter(
1712 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
1713 uint64_t Offset, Align Alignment, bool Signed,
1714 const ISD::InputArg *Arg) const {
1715 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
1716
1717 // Try to avoid using an extload by loading earlier than the argument address,
1718 // and extracting the relevant bits. The load should hopefully be merged with
1719 // the previous argument.
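// For example (illustrative numbers): an i16 argument at kernarg offset 6 loads
// the dword at offset 4 (AlignDownOffset), shifts it right by
// OffsetDiff * 8 = 16 bits, and truncates to i16, avoiding a 2-byte extload.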
1720 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
1721 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
1722 int64_t AlignDownOffset = alignDown(Offset, 4);
1723 int64_t OffsetDiff = Offset - AlignDownOffset;
1724
1725 EVT IntVT = MemVT.changeTypeToInteger();
1726
1727 // TODO: If we passed in the base kernel offset we could have a better
1728 // alignment than 4, but we don't really need it.
1729 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
1730 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
1731 MachineMemOperand::MODereferenceable |
1732 MachineMemOperand::MOInvariant);
1733
1734 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
1735 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
1736
1737 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
1738 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
1739 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
1740
1741
1742 return DAG.getMergeValues({ ArgVal, Load.getValue(1) }, SL);
1743 }
1744
1745 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
1746 SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
1747 MachineMemOperand::MODereferenceable |
1748 MachineMemOperand::MOInvariant);
1749
1750 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
1751 return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
1752 }
1753
1754 SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
1755 const SDLoc &SL, SDValue Chain,
1756 const ISD::InputArg &Arg) const {
1757 MachineFunction &MF = DAG.getMachineFunction();
1758 MachineFrameInfo &MFI = MF.getFrameInfo();
1759
1760 if (Arg.Flags.isByVal()) {
1761 unsigned Size = Arg.Flags.getByValSize();
1762 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
1763 return DAG.getFrameIndex(FrameIdx, MVT::i32);
1764 }
1765
1766 unsigned ArgOffset = VA.getLocMemOffset();
1767 unsigned ArgSize = VA.getValVT().getStoreSize();
1768
1769 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
1770
1771 // Create load nodes to retrieve arguments from the stack.
1772 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
1773 SDValue ArgValue;
1774
1775 // For NON_EXTLOAD, the generic code in getLoad asserts that ValVT == MemVT.
1776 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
1777 MVT MemVT = VA.getValVT();
1778
1779 switch (VA.getLocInfo()) {
1780 default:
1781 break;
1782 case CCValAssign::BCvt:
1783 MemVT = VA.getLocVT();
1784 break;
1785 case CCValAssign::SExt:
1786 ExtType = ISD::SEXTLOAD;
1787 break;
1788 case CCValAssign::ZExt:
1789 ExtType = ISD::ZEXTLOAD;
1790 break;
1791 case CCValAssign::AExt:
1792 ExtType = ISD::EXTLOAD;
1793 break;
1794 }
1795
1796 ArgValue = DAG.getExtLoad(
1797 ExtType, SL, VA.getLocVT(), Chain, FIN,
1798 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
1799 MemVT);
1800 return ArgValue;
1801 }
1802
1803 SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
1804 const SIMachineFunctionInfo &MFI,
1805 EVT VT,
1806 AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
1807 const ArgDescriptor *Reg;
1808 const TargetRegisterClass *RC;
1809 LLT Ty;
1810
1811 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
1812 if (!Reg) {
1813 if (PVID == AMDGPUFunctionArgInfo::PreloadedValue::KERNARG_SEGMENT_PTR) {
1814 // It's possible for a kernarg intrinsic call to appear in a kernel with
1815 // no allocated segment, in which case we do not add the user sgpr
1816 // argument, so just return null.
1817 return DAG.getConstant(0, SDLoc(), VT);
1818 }
1819
1820 // It's undefined behavior if a function marked with the amdgpu-no-*
1821 // attributes uses the corresponding intrinsic.
1822 return DAG.getUNDEF(VT);
1823 }
1824
1825 return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT);
1826 }
1827
1828 static void processPSInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
1829 CallingConv::ID CallConv,
1830 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
1831 FunctionType *FType,
1832 SIMachineFunctionInfo *Info) {
1833 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
1834 const ISD::InputArg *Arg = &Ins[I];
1835
1836 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
1837 "vector type argument should have been split");
1838
1839 // First check if it's a PS input addr.
1840 if (CallConv == CallingConv::AMDGPU_PS &&
1841 !Arg->Flags.isInReg() && PSInputNum <= 15) {
1842 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
1843
1844 // Inconveniently only the first part of the split is marked as isSplit,
1845 // so skip to the end. We only want to increment PSInputNum once for the
1846 // entire split argument.
1847 if (Arg->Flags.isSplit()) {
1848 while (!Arg->Flags.isSplitEnd()) {
1849 assert((!Arg->VT.isVector() ||
1850 Arg->VT.getScalarSizeInBits() == 16) &&
1851 "unexpected vector split in ps argument type");
1852 if (!SkipArg)
1853 Splits.push_back(*Arg);
1854 Arg = &Ins[++I];
1855 }
1856 }
1857
1858 if (SkipArg) {
1859 // We can safely skip PS inputs.
1860 Skipped.set(Arg->getOrigArgIndex());
1861 ++PSInputNum;
1862 continue;
1863 }
1864
1865 Info->markPSInputAllocated(PSInputNum);
1866 if (Arg->Used)
1867 Info->markPSInputEnabled(PSInputNum);
1868
1869 ++PSInputNum;
1870 }
1871
1872 Splits.push_back(*Arg);
1873 }
1874 }
1875
1876 // Allocate special inputs passed in VGPRs.
1877 void SITargetLowering::allocateSpecialEntryInputVGPRs(CCState &CCInfo,
1878 MachineFunction &MF,
1879 const SIRegisterInfo &TRI,
1880 SIMachineFunctionInfo &Info) const {
1881 const LLT S32 = LLT::scalar(32);
1882 MachineRegisterInfo &MRI = MF.getRegInfo();
1883
1884 if (Info.hasWorkItemIDX()) {
1885 Register Reg = AMDGPU::VGPR0;
1886 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
1887
1888 CCInfo.AllocateReg(Reg);
1889 unsigned Mask = (Subtarget->hasPackedTID() &&
1890 Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
1891 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
1892 }
1893
1894 if (Info.hasWorkItemIDY()) {
1895 assert(Info.hasWorkItemIDX());
1896 if (Subtarget->hasPackedTID()) {
1897 Info.setWorkItemIDY(ArgDescriptor::createRegister(AMDGPU::VGPR0,
1898 0x3ff << 10));
1899 } else {
1900 unsigned Reg = AMDGPU::VGPR1;
1901 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
1902
1903 CCInfo.AllocateReg(Reg);
1904 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
1905 }
1906 }
1907
1908 if (Info.hasWorkItemIDZ()) {
1909 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
1910 if (Subtarget->hasPackedTID()) {
1911 Info.setWorkItemIDZ(ArgDescriptor::createRegister(AMDGPU::VGPR0,
1912 0x3ff << 20));
1913 } else {
1914 unsigned Reg = AMDGPU::VGPR2;
1915 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
1916
1917 CCInfo.AllocateReg(Reg);
1918 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
1919 }
1920 }
1921 }
1922
1923 // Try to allocate a VGPR at the end of the argument list, or, if no argument
1924 // VGPRs are left, allocate a stack slot instead.
1925 // If \p Mask is given, it indicates the bitfield position in the register.
1926 // If \p Arg is given, use it with the new \p Mask instead of allocating a new one.
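// For example, when the IDs are packed, the caller passes the descriptor already
// created for the X ID together with Mask << 10 (Y) or Mask << 20 (Z), so all
// three IDs share one VGPR instead of allocating new ones.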
1927 static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
1928 ArgDescriptor Arg = ArgDescriptor()) {
1929 if (Arg.isSet())
1930 return ArgDescriptor::createArg(Arg, Mask);
1931
1932 ArrayRef<MCPhysReg> ArgVGPRs
1933 = makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
1934 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
1935 if (RegIdx == ArgVGPRs.size()) {
1936 // Spill to stack required.
1937 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
1938
1939 return ArgDescriptor::createStack(Offset, Mask);
1940 }
1941
1942 unsigned Reg = ArgVGPRs[RegIdx];
1943 Reg = CCInfo.AllocateReg(Reg);
1944 assert(Reg != AMDGPU::NoRegister);
1945
1946 MachineFunction &MF = CCInfo.getMachineFunction();
1947 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
1948 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
1949 return ArgDescriptor::createRegister(Reg, Mask);
1950 }
1951
1952 static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
1953 const TargetRegisterClass *RC,
1954 unsigned NumArgRegs) {
1955 ArrayRef<MCPhysReg> ArgSGPRs = makeArrayRef(RC->begin(), 32);
1956 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
1957 if (RegIdx == ArgSGPRs.size())
1958 report_fatal_error("ran out of SGPRs for arguments");
1959
1960 unsigned Reg = ArgSGPRs[RegIdx];
1961 Reg = CCInfo.AllocateReg(Reg);
1962 assert(Reg != AMDGPU::NoRegister);
1963
1964 MachineFunction &MF = CCInfo.getMachineFunction();
1965 MF.addLiveIn(Reg, RC);
1966 return ArgDescriptor::createRegister(Reg);
1967 }
1968
1969 // If this has a fixed position, we still should allocate the register in the
1970 // CCInfo state. Technically we could get away with this for values passed
1971 // outside of the normal argument range.
1972 static void allocateFixedSGPRInputImpl(CCState &CCInfo,
1973 const TargetRegisterClass *RC,
1974 MCRegister Reg) {
1975 Reg = CCInfo.AllocateReg(Reg);
1976 assert(Reg != AMDGPU::NoRegister);
1977 MachineFunction &MF = CCInfo.getMachineFunction();
1978 MF.addLiveIn(Reg, RC);
1979 }
1980
1981 static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
1982 if (Arg) {
1983 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
1984 Arg.getRegister());
1985 } else
1986 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
1987 }
1988
1989 static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
1990 if (Arg) {
1991 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
1992 Arg.getRegister());
1993 } else
1994 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
1995 }
1996
1997 /// Allocate implicit function VGPR arguments at the end of allocated user
1998 /// arguments.
1999 void SITargetLowering::allocateSpecialInputVGPRs(
2000 CCState &CCInfo, MachineFunction &MF,
2001 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2002 const unsigned Mask = 0x3ff;
2003 ArgDescriptor Arg;
2004
2005 if (Info.hasWorkItemIDX()) {
2006 Arg = allocateVGPR32Input(CCInfo, Mask);
2007 Info.setWorkItemIDX(Arg);
2008 }
2009
2010 if (Info.hasWorkItemIDY()) {
2011 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2012 Info.setWorkItemIDY(Arg);
2013 }
2014
2015 if (Info.hasWorkItemIDZ())
2016 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2017 }
2018
2019 /// Allocate implicit function VGPR arguments in fixed registers.
2020 void SITargetLowering::allocateSpecialInputVGPRsFixed(
2021 CCState &CCInfo, MachineFunction &MF,
2022 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2023 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2024 if (!Reg)
2025 report_fatal_error("failed to allocate VGPR for implicit arguments");
2026
2027 const unsigned Mask = 0x3ff;
2028 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2029 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2030 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2031 }
2032
2033 void SITargetLowering::allocateSpecialInputSGPRs(
2034 CCState &CCInfo,
2035 MachineFunction &MF,
2036 const SIRegisterInfo &TRI,
2037 SIMachineFunctionInfo &Info) const {
2038 auto &ArgInfo = Info.getArgInfo();
2039
2040 // TODO: Unify handling with private memory pointers.
2041 if (Info.hasDispatchPtr())
2042 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2043
2044 if (Info.hasQueuePtr() && AMDGPU::getAmdhsaCodeObjectVersion() < 5)
2045 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2046
2047 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2048 // constant offset from the kernarg segment.
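// For example (illustrative): in a callee, llvm.amdgcn.implicitarg.ptr is
// lowered from this SGPR pair directly, instead of being rederived from the
// kernarg segment pointer as it is in kernels.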
2049 if (Info.hasImplicitArgPtr())
2050 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2051
2052 if (Info.hasDispatchID())
2053 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2054
2055 // flat_scratch_init is not applicable for non-kernel functions.
2056
2057 if (Info.hasWorkGroupIDX())
2058 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2059
2060 if (Info.hasWorkGroupIDY())
2061 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2062
2063 if (Info.hasWorkGroupIDZ())
2064 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2065
2066 if (Info.hasLDSKernelId())
2067 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2068 }
2069
2070 // Allocate special inputs passed in user SGPRs.
2071 void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
2072 MachineFunction &MF,
2073 const SIRegisterInfo &TRI,
2074 SIMachineFunctionInfo &Info) const {
2075 if (Info.hasImplicitBufferPtr()) {
2076 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2077 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2078 CCInfo.AllocateReg(ImplicitBufferPtrReg);
2079 }
2080
2081 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2082 if (Info.hasPrivateSegmentBuffer()) {
2083 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2084 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2085 CCInfo.AllocateReg(PrivateSegmentBufferReg);
2086 }
2087
2088 if (Info.hasDispatchPtr()) {
2089 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2090 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2091 CCInfo.AllocateReg(DispatchPtrReg);
2092 }
2093
2094 if (Info.hasQueuePtr() && AMDGPU::getAmdhsaCodeObjectVersion() < 5) {
2095 Register QueuePtrReg = Info.addQueuePtr(TRI);
2096 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2097 CCInfo.AllocateReg(QueuePtrReg);
2098 }
2099
2100 if (Info.hasKernargSegmentPtr()) {
2101 MachineRegisterInfo &MRI = MF.getRegInfo();
2102 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2103 CCInfo.AllocateReg(InputPtrReg);
2104
2105 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2106 MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2107 }
2108
2109 if (Info.hasDispatchID()) {
2110 Register DispatchIDReg = Info.addDispatchID(TRI);
2111 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2112 CCInfo.AllocateReg(DispatchIDReg);
2113 }
2114
2115 if (Info.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2116 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2117 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2118 CCInfo.AllocateReg(FlatScratchInitReg);
2119 }
2120
2121 if (Info.hasLDSKernelId()) {
2122 Register Reg = Info.addLDSKernelId();
2123 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2124 CCInfo.AllocateReg(Reg);
2125 }
2126
2127 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2128 // these from the dispatch pointer.
2129 }
2130
2131 // Allocate special input registers that are initialized per-wave.
2132 void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo,
2133 MachineFunction &MF,
2134 SIMachineFunctionInfo &Info,
2135 CallingConv::ID CallConv,
2136 bool IsShader) const {
2137 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
2138 // Note: user SGPRs are handled by the front-end for graphics shaders.
2139 // Pad up the used user SGPRs with dead inputs.
2140 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
2141
2142 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
2143 // rely on it to reach 16 since if we end up having no stack usage, it will
2144 // not really be added.
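// For example (illustrative): a kernel preloading only the kernarg segment
// pointer (2 user SGPRs) and needing just work-group ID X gets 13 reserved dead
// user SGPRs added here so that all 16 are initialized.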
2145 unsigned NumRequiredSystemSGPRs = Info.hasWorkGroupIDX() +
2146 Info.hasWorkGroupIDY() +
2147 Info.hasWorkGroupIDZ() +
2148 Info.hasWorkGroupInfo();
2149 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
2150 Register Reg = Info.addReservedUserSGPR();
2151 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2152 CCInfo.AllocateReg(Reg);
2153 }
2154 }
2155
2156 if (Info.hasWorkGroupIDX()) {
2157 Register Reg = Info.addWorkGroupIDX();
2158 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2159 CCInfo.AllocateReg(Reg);
2160 }
2161
2162 if (Info.hasWorkGroupIDY()) {
2163 Register Reg = Info.addWorkGroupIDY();
2164 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2165 CCInfo.AllocateReg(Reg);
2166 }
2167
2168 if (Info.hasWorkGroupIDZ()) {
2169 Register Reg = Info.addWorkGroupIDZ();
2170 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2171 CCInfo.AllocateReg(Reg);
2172 }
2173
2174 if (Info.hasWorkGroupInfo()) {
2175 Register Reg = Info.addWorkGroupInfo();
2176 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2177 CCInfo.AllocateReg(Reg);
2178 }
2179
2180 if (Info.hasPrivateSegmentWaveByteOffset()) {
2181 // Scratch wave offset passed in system SGPR.
2182 unsigned PrivateSegmentWaveByteOffsetReg;
2183
2184 if (IsShader) {
2185 PrivateSegmentWaveByteOffsetReg =
2186 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
2187
2188 // This is true if the scratch wave byte offset doesn't have a fixed
2189 // location.
2190 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
2191 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
2192 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
2193 }
2194 } else
2195 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
2196
2197 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
2198 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
2199 }
2200
2201 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
2202 Info.getNumPreloadedSGPRs() >= 16);
2203 }
2204
2205 static void reservePrivateMemoryRegs(const TargetMachine &TM,
2206 MachineFunction &MF,
2207 const SIRegisterInfo &TRI,
2208 SIMachineFunctionInfo &Info) {
2209 // Now that we've figured out where the scratch register inputs are, see if
2210 // we should reserve the arguments and use them directly.
2211 MachineFrameInfo &MFI = MF.getFrameInfo();
2212 bool HasStackObjects = MFI.hasStackObjects();
2213 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2214
2215 // Record that we know we have non-spill stack objects so we don't need to
2216 // check all stack objects later.
2217 if (HasStackObjects)
2218 Info.setHasNonSpillStackObjects(true);
2219
2220 // Everything live out of a block is spilled with fast regalloc, so it's
2221 // almost certain that spilling will be required.
2222 if (TM.getOptLevel() == CodeGenOpt::None)
2223 HasStackObjects = true;
2224
2225 // For now assume stack access is needed in any callee functions, so we need
2226 // the scratch registers to pass in.
2227 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
2228
2229 if (!ST.enableFlatScratch()) {
2230 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
2231 // If we have stack objects, we unquestionably need the private buffer
2232 // resource. For the Code Object V2 ABI, this will be the first 4 user
2233 // SGPR inputs. We can reserve those and use them directly.
2234
2235 Register PrivateSegmentBufferReg =
2236 Info.getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
2237 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
2238 } else {
2239 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
2240 // We tentatively reserve the highest available registers (skipping the very
2241 // last ones, which may contain VCC, FLAT_SCR, and XNACK). After register
2242 // allocation, we'll replace these with the ones immediately after those which
2243 // were really allocated. In the prologue, copies will be inserted from the
2244 // argument to these reserved registers.
2245
2246 // Without HSA, relocations are used for the scratch pointer and the
2247 // buffer resource setup is always inserted in the prologue. Scratch wave
2248 // offset is still in an input SGPR.
2249 Info.setScratchRSrcReg(ReservedBufferReg);
2250 }
2251 }
2252
2253 MachineRegisterInfo &MRI = MF.getRegInfo();
2254
2255 // For entry functions we have to set up the stack pointer if we use it,
2256 // whereas non-entry functions get this "for free". This means there is no
2257 // intrinsic advantage to using S32 over S34 in cases where we do not have
2258 // calls but do need a frame pointer (i.e. if we are requested to have one
2259 // because frame pointer elimination is disabled). To keep things simple we
2260 // only ever use S32 as the call ABI stack pointer, and so using it does not
2261 // imply we need a separate frame pointer.
2262 //
2263 // Try to use s32 as the SP, but move it if it would interfere with input
2264 // arguments. This won't work with calls though.
2265 //
2266 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
2267 // registers.
2268 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
2269 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
2270 } else {
2271 assert(AMDGPU::isShader(MF.getFunction().getCallingConv()));
2272
2273 if (MFI.hasCalls())
2274 report_fatal_error("call in graphics shader with too many input SGPRs");
2275
2276 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
2277 if (!MRI.isLiveIn(Reg)) {
2278 Info.setStackPtrOffsetReg(Reg);
2279 break;
2280 }
2281 }
2282
2283 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
2284 report_fatal_error("failed to find register for SP");
2285 }
2286
2287 // hasFP should be accurate for entry functions even before the frame is
2288 // finalized, because it does not rely on the known stack size, only
2289 // properties like whether variable sized objects are present.
2290 if (ST.getFrameLowering()->hasFP(MF)) {
2291 Info.setFrameOffsetReg(AMDGPU::SGPR33);
2292 }
2293 }
2294
2295 bool SITargetLowering::supportSplitCSR(MachineFunction *MF) const {
2296 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
2297 return !Info->isEntryFunction();
2298 }
2299
2300 void SITargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
2301
2302 }
2303
2304 void SITargetLowering::insertCopiesSplitCSR(
2305 MachineBasicBlock *Entry,
2306 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
2307 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2308
2309 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
2310 if (!IStart)
2311 return;
2312
2313 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2314 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
2315 MachineBasicBlock::iterator MBBI = Entry->begin();
2316 for (const MCPhysReg *I = IStart; *I; ++I) {
2317 const TargetRegisterClass *RC = nullptr;
2318 if (AMDGPU::SReg_64RegClass.contains(*I))
2319 RC = &AMDGPU::SGPR_64RegClass;
2320 else if (AMDGPU::SReg_32RegClass.contains(*I))
2321 RC = &AMDGPU::SGPR_32RegClass;
2322 else
2323 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2324
2325 Register NewVR = MRI->createVirtualRegister(RC);
2326 // Create copy from CSR to a virtual register.
2327 Entry->addLiveIn(*I);
2328 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
2329 .addReg(*I);
2330
2331 // Insert the copy-back instructions right before the terminator.
2332 for (auto *Exit : Exits)
2333 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
2334 TII->get(TargetOpcode::COPY), *I)
2335 .addReg(NewVR);
2336 }
2337 }
2338
2339 SDValue SITargetLowering::LowerFormalArguments(
2340 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2341 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2342 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2343 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2344
2345 MachineFunction &MF = DAG.getMachineFunction();
2346 const Function &Fn = MF.getFunction();
2347 FunctionType *FType = MF.getFunction().getFunctionType();
2348 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2349
2350 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
2351 DiagnosticInfoUnsupported NoGraphicsHSA(
2352 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
2353 DAG.getContext()->diagnose(NoGraphicsHSA);
2354 return DAG.getEntryNode();
2355 }
2356
2357 Info->allocateModuleLDSGlobal(Fn);
2358
2359 SmallVector<ISD::InputArg, 16> Splits;
2360 SmallVector<CCValAssign, 16> ArgLocs;
2361 BitVector Skipped(Ins.size());
2362 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2363 *DAG.getContext());
2364
2365 bool IsGraphics = AMDGPU::isGraphics(CallConv);
2366 bool IsKernel = AMDGPU::isKernel(CallConv);
2367 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
2368
2369 if (IsGraphics) {
2370 assert(!Info->hasDispatchPtr() && !Info->hasKernargSegmentPtr() &&
2371 (!Info->hasFlatScratchInit() || Subtarget->enableFlatScratch()) &&
2372 !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
2373 !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
2374 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
2375 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
2376 }
2377
2378 if (CallConv == CallingConv::AMDGPU_PS) {
2379 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
2380
2381 // At least one interpolation mode must be enabled or else the GPU will
2382 // hang.
2383 //
2384 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
2385 // set PSInputAddr, the user wants to enable some bits after the compilation
2386 // based on run-time states. Since we can't know what the final PSInputEna
2387 // will look like, so we shouldn't do anything here and the user should take
2388 // responsibility for the correct programming.
2389 //
2390 // Otherwise, the following restrictions apply:
2391 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
2392 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
2393 // enabled too.
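// For example (illustrative): a pixel shader whose PSInputAddr is 0x800 (only
// POS_W_FLOAT) would enable no PERSP_*/LINEAR_* input; the check below
// force-enables PERSP_SAMPLE (input 0) and reserves VGPR0/VGPR1 for it.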
2394 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
2395 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
2396 CCInfo.AllocateReg(AMDGPU::VGPR0);
2397 CCInfo.AllocateReg(AMDGPU::VGPR1);
2398 Info->markPSInputAllocated(0);
2399 Info->markPSInputEnabled(0);
2400 }
2401 if (Subtarget->isAmdPalOS()) {
2402 // For isAmdPalOS, the user does not enable some bits after compilation
2403 // based on run-time states; the register values being generated here are
2404 // the final ones set in hardware. Therefore we need to apply the
2405 // workaround to PSInputAddr and PSInputEnable together. (The case where
2406 // a bit is set in PSInputAddr but not PSInputEnable is where the
2407 // frontend set up an input arg for a particular interpolation mode, but
2408 // nothing uses that input arg. Really we should have an earlier pass
2409 // that removes such an arg.)
2410 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
2411 if ((PsInputBits & 0x7F) == 0 ||
2412 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
2413 Info->markPSInputEnabled(
2414 countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
2415 }
2416 } else if (IsKernel) {
2417 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
2418 } else {
2419 Splits.append(Ins.begin(), Ins.end());
2420 }
2421
2422 if (IsEntryFunc) {
2423 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
2424 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
2425 } else if (!IsGraphics) {
2426 // For the fixed ABI, pass workitem IDs in the last argument register.
2427 allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
2428 }
2429
2430 if (IsKernel) {
2431 analyzeFormalArgumentsCompute(CCInfo, Ins);
2432 } else {
2433 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
2434 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
2435 }
2436
2437 SmallVector<SDValue, 16> Chains;
2438
2439 // FIXME: This is the minimum kernel argument alignment. We should improve
2440 // this to the maximum alignment of the arguments.
2441 //
2442 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
2443 // kern arg offset.
2444 const Align KernelArgBaseAlign = Align(16);
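// For example, commonAlignment(Align(16), Offset) below yields Align(4) for an
// argument at offset 4 and Align(16) for one at offset 32 (illustrative
// offsets).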
2445
2446 for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
2447 const ISD::InputArg &Arg = Ins[i];
2448 if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
2449 InVals.push_back(DAG.getUNDEF(Arg.VT));
2450 continue;
2451 }
2452
2453 CCValAssign &VA = ArgLocs[ArgIdx++];
2454 MVT VT = VA.getLocVT();
2455
2456 if (IsEntryFunc && VA.isMemLoc()) {
2457 VT = Ins[i].VT;
2458 EVT MemVT = VA.getLocVT();
2459
2460 const uint64_t Offset = VA.getLocMemOffset();
2461 Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
2462
2463 if (Arg.Flags.isByRef()) {
2464 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
2465
2466 const GCNTargetMachine &TM =
2467 static_cast<const GCNTargetMachine &>(getTargetMachine());
2468 if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
2469 Arg.Flags.getPointerAddrSpace())) {
2470 Ptr = DAG.getAddrSpaceCast(DL, VT, Ptr, AMDGPUAS::CONSTANT_ADDRESS,
2471 Arg.Flags.getPointerAddrSpace());
2472 }
2473
2474 InVals.push_back(Ptr);
2475 continue;
2476 }
2477
2478 SDValue Arg = lowerKernargMemParameter(
2479 DAG, VT, MemVT, DL, Chain, Offset, Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
2480 Chains.push_back(Arg.getValue(1));
2481
2482 auto *ParamTy =
2483 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
2484 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
2485 ParamTy && (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
2486 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
2487 // On SI, local pointers are just offsets into LDS, so they are always
2488 // less than 16 bits wide. On CI and newer they could potentially be
2489 // real pointers, so we can't guarantee their size.
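// (LDS on SI is at most 64 KiB, so a group-segment offset always fits in 16
// bits; that is what justifies the AssertZext below.)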
2490 Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
2491 DAG.getValueType(MVT::i16));
2492 }
2493
2494 InVals.push_back(Arg);
2495 continue;
2496 } else if (!IsEntryFunc && VA.isMemLoc()) {
2497 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
2498 InVals.push_back(Val);
2499 if (!Arg.Flags.isByVal())
2500 Chains.push_back(Val.getValue(1));
2501 continue;
2502 }
2503
2504 assert(VA.isRegLoc() && "Parameter must be in a register!");
2505
2506 Register Reg = VA.getLocReg();
2507 const TargetRegisterClass *RC = nullptr;
2508 if (AMDGPU::VGPR_32RegClass.contains(Reg))
2509 RC = &AMDGPU::VGPR_32RegClass;
2510 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
2511 RC = &AMDGPU::SGPR_32RegClass;
2512 else
2513 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
2514 EVT ValVT = VA.getValVT();
2515
2516 Reg = MF.addLiveIn(Reg, RC);
2517 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
2518
2519 if (Arg.Flags.isSRet()) {
2520 // The return object should be reasonably addressable.
2521
2522 // FIXME: This helps when the return is a real sret. If it is an
2523 // automatically inserted sret (i.e. CanLowerReturn returns false), an
2524 // extra copy is inserted in SelectionDAGBuilder which obscures this.
2525 unsigned NumBits
2526 = 32 - getSubtarget()->getKnownHighZeroBitsForFrameIndex();
2527 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
2528 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
2529 }
2530
2531 // If this is an 8 or 16-bit value, it is really passed promoted
2532 // to 32 bits. Insert an assert[sz]ext to capture this, then
2533 // truncate to the right size.
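// For example (illustrative): an i16 argument with ZExt loc info arrives
// zero-extended in a 32-bit register; we emit AssertZext with value type i16
// followed by a TRUNCATE back to i16.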
2534 switch (VA.getLocInfo()) {
2535 case CCValAssign::Full:
2536 break;
2537 case CCValAssign::BCvt:
2538 Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
2539 break;
2540 case CCValAssign::SExt:
2541 Val = DAG.getNode(ISD::AssertSext, DL, VT, Val,
2542 DAG.getValueType(ValVT));
2543 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
2544 break;
2545 case CCValAssign::ZExt:
2546 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
2547 DAG.getValueType(ValVT));
2548 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
2549 break;
2550 case CCValAssign::AExt:
2551 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
2552 break;
2553 default:
2554 llvm_unreachable("Unknown loc info!");
2555 }
2556
2557 InVals.push_back(Val);
2558 }
2559
2560 // Start adding system SGPRs.
2561 if (IsEntryFunc) {
2562 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
2563 } else {
2564 CCInfo.AllocateReg(Info->getScratchRSrcReg());
2565 if (!IsGraphics)
2566 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
2567 }
2568
2569 auto &ArgUsageInfo =
2570 DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
2571 ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
2572
2573 unsigned StackArgSize = CCInfo.getNextStackOffset();
2574 Info->setBytesInStackArgArea(StackArgSize);
2575
2576 return Chains.empty() ? Chain :
2577 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
2578 }
2579
2580 // TODO: If return values can't fit in registers, we should return as many as
2581 // possible in registers before passing on stack.
2582 bool SITargetLowering::CanLowerReturn(
2583 CallingConv::ID CallConv,
2584 MachineFunction &MF, bool IsVarArg,
2585 const SmallVectorImpl<ISD::OutputArg> &Outs,
2586 LLVMContext &Context) const {
2587 // Replacing returns with sret/stack usage doesn't make sense for shaders.
2588 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
2589 // for shaders. Vector types should be explicitly handled by CC.
2590 if (AMDGPU::isEntryFunctionCC(CallConv))
2591 return true;
2592
2593 SmallVector<CCValAssign, 16> RVLocs;
2594 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
2595 return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg));
2596 }
2597
2598 SDValue
2599 SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2600 bool isVarArg,
2601 const SmallVectorImpl<ISD::OutputArg> &Outs,
2602 const SmallVectorImpl<SDValue> &OutVals,
2603 const SDLoc &DL, SelectionDAG &DAG) const {
2604 MachineFunction &MF = DAG.getMachineFunction();
2605 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2606
2607 if (AMDGPU::isKernel(CallConv)) {
2608 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
2609 OutVals, DL, DAG);
2610 }
2611
2612 bool IsShader = AMDGPU::isShader(CallConv);
2613
2614 Info->setIfReturnsVoid(Outs.empty());
2615 bool IsWaveEnd = Info->returnsVoid() && IsShader;
2616
2617 // CCValAssign - represents the assignment of the return value to a location.
2618 SmallVector<CCValAssign, 48> RVLocs;
2619 SmallVector<ISD::OutputArg, 48> Splits;
2620
2621 // CCState - Info about the registers and stack slots.
2622 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2623 *DAG.getContext());
2624
2625 // Analyze outgoing return values.
2626 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
2627
2628 SDValue Flag;
2629 SmallVector<SDValue, 48> RetOps;
2630 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2631
2632 // Copy the result values into the output registers.
2633 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
2634 ++I, ++RealRVLocIdx) {
2635 CCValAssign &VA = RVLocs[I];
2636 assert(VA.isRegLoc() && "Can only return in registers!");
2637 // TODO: Partially return in registers if return values don't fit.
2638 SDValue Arg = OutVals[RealRVLocIdx];
2639
2640 // Copied from other backends.
2641 switch (VA.getLocInfo()) {
2642 case CCValAssign::Full:
2643 break;
2644 case CCValAssign::BCvt:
2645 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
2646 break;
2647 case CCValAssign::SExt:
2648 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
2649 break;
2650 case CCValAssign::ZExt:
2651 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
2652 break;
2653 case CCValAssign::AExt:
2654 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
2655 break;
2656 default:
2657 llvm_unreachable("Unknown loc info!");
2658 }
2659
2660 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
2661 Flag = Chain.getValue(1);
2662 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
2663 }
2664
2665 // FIXME: Does sret work properly?
2666 if (!Info->isEntryFunction()) {
2667 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
2668 const MCPhysReg *I =
2669 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2670 if (I) {
2671 for (; *I; ++I) {
2672 if (AMDGPU::SReg_64RegClass.contains(*I))
2673 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2674 else if (AMDGPU::SReg_32RegClass.contains(*I))
2675 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
2676 else
2677 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2678 }
2679 }
2680 }
2681
2682 // Update chain and glue.
2683 RetOps[0] = Chain;
2684 if (Flag.getNode())
2685 RetOps.push_back(Flag);
2686
2687 unsigned Opc = AMDGPUISD::ENDPGM;
2688 if (!IsWaveEnd)
2689 Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_FLAG;
2690 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
2691 }
2692
2693 SDValue SITargetLowering::LowerCallResult(
2694 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg,
2695 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2696 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
2697 SDValue ThisVal) const {
2698 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
2699
2700 // Assign locations to each value returned by this call.
2701 SmallVector<CCValAssign, 16> RVLocs;
2702 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
2703 *DAG.getContext());
2704 CCInfo.AnalyzeCallResult(Ins, RetCC);
2705
2706 // Copy all of the result registers out of their specified physreg.
2707 for (unsigned i = 0; i != RVLocs.size(); ++i) {
2708 CCValAssign VA = RVLocs[i];
2709 SDValue Val;
2710
2711 if (VA.isRegLoc()) {
2712 Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
2713 Chain = Val.getValue(1);
2714 InFlag = Val.getValue(2);
2715 } else if (VA.isMemLoc()) {
2716 report_fatal_error("TODO: return values in memory");
2717 } else
2718 llvm_unreachable("unknown argument location type");
2719
2720 switch (VA.getLocInfo()) {
2721 case CCValAssign::Full:
2722 break;
2723 case CCValAssign::BCvt:
2724 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
2725 break;
2726 case CCValAssign::ZExt:
2727 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
2728 DAG.getValueType(VA.getValVT()));
2729 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2730 break;
2731 case CCValAssign::SExt:
2732 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
2733 DAG.getValueType(VA.getValVT()));
2734 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2735 break;
2736 case CCValAssign::AExt:
2737 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
2738 break;
2739 default:
2740 llvm_unreachable("Unknown loc info!");
2741 }
2742
2743 InVals.push_back(Val);
2744 }
2745
2746 return Chain;
2747 }
2748
2749 // Add code to pass the special inputs required by the features in use,
2750 // separate from the explicit user arguments present in the IR.
2751 void SITargetLowering::passSpecialInputs(
2752 CallLoweringInfo &CLI,
2753 CCState &CCInfo,
2754 const SIMachineFunctionInfo &Info,
2755 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
2756 SmallVectorImpl<SDValue> &MemOpChains,
2757 SDValue Chain) const {
2758 // If we don't have a call site, this was a call inserted by
2759 // legalization. These can never use special inputs.
2760 if (!CLI.CB)
2761 return;
2762
2763 SelectionDAG &DAG = CLI.DAG;
2764 const SDLoc &DL = CLI.DL;
2765 const Function &F = DAG.getMachineFunction().getFunction();
2766
2767 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
2768 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
2769
2770 const AMDGPUFunctionArgInfo *CalleeArgInfo
2771 = &AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;
2772 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
2773 auto &ArgUsageInfo =
2774 DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
2775 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
2776 }
2777
2778 // TODO: Unify with private memory register handling. This is complicated by
2779 // the fact that at least in kernels, the input argument is not necessarily
2780 // in the same location as the input.
2781 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
2782 StringLiteral> ImplicitAttrs[] = {
2783 {AMDGPUFunctionArgInfo::DISPATCH_PTR, "amdgpu-no-dispatch-ptr"},
2784 {AMDGPUFunctionArgInfo::QUEUE_PTR, "amdgpu-no-queue-ptr" },
2785 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"},
2786 {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"},
2787 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
2788 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, "amdgpu-no-workgroup-id-y"},
2789 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, "amdgpu-no-workgroup-id-z"},
2790 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID, "amdgpu-no-lds-kernel-id"},
2791 };
2792
2793 for (auto Attr : ImplicitAttrs) {
2794 const ArgDescriptor *OutgoingArg;
2795 const TargetRegisterClass *ArgRC;
2796 LLT ArgTy;
2797
2798 AMDGPUFunctionArgInfo::PreloadedValue InputID = Attr.first;
2799
2800 // If the callee does not use the attribute value, skip copying the value.
2801 if (CLI.CB->hasFnAttr(Attr.second))
2802 continue;
2803
2804 std::tie(OutgoingArg, ArgRC, ArgTy) =
2805 CalleeArgInfo->getPreloadedValue(InputID);
2806 if (!OutgoingArg)
2807 continue;
2808
2809 const ArgDescriptor *IncomingArg;
2810 const TargetRegisterClass *IncomingArgRC;
2811 LLT Ty;
2812 std::tie(IncomingArg, IncomingArgRC, Ty) =
2813 CallerArgInfo.getPreloadedValue(InputID);
2814 assert(IncomingArgRC == ArgRC);
2815
2816 // All special arguments are ints for now.
2817 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
2818 SDValue InputReg;
2819
2820 if (IncomingArg) {
2821 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
2822 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
2823 // The implicit arg ptr is special because it doesn't have a corresponding
2824 // input for kernels, and is computed from the kernarg segment pointer.
2825 InputReg = getImplicitArgPtr(DAG, DL);
2826 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
2827 Optional<uint32_t> Id = AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
2828 if (Id.has_value()) {
2829 InputReg = DAG.getConstant(Id.value(), DL, ArgVT);
2830 } else {
2831 InputReg = DAG.getUNDEF(ArgVT);
2832 }
2833 } else {
2834 // We may have proven the input wasn't needed, although the ABI still
2835 // requires it. We just need to allocate the register appropriately.
2836 InputReg = DAG.getUNDEF(ArgVT);
2837 }
2838
2839 if (OutgoingArg->isRegister()) {
2840 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
2841 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
2842 report_fatal_error("failed to allocate implicit input argument");
2843 } else {
2844 unsigned SpecialArgOffset =
2845 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
2846 SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
2847 SpecialArgOffset);
2848 MemOpChains.push_back(ArgStore);
2849 }
2850 }
2851
2852 // Pack workitem IDs into a single register, or pass them as-is if already
2853 // packed.
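// A sketch of the packed form built below (assuming all three IDs are live and
// arrive unpacked):
//   packed = X | (Y << 10) | (Z << 20)
// matching the [9:0]/[19:10]/[29:20] layout of the fixed ABI workitem ID VGPR.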
2854 const ArgDescriptor *OutgoingArg;
2855 const TargetRegisterClass *ArgRC;
2856 LLT Ty;
2857
2858 std::tie(OutgoingArg, ArgRC, Ty) =
2859 CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
2860 if (!OutgoingArg)
2861 std::tie(OutgoingArg, ArgRC, Ty) =
2862 CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
2863 if (!OutgoingArg)
2864 std::tie(OutgoingArg, ArgRC, Ty) =
2865 CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
2866 if (!OutgoingArg)
2867 return;
2868
2869 const ArgDescriptor *IncomingArgX = std::get<0>(
2870 CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X));
2871 const ArgDescriptor *IncomingArgY = std::get<0>(
2872 CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y));
2873 const ArgDescriptor *IncomingArgZ = std::get<0>(
2874 CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z));
2875
2876 SDValue InputReg;
2877 SDLoc SL;
2878
2879 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
2880 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
2881 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
2882
2883 // If the incoming IDs are not packed, we need to pack them.
2884 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
2885 NeedWorkItemIDX) {
2886 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
2887 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
2888 } else {
2889 InputReg = DAG.getConstant(0, DL, MVT::i32);
2890 }
2891 }
2892
2893 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
2894 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
2895 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
2896 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
2897 DAG.getShiftAmountConstant(10, MVT::i32, SL));
2898 InputReg = InputReg.getNode() ?
2899 DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y) : Y;
2900 }
2901
2902 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
2903 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
2904 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
2905 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
2906 DAG.getShiftAmountConstant(20, MVT::i32, SL));
2907 InputReg = InputReg.getNode() ?
2908 DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z) : Z;
2909 }
2910
2911 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
2912 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
2913 // We're in a situation where the outgoing function requires the workitem
2914 // ID, but the calling function does not have it (e.g. a graphics function
2915 // calling a C calling convention function). This is illegal, but we need
2916 // to produce something.
2917 InputReg = DAG.getUNDEF(MVT::i32);
2918 } else {
2919 // Workitem IDs are already packed; any of the present incoming arguments
2920 // will carry all required fields.
2921 ArgDescriptor IncomingArg = ArgDescriptor::createArg(
2922 IncomingArgX ? *IncomingArgX :
2923 IncomingArgY ? *IncomingArgY :
2924 *IncomingArgZ, ~0u);
2925 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
2926 }
2927 }
2928
2929 if (OutgoingArg->isRegister()) {
2930 if (InputReg)
2931 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
2932
2933 CCInfo.AllocateReg(OutgoingArg->getRegister());
2934 } else {
2935 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
2936 if (InputReg) {
2937 SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
2938 SpecialArgOffset);
2939 MemOpChains.push_back(ArgStore);
2940 }
2941 }
2942 }
2943
2944 static bool canGuaranteeTCO(CallingConv::ID CC) {
2945 return CC == CallingConv::Fast;
2946 }
2947
2948 /// Return true if we might ever do TCO for calls with this calling convention.
2949 static bool mayTailCallThisCC(CallingConv::ID CC) {
2950 switch (CC) {
2951 case CallingConv::C:
2952 case CallingConv::AMDGPU_Gfx:
2953 return true;
2954 default:
2955 return canGuaranteeTCO(CC);
2956 }
2957 }
2958
2959 bool SITargetLowering::isEligibleForTailCallOptimization(
2960 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
2961 const SmallVectorImpl<ISD::OutputArg> &Outs,
2962 const SmallVectorImpl<SDValue> &OutVals,
2963 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
2964 if (!mayTailCallThisCC(CalleeCC))
2965 return false;
2966
2967 // For a divergent call target, we need to do a waterfall loop over the
2968 // possible callees which precludes us from using a simple jump.
2969 if (Callee->isDivergent())
2970 return false;
2971
2972 MachineFunction &MF = DAG.getMachineFunction();
2973 const Function &CallerF = MF.getFunction();
2974 CallingConv::ID CallerCC = CallerF.getCallingConv();
2975 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
2976 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
2977
2978 // Kernels aren't callable, and don't have a live-in return address, so it
2979 // doesn't make sense to do a tail call with entry functions.
2980 if (!CallerPreserved)
2981 return false;
2982
2983 bool CCMatch = CallerCC == CalleeCC;
2984
2985 if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
2986 if (canGuaranteeTCO(CalleeCC) && CCMatch)
2987 return true;
2988 return false;
2989 }
2990
2991 // TODO: Can we handle var args?
2992 if (IsVarArg)
2993 return false;
2994
2995 for (const Argument &Arg : CallerF.args()) {
2996 if (Arg.hasByValAttr())
2997 return false;
2998 }
2999
3000 LLVMContext &Ctx = *DAG.getContext();
3001
3002 // Check that the call results are passed in the same way.
3003 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
3004 CCAssignFnForCall(CalleeCC, IsVarArg),
3005 CCAssignFnForCall(CallerCC, IsVarArg)))
3006 return false;
3007
3008 // The callee has to preserve all registers the caller needs to preserve.
3009 if (!CCMatch) {
3010 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3011 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3012 return false;
3013 }
3014
3015 // Nothing more to check if the callee is taking no arguments.
3016 if (Outs.empty())
3017 return true;
3018
3019 SmallVector<CCValAssign, 16> ArgLocs;
3020 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
3021
3022 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
3023
3024 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
3025 // If the stack arguments for this call do not fit into our own save area,
3026 // then the call cannot be made a tail call.
3027 // TODO: Is this really necessary?
3028 if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
3029 return false;
3030
3031 const MachineRegisterInfo &MRI = MF.getRegInfo();
3032 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
3033 }
3034
3035 bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3036 if (!CI->isTailCall())
3037 return false;
3038
3039 const Function *ParentFn = CI->getParent()->getParent();
3040 if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
3041 return false;
3042 return true;
3043 }
3044
3045 // The wave scratch offset register is used as the global base pointer.
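// At a high level, the DAG produced here is roughly
//   CALLSEQ_START -> argument copies/stores -> AMDGPUISD::CALL -> CALLSEQ_END
//     -> copies of the returned values
// with AMDGPUISD::TC_RETURN taking the place of the CALL node (and the result
// copies) when a tail call is possible.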
3046 SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
3047 SmallVectorImpl<SDValue> &InVals) const {
3048 SelectionDAG &DAG = CLI.DAG;
3049 const SDLoc &DL = CLI.DL;
3050 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
3051 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
3052 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
3053 SDValue Chain = CLI.Chain;
3054 SDValue Callee = CLI.Callee;
3055 bool &IsTailCall = CLI.IsTailCall;
3056 CallingConv::ID CallConv = CLI.CallConv;
3057 bool IsVarArg = CLI.IsVarArg;
3058 bool IsSibCall = false;
3059 bool IsThisReturn = false;
3060 MachineFunction &MF = DAG.getMachineFunction();
3061
3062 if (Callee.isUndef() || isNullConstant(Callee)) {
3063 if (!CLI.IsTailCall) {
3064 for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
3065 InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
3066 }
3067
3068 return Chain;
3069 }
3070
3071 if (IsVarArg) {
3072 return lowerUnhandledCall(CLI, InVals,
3073 "unsupported call to variadic function ");
3074 }
3075
3076 if (!CLI.CB)
3077 report_fatal_error("unsupported libcall legalization");
3078
3079 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
3080 return lowerUnhandledCall(CLI, InVals,
3081 "unsupported required tail call to function ");
3082 }
3083
3084 if (AMDGPU::isShader(CallConv)) {
3085 // Note the issue is with the CC of the called function, not of the call
3086 // itself.
3087 return lowerUnhandledCall(CLI, InVals,
3088 "unsupported call to a shader function ");
3089 }
3090
3091 if (AMDGPU::isShader(MF.getFunction().getCallingConv()) &&
3092 CallConv != CallingConv::AMDGPU_Gfx) {
3093 // Only allow calls with specific calling conventions.
3094 return lowerUnhandledCall(CLI, InVals,
3095 "unsupported calling convention for call from "
3096 "graphics shader of function ");
3097 }
3098
3099 if (IsTailCall) {
3100 IsTailCall = isEligibleForTailCallOptimization(
3101 Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
3102 if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall()) {
3103 report_fatal_error("failed to perform tail call elimination on a call "
3104 "site marked musttail");
3105 }
3106
3107 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3108
3109 // A sibling call is one where we're under the usual C ABI and not planning
3110 // to change that, but we can still do a tail call.
3111 if (!TailCallOpt && IsTailCall)
3112 IsSibCall = true;
3113
3114 if (IsTailCall)
3115 ++NumTailCalls;
3116 }
3117
3118 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3119 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3120 SmallVector<SDValue, 8> MemOpChains;
3121
3122 // Analyze operands of the call, assigning locations to each operand.
3123 SmallVector<CCValAssign, 16> ArgLocs;
3124 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
3125 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
3126
3127 if (CallConv != CallingConv::AMDGPU_Gfx) {
3128 // With a fixed ABI, allocate fixed registers before user arguments.
3129 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
3130 }
3131
3132 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
3133
3134 // Get a count of how many bytes are to be pushed on the stack.
3135 unsigned NumBytes = CCInfo.getNextStackOffset();
3136
3137 if (IsSibCall) {
3138 // Since we're not changing the ABI to make this a tail call, the memory
3139 // operands are already available in the caller's incoming argument space.
3140 NumBytes = 0;
3141 }
3142
3143 // FPDiff is the byte offset of the call's argument area from the callee's.
3144 // Stores to callee stack arguments will be placed in FixedStackSlots offset
3145 // by this amount for a tail call. In a sibling call it must be 0 because the
3146 // caller will deallocate the entire stack and the callee still expects its
3147 // arguments to begin at SP+0. Completely unused for non-tail calls.
3148 int32_t FPDiff = 0;
3149 MachineFrameInfo &MFI = MF.getFrameInfo();
3150
3151 // Adjust the stack pointer for the new arguments...
3152 // These operations are automatically eliminated by the prolog/epilog pass
3153 if (!IsSibCall) {
3154 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
3155
3156 if (!Subtarget->enableFlatScratch()) {
3157 SmallVector<SDValue, 4> CopyFromChains;
3158
3159 // In the HSA case, this should be an identity copy.
3160 SDValue ScratchRSrcReg
3161 = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
3162 RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
3163 CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
3164 Chain = DAG.getTokenFactor(DL, CopyFromChains);
3165 }
3166 }
3167
3168 MVT PtrVT = MVT::i32;
3169
3170 // Walk the register/memloc assignments, inserting copies/loads.
3171 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3172 CCValAssign &VA = ArgLocs[i];
3173 SDValue Arg = OutVals[i];
3174
3175 // Promote the value if needed.
3176 switch (VA.getLocInfo()) {
3177 case CCValAssign::Full:
3178 break;
3179 case CCValAssign::BCvt:
3180 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3181 break;
3182 case CCValAssign::ZExt:
3183 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3184 break;
3185 case CCValAssign::SExt:
3186 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3187 break;
3188 case CCValAssign::AExt:
3189 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3190 break;
3191 case CCValAssign::FPExt:
3192 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
3193 break;
3194 default:
3195 llvm_unreachable("Unknown loc info!");
3196 }
3197
3198 if (VA.isRegLoc()) {
3199 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
3200 } else {
3201 assert(VA.isMemLoc());
3202
3203 SDValue DstAddr;
3204 MachinePointerInfo DstInfo;
3205
3206 unsigned LocMemOffset = VA.getLocMemOffset();
3207 int32_t Offset = LocMemOffset;
3208
3209 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
3210 MaybeAlign Alignment;
3211
3212 if (IsTailCall) {
3213 ISD::ArgFlagsTy Flags = Outs[i].Flags;
3214 unsigned OpSize = Flags.isByVal() ?
3215 Flags.getByValSize() : VA.getValVT().getStoreSize();
3216
3217 // FIXME: We can have better than the minimum byval required alignment.
3218 Alignment =
3219 Flags.isByVal()
3220 ? Flags.getNonZeroByValAlign()
3221 : commonAlignment(Subtarget->getStackAlignment(), Offset);
3222
3223 Offset = Offset + FPDiff;
3224 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
3225
3226 DstAddr = DAG.getFrameIndex(FI, PtrVT);
3227 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
3228
3229 // Make sure any stack arguments overlapping with where we're storing
3230 // are loaded before this eventual operation. Otherwise they'll be
3231 // clobbered.
3232
3233 // FIXME: Why is this really necessary? This seems to just result in a
3234 // lot of code to copy the stack and write them back to the same
3235 // locations, which are supposed to be immutable?
3236 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
3237 } else {
3238 // Stores to the argument stack area are relative to the stack pointer.
3239 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
3240 MVT::i32);
3241 DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
3242 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
3243 Alignment =
3244 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
3245 }
3246
3247 if (Outs[i].Flags.isByVal()) {
3248 SDValue SizeNode =
3249 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
3250 SDValue Cpy =
3251 DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
3252 Outs[i].Flags.getNonZeroByValAlign(),
3253 /*isVol = */ false, /*AlwaysInline = */ true,
3254 /*isTailCall = */ false, DstInfo,
3255 MachinePointerInfo(AMDGPUAS::PRIVATE_ADDRESS));
3256
3257 MemOpChains.push_back(Cpy);
3258 } else {
3259 SDValue Store =
3260 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
3261 MemOpChains.push_back(Store);
3262 }
3263 }
3264 }
3265
3266 if (!MemOpChains.empty())
3267 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
3268
3269 // Build a sequence of copy-to-reg nodes chained together with token chain
3270 // and flag operands which copy the outgoing args into the appropriate regs.
3271 SDValue InFlag;
3272 for (auto &RegToPass : RegsToPass) {
3273 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
3274 RegToPass.second, InFlag);
3275 InFlag = Chain.getValue(1);
3276 }
3277
3278
3279 // We don't usually want to end the call-sequence here because we would tidy
3280 // the frame up *after* the call, however in the ABI-changing tail-call case
3281 // we've carefully laid out the parameters so that when sp is reset they'll be
3282 // in the correct location.
3283 if (IsTailCall && !IsSibCall) {
3284 Chain = DAG.getCALLSEQ_END(Chain,
3285 DAG.getTargetConstant(NumBytes, DL, MVT::i32),
3286 DAG.getTargetConstant(0, DL, MVT::i32),
3287 InFlag, DL);
3288 InFlag = Chain.getValue(1);
3289 }
3290
3291 std::vector<SDValue> Ops;
3292 Ops.push_back(Chain);
3293 Ops.push_back(Callee);
3294 // Add a redundant copy of the callee global which will not be legalized, as
3295 // we need direct access to the callee later.
3296 if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Callee)) {
3297 const GlobalValue *GV = GSD->getGlobal();
3298 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
3299 } else {
3300 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
3301 }
3302
3303 if (IsTailCall) {
3304 // Each tail call may have to adjust the stack by a different amount, so
3305 // this information must travel along with the operation for eventual
3306 // consumption by emitEpilogue.
3307 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
3308 }
3309
3310 // Add argument registers to the end of the list so that they are known live
3311 // into the call.
3312 for (auto &RegToPass : RegsToPass) {
3313 Ops.push_back(DAG.getRegister(RegToPass.first,
3314 RegToPass.second.getValueType()));
3315 }
3316
3317 // Add a register mask operand representing the call-preserved registers.
3318
3319 auto *TRI = static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo());
3320 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
3321 assert(Mask && "Missing call preserved mask for calling convention");
3322 Ops.push_back(DAG.getRegisterMask(Mask));
3323
3324 if (InFlag.getNode())
3325 Ops.push_back(InFlag);
3326
3327 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3328
3329 // If we're doing a tail call, use a TC_RETURN here rather than an
3330 // actual call instruction.
3331 if (IsTailCall) {
3332 MFI.setHasTailCall();
3333 return DAG.getNode(AMDGPUISD::TC_RETURN, DL, NodeTys, Ops);
3334 }
3335
3336 // Returns a chain and a flag for retval copy to use.
3337 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
3338 Chain = Call.getValue(0);
3339 InFlag = Call.getValue(1);
3340
3341 uint64_t CalleePopBytes = NumBytes;
3342 Chain = DAG.getCALLSEQ_END(Chain, DAG.getTargetConstant(0, DL, MVT::i32),
3343 DAG.getTargetConstant(CalleePopBytes, DL, MVT::i32),
3344 InFlag, DL);
3345 if (!Ins.empty())
3346 InFlag = Chain.getValue(1);
3347
3348 // Handle result values, copying them out of physregs into vregs that we
3349 // return.
3350 return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
3351 InVals, IsThisReturn,
3352 IsThisReturn ? OutVals[0] : SDValue());
3353 }
3354
3355 // This is identical to the default implementation in ExpandDYNAMIC_STACKALLOC,
3356 // except for applying the wave size scale to the increment amount.
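// Sketch of the scaling: the IR-level (per-lane) allocation size is shifted
// left by log2(wavefront size) before being applied to the wave-level stack
// pointer, e.g. on a wave64 target
//   ScaledSize = Size << 6;  // a 16-byte per-lane alloca consumes 1024 bytes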
3357 SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(
3358 SDValue Op, SelectionDAG &DAG) const {
3359 const MachineFunction &MF = DAG.getMachineFunction();
3360 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3361
3362 SDLoc dl(Op);
3363 EVT VT = Op.getValueType();
3364 SDValue Tmp1 = Op;
3365 SDValue Tmp2 = Op.getValue(1);
3366 SDValue Tmp3 = Op.getOperand(2);
3367 SDValue Chain = Tmp1.getOperand(0);
3368
3369 Register SPReg = Info->getStackPtrOffsetReg();
3370
3371 // Chain the dynamic stack allocation so that it doesn't modify the stack
3372 // pointer when other instructions are using the stack.
3373 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
3374
3375 SDValue Size = Tmp2.getOperand(1);
3376 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
3377 Chain = SP.getValue(1);
3378 MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue();
3379 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
3380 const TargetFrameLowering *TFL = ST.getFrameLowering();
3381 unsigned Opc =
3382 TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp ?
3383 ISD::ADD : ISD::SUB;
3384
3385 SDValue ScaledSize = DAG.getNode(
3386 ISD::SHL, dl, VT, Size,
3387 DAG.getConstant(ST.getWavefrontSizeLog2(), dl, MVT::i32));
3388
3389 Align StackAlign = TFL->getStackAlign();
3390 Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize); // Value
3391 if (Alignment && *Alignment > StackAlign) {
3392 Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
3393 DAG.getConstant(-(uint64_t)Alignment->value()
3394 << ST.getWavefrontSizeLog2(),
3395 dl, VT));
3396 }
3397
3398 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
3399 Tmp2 = DAG.getCALLSEQ_END(
3400 Chain, DAG.getIntPtrConstant(0, dl, true),
3401 DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
3402
3403 return DAG.getMergeValues({Tmp1, Tmp2}, dl);
3404 }
3405
3406 SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
3407 SelectionDAG &DAG) const {
3408 // We only handle constant sizes here to allow non-entry block, static sized
3409 // allocas. A truly dynamic value is more difficult to support because we
3410 // don't know if the size value is uniform or not. If the size isn't uniform,
3411 // we would need to do a wave reduction to get the maximum size to know how
3412 // much to increment the uniform stack pointer.
3413 SDValue Size = Op.getOperand(1);
3414 if (isa<ConstantSDNode>(Size))
3415 return lowerDYNAMIC_STACKALLOCImpl(Op, DAG); // Use "generic" expansion.
3416
3417 return AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(Op, DAG);
3418 }
3419
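// This hook backs the llvm.read_register / llvm.write_register intrinsics for
// named physical registers, e.g. (sketch):
//   %exec = call i64 @llvm.read_register.i64(metadata !"exec")
// Only the handful of names matched below are accepted.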
3420 Register SITargetLowering::getRegisterByName(const char* RegName, LLT VT,
3421 const MachineFunction &MF) const {
3422 Register Reg = StringSwitch<Register>(RegName)
3423 .Case("m0", AMDGPU::M0)
3424 .Case("exec", AMDGPU::EXEC)
3425 .Case("exec_lo", AMDGPU::EXEC_LO)
3426 .Case("exec_hi", AMDGPU::EXEC_HI)
3427 .Case("flat_scratch", AMDGPU::FLAT_SCR)
3428 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
3429 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
3430 .Default(Register());
3431
3432 if (Reg == AMDGPU::NoRegister) {
3433 report_fatal_error(Twine("invalid register name \""
3434 + StringRef(RegName) + "\"."));
3435
3436 }
3437
3438 if (!Subtarget->hasFlatScrRegister() &&
3439 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
3440 report_fatal_error(Twine("invalid register \""
3441 + StringRef(RegName) + "\" for subtarget."));
3442 }
3443
3444 switch (Reg) {
3445 case AMDGPU::M0:
3446 case AMDGPU::EXEC_LO:
3447 case AMDGPU::EXEC_HI:
3448 case AMDGPU::FLAT_SCR_LO:
3449 case AMDGPU::FLAT_SCR_HI:
3450 if (VT.getSizeInBits() == 32)
3451 return Reg;
3452 break;
3453 case AMDGPU::EXEC:
3454 case AMDGPU::FLAT_SCR:
3455 if (VT.getSizeInBits() == 64)
3456 return Reg;
3457 break;
3458 default:
3459 llvm_unreachable("missing register type checking");
3460 }
3461
3462 report_fatal_error(Twine("invalid type for register \""
3463 + StringRef(RegName) + "\"."));
3464 }
3465
3466 // If kill is not the last instruction, split the block so kill is always a
3467 // proper terminator.
3468 MachineBasicBlock *
3469 SITargetLowering::splitKillBlock(MachineInstr &MI,
3470 MachineBasicBlock *BB) const {
3471 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
3472 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3473 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
3474 return SplitBB;
3475 }
3476
3477 // Split block \p MBB at \p MI, so as to insert a loop. If \p InstInLoop is true,
3478 // \p MI will be the only instruction in the loop body block. Otherwise, it will
3479 // be the first instruction in the remainder block.
3480 //
3481 /// \returns { LoopBody, Remainder }
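// The resulting CFG is roughly:
//   MBB -> LoopBB -> RemainderBB, with a LoopBB -> LoopBB back edge, and
//   RemainderBB inheriting MBB's original successors.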
3482 static std::pair<MachineBasicBlock *, MachineBasicBlock *>
3483 splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop) {
3484 MachineFunction *MF = MBB.getParent();
3485 MachineBasicBlock::iterator I(&MI);
3486
3487 // To insert the loop we need to split the block. Move everything after this
3488 // point to a new block, and insert a new empty block between the two.
3489 MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
3490 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
3491 MachineFunction::iterator MBBI(MBB);
3492 ++MBBI;
3493
3494 MF->insert(MBBI, LoopBB);
3495 MF->insert(MBBI, RemainderBB);
3496
3497 LoopBB->addSuccessor(LoopBB);
3498 LoopBB->addSuccessor(RemainderBB);
3499
3500 // Move the rest of the block into a new block.
3501 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
3502
3503 if (InstInLoop) {
3504 auto Next = std::next(I);
3505
3506 // Move instruction to loop body.
3507 LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
3508
3509 // Move the rest of the block.
3510 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
3511 } else {
3512 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
3513 }
3514
3515 MBB.addSuccessor(LoopBB);
3516
3517 return std::make_pair(LoopBB, RemainderBB);
3518 }
3519
3520 /// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
3521 void SITargetLowering::bundleInstWithWaitcnt(MachineInstr &MI) const {
3522 MachineBasicBlock *MBB = MI.getParent();
3523 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3524 auto I = MI.getIterator();
3525 auto E = std::next(I);
3526
3527 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
3528 .addImm(0);
3529
3530 MIBundleBuilder Bundler(*MBB, I, E);
3531 finalizeBundle(*MBB, Bundler.begin());
3532 }
3533
3534 MachineBasicBlock *
3535 SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI,
3536 MachineBasicBlock *BB) const {
3537 const DebugLoc &DL = MI.getDebugLoc();
3538
3539 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
3540
3541 MachineBasicBlock *LoopBB;
3542 MachineBasicBlock *RemainderBB;
3543 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3544
3545 // Apparently kill flags are only valid if the def is in the same block?
3546 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
3547 Src->setIsKill(false);
3548
3549 std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, *BB, true);
3550
3551 MachineBasicBlock::iterator I = LoopBB->end();
3552
3553 const unsigned EncodedReg = AMDGPU::Hwreg::encodeHwreg(
3554 AMDGPU::Hwreg::ID_TRAPSTS, AMDGPU::Hwreg::OFFSET_MEM_VIOL, 1);
3555
3556 // Clear TRAP_STS.MEM_VIOL
3557 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
3558 .addImm(0)
3559 .addImm(EncodedReg);
3560
3561 bundleInstWithWaitcnt(MI);
3562
3563 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3564
3565 // Load and check TRAP_STS.MEM_VIOL
3566 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
3567 .addImm(EncodedReg);
3568
3569 // FIXME: Do we need to use an isel pseudo that may clobber scc?
3570 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
3571 .addReg(Reg, RegState::Kill)
3572 .addImm(0);
3573 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
3574 .addMBB(LoopBB);
3575
3576 return RemainderBB;
3577 }
3578
3579 // Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
3580 // wavefront. If the value is uniform and just happens to be in a VGPR, this
3581 // will only do one iteration. In the worst case, this will loop 64 times.
3582 //
3583 // TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
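// The emitted waterfall loop is roughly (wave64; wave32 uses the _B32 forms):
//   loop:
//     s_idx = v_readfirstlane_b32 idx
//     cond  = v_cmp_eq_u32 s_idx, idx
//     saved = s_and_saveexec_b64 cond     ; exec := lanes holding this index
//     m0 / SGPRIdxReg = s_idx (+ offset)
//     ... indexed access inserted by the caller ...
//     exec  = s_xor_b64 exec, saved       ; clear the lanes just handled
//     s_cbranch_execnz loop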
3584 static MachineBasicBlock::iterator
3585 emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI,
3586 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
3587 const DebugLoc &DL, const MachineOperand &Idx,
3588 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
3589 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
3590 Register &SGPRIdxReg) {
3591
3592 MachineFunction *MF = OrigBB.getParent();
3593 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3594 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3595 MachineBasicBlock::iterator I = LoopBB.begin();
3596
3597 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
3598 Register PhiExec = MRI.createVirtualRegister(BoolRC);
3599 Register NewExec = MRI.createVirtualRegister(BoolRC);
3600 Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3601 Register CondReg = MRI.createVirtualRegister(BoolRC);
3602
3603 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
3604 .addReg(InitReg)
3605 .addMBB(&OrigBB)
3606 .addReg(ResultReg)
3607 .addMBB(&LoopBB);
3608
3609 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
3610 .addReg(InitSaveExecReg)
3611 .addMBB(&OrigBB)
3612 .addReg(NewExec)
3613 .addMBB(&LoopBB);
3614
3615 // Read the next variant <- also loop target.
3616 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
3617 .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
3618
3619 // Compare the just read M0 value to all possible Idx values.
3620 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
3621 .addReg(CurrentIdxReg)
3622 .addReg(Idx.getReg(), 0, Idx.getSubReg());
3623
3624 // Update EXEC, save the original EXEC value to VCC.
3625 BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
3626 : AMDGPU::S_AND_SAVEEXEC_B64),
3627 NewExec)
3628 .addReg(CondReg, RegState::Kill);
3629
3630 MRI.setSimpleHint(NewExec, CondReg);
3631
3632 if (UseGPRIdxMode) {
3633 if (Offset == 0) {
3634 SGPRIdxReg = CurrentIdxReg;
3635 } else {
3636 SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
3637 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
3638 .addReg(CurrentIdxReg, RegState::Kill)
3639 .addImm(Offset);
3640 }
3641 } else {
3642 // Move index from VCC into M0
3643 if (Offset == 0) {
3644 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
3645 .addReg(CurrentIdxReg, RegState::Kill);
3646 } else {
3647 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
3648 .addReg(CurrentIdxReg, RegState::Kill)
3649 .addImm(Offset);
3650 }
3651 }
3652
3653 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
3654 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
3655 MachineInstr *InsertPt =
3656 BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
3657 : AMDGPU::S_XOR_B64_term), Exec)
3658 .addReg(Exec)
3659 .addReg(NewExec);
3660
3661 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
3662 // s_cbranch_scc0?
3663
3664 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
3665 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
3666 .addMBB(&LoopBB);
3667
3668 return InsertPt->getIterator();
3669 }
3670
3671 // This has slightly sub-optimal regalloc when the source vector is killed by
3672 // the read. The register allocator does not understand that the kill is
3673 // per-workitem, so the vector is kept alive for the whole loop, and we end up
3674 // not re-using a subregister from it, using 1 more VGPR than necessary. The
3675 // extra VGPR was not needed back when this was expanded after register allocation.
3676 static MachineBasicBlock::iterator
3677 loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI,
3678 unsigned InitResultReg, unsigned PhiReg, int Offset,
3679 bool UseGPRIdxMode, Register &SGPRIdxReg) {
3680 MachineFunction *MF = MBB.getParent();
3681 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
3682 const SIRegisterInfo *TRI = ST.getRegisterInfo();
3683 MachineRegisterInfo &MRI = MF->getRegInfo();
3684 const DebugLoc &DL = MI.getDebugLoc();
3685 MachineBasicBlock::iterator I(&MI);
3686
3687 const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
3688 Register DstReg = MI.getOperand(0).getReg();
3689 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
3690 Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
3691 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
3692 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
3693
3694 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
3695
3696 // Save the EXEC mask
3697 BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec)
3698 .addReg(Exec);
3699
3700 MachineBasicBlock *LoopBB;
3701 MachineBasicBlock *RemainderBB;
3702 std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, MBB, false);
3703
3704 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3705
3706 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
3707 InitResultReg, DstReg, PhiReg, TmpExec,
3708 Offset, UseGPRIdxMode, SGPRIdxReg);
3709
3710 MachineBasicBlock* LandingPad = MF->CreateMachineBasicBlock();
3711 MachineFunction::iterator MBBI(LoopBB);
3712 ++MBBI;
3713 MF->insert(MBBI, LandingPad);
3714 LoopBB->removeSuccessor(RemainderBB);
3715 LandingPad->addSuccessor(RemainderBB);
3716 LoopBB->addSuccessor(LandingPad);
3717 MachineBasicBlock::iterator First = LandingPad->begin();
3718 BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec)
3719 .addReg(SaveExec);
3720
3721 return InsPt;
3722 }
3723
3724 // Returns subreg index, offset
3725 static std::pair<unsigned, int>
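// For example (sketch): indexing a 128-bit (v4) vector with a constant offset
// of 2 yields {sub2, 0}, i.e. a direct subregister access, while an
// out-of-bounds offset such as 7 is returned unchanged as {sub0, 7}.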
3726 computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
3727 const TargetRegisterClass *SuperRC,
3728 unsigned VecReg,
3729 int Offset) {
3730 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
3731
3732 // Skip out of bounds offsets, or else we would end up using an undefined
3733 // register.
3734 if (Offset >= NumElts || Offset < 0)
3735 return std::make_pair(AMDGPU::sub0, Offset);
3736
3737 return std::make_pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
3738 }
3739
3740 static void setM0ToIndexFromSGPR(const SIInstrInfo *TII,
3741 MachineRegisterInfo &MRI, MachineInstr &MI,
3742 int Offset) {
3743 MachineBasicBlock *MBB = MI.getParent();
3744 const DebugLoc &DL = MI.getDebugLoc();
3745 MachineBasicBlock::iterator I(&MI);
3746
3747 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3748
3749 assert(Idx->getReg() != AMDGPU::NoRegister);
3750
3751 if (Offset == 0) {
3752 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0).add(*Idx);
3753 } else {
3754 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
3755 .add(*Idx)
3756 .addImm(Offset);
3757 }
3758 }
3759
3760 static Register getIndirectSGPRIdx(const SIInstrInfo *TII,
3761 MachineRegisterInfo &MRI, MachineInstr &MI,
3762 int Offset) {
3763 MachineBasicBlock *MBB = MI.getParent();
3764 const DebugLoc &DL = MI.getDebugLoc();
3765 MachineBasicBlock::iterator I(&MI);
3766
3767 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3768
3769 if (Offset == 0)
3770 return Idx->getReg();
3771
3772 Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
3773 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
3774 .add(*Idx)
3775 .addImm(Offset);
3776 return Tmp;
3777 }
3778
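// Lower an indirect vector-element read. Roughly: a uniform (SGPR) index is
// handled inline, either through V_MOVRELS with M0 or through the GPR-index
// mode pseudos; a divergent (VGPR) index falls back to the waterfall loop
// built by loadM0FromVGPR above.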
3779 static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
3780 MachineBasicBlock &MBB,
3781 const GCNSubtarget &ST) {
3782 const SIInstrInfo *TII = ST.getInstrInfo();
3783 const SIRegisterInfo &TRI = TII->getRegisterInfo();
3784 MachineFunction *MF = MBB.getParent();
3785 MachineRegisterInfo &MRI = MF->getRegInfo();
3786
3787 Register Dst = MI.getOperand(0).getReg();
3788 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3789 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
3790 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
3791
3792 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
3793 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
3794
3795 unsigned SubReg;
3796 std::tie(SubReg, Offset)
3797 = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
3798
3799 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
3800
3801 // Check for a SGPR index.
3802 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
3803 MachineBasicBlock::iterator I(&MI);
3804 const DebugLoc &DL = MI.getDebugLoc();
3805
3806 if (UseGPRIdxMode) {
3807 // TODO: Look at the uses to avoid the copy. This may require rescheduling
3808 // to avoid interfering with other uses, so probably requires a new
3809 // optimization pass.
3810 Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);
3811
3812 const MCInstrDesc &GPRIDXDesc =
3813 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
3814 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
3815 .addReg(SrcReg)
3816 .addReg(Idx)
3817 .addImm(SubReg);
3818 } else {
3819 setM0ToIndexFromSGPR(TII, MRI, MI, Offset);
3820
3821 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
3822 .addReg(SrcReg, 0, SubReg)
3823 .addReg(SrcReg, RegState::Implicit);
3824 }
3825
3826 MI.eraseFromParent();
3827
3828 return &MBB;
3829 }
3830
3831 // Control flow needs to be inserted if indexing with a VGPR.
3832 const DebugLoc &DL = MI.getDebugLoc();
3833 MachineBasicBlock::iterator I(&MI);
3834
3835 Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3836 Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3837
3838 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
3839
3840 Register SGPRIdxReg;
3841 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
3842 UseGPRIdxMode, SGPRIdxReg);
3843
3844 MachineBasicBlock *LoopBB = InsPt->getParent();
3845
3846 if (UseGPRIdxMode) {
3847 const MCInstrDesc &GPRIDXDesc =
3848 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
3849
3850 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
3851 .addReg(SrcReg)
3852 .addReg(SGPRIdxReg)
3853 .addImm(SubReg);
3854 } else {
3855 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
3856 .addReg(SrcReg, 0, SubReg)
3857 .addReg(SrcReg, RegState::Implicit);
3858 }
3859
3860 MI.eraseFromParent();
3861
3862 return LoopBB;
3863 }
3864
3865 static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
3866 MachineBasicBlock &MBB,
3867 const GCNSubtarget &ST) {
3868 const SIInstrInfo *TII = ST.getInstrInfo();
3869 const SIRegisterInfo &TRI = TII->getRegisterInfo();
3870 MachineFunction *MF = MBB.getParent();
3871 MachineRegisterInfo &MRI = MF->getRegInfo();
3872
3873 Register Dst = MI.getOperand(0).getReg();
3874 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
3875 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
3876 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
3877 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
3878 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
3879 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
3880
3881 // This can be an immediate, but will be folded later.
3882 assert(Val->getReg());
3883
3884 unsigned SubReg;
3885 std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
3886 SrcVec->getReg(),
3887 Offset);
3888 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
3889
3890 if (Idx->getReg() == AMDGPU::NoRegister) {
3891 MachineBasicBlock::iterator I(&MI);
3892 const DebugLoc &DL = MI.getDebugLoc();
3893
3894 assert(Offset == 0);
3895
3896 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
3897 .add(*SrcVec)
3898 .add(*Val)
3899 .addImm(SubReg);
3900
3901 MI.eraseFromParent();
3902 return &MBB;
3903 }
3904
3905 // Check for a SGPR index.
3906 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
3907 MachineBasicBlock::iterator I(&MI);
3908 const DebugLoc &DL = MI.getDebugLoc();
3909
3910 if (UseGPRIdxMode) {
3911 Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);
3912
3913 const MCInstrDesc &GPRIDXDesc =
3914 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
3915 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
3916 .addReg(SrcVec->getReg())
3917 .add(*Val)
3918 .addReg(Idx)
3919 .addImm(SubReg);
3920 } else {
3921 setM0ToIndexFromSGPR(TII, MRI, MI, Offset);
3922
3923 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
3924 TRI.getRegSizeInBits(*VecRC), 32, false);
3925 BuildMI(MBB, I, DL, MovRelDesc, Dst)
3926 .addReg(SrcVec->getReg())
3927 .add(*Val)
3928 .addImm(SubReg);
3929 }
3930 MI.eraseFromParent();
3931 return &MBB;
3932 }
3933
3934 // Control flow needs to be inserted if indexing with a VGPR.
3935 if (Val->isReg())
3936 MRI.clearKillFlags(Val->getReg());
3937
3938 const DebugLoc &DL = MI.getDebugLoc();
3939
3940 Register PhiReg = MRI.createVirtualRegister(VecRC);
3941
3942 Register SGPRIdxReg;
3943 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
3944 UseGPRIdxMode, SGPRIdxReg);
3945 MachineBasicBlock *LoopBB = InsPt->getParent();
3946
3947 if (UseGPRIdxMode) {
3948 const MCInstrDesc &GPRIDXDesc =
3949 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
3950
3951 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
3952 .addReg(PhiReg)
3953 .add(*Val)
3954 .addReg(SGPRIdxReg)
3955 .addImm(AMDGPU::sub0);
3956 } else {
3957 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
3958 TRI.getRegSizeInBits(*VecRC), 32, false);
3959 BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
3960 .addReg(PhiReg)
3961 .add(*Val)
3962 .addImm(AMDGPU::sub0);
3963 }
3964
3965 MI.eraseFromParent();
3966 return LoopBB;
3967 }
3968
3969 MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
3970 MachineInstr &MI, MachineBasicBlock *BB) const {
3971
3972 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
3973 MachineFunction *MF = BB->getParent();
3974 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
3975
3976 switch (MI.getOpcode()) {
3977 case AMDGPU::S_UADDO_PSEUDO:
3978 case AMDGPU::S_USUBO_PSEUDO: {
3979 const DebugLoc &DL = MI.getDebugLoc();
3980 MachineOperand &Dest0 = MI.getOperand(0);
3981 MachineOperand &Dest1 = MI.getOperand(1);
3982 MachineOperand &Src0 = MI.getOperand(2);
3983 MachineOperand &Src1 = MI.getOperand(3);
3984
3985 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
3986 ? AMDGPU::S_ADD_I32
3987 : AMDGPU::S_SUB_I32;
3988 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg()).add(Src0).add(Src1);
3989
3990 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg())
3991 .addImm(1)
3992 .addImm(0);
3993
3994 MI.eraseFromParent();
3995 return BB;
3996 }
3997 case AMDGPU::S_ADD_U64_PSEUDO:
3998 case AMDGPU::S_SUB_U64_PSEUDO: {
3999 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
4000 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4001 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4002 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
4003 const DebugLoc &DL = MI.getDebugLoc();
4004
4005 MachineOperand &Dest = MI.getOperand(0);
4006 MachineOperand &Src0 = MI.getOperand(1);
4007 MachineOperand &Src1 = MI.getOperand(2);
4008
4009 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4010 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4011
4012 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
4013 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
4014 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
4015 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
4016
4017 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
4018 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
4019 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
4020 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
4021
4022 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
4023
4024 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
4025 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
4026 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0);
4027 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1);
4028 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
4029 .addReg(DestSub0)
4030 .addImm(AMDGPU::sub0)
4031 .addReg(DestSub1)
4032 .addImm(AMDGPU::sub1);
4033 MI.eraseFromParent();
4034 return BB;
4035 }
4036 case AMDGPU::V_ADD_U64_PSEUDO:
4037 case AMDGPU::V_SUB_U64_PSEUDO: {
4038 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
4039 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4040 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4041 const DebugLoc &DL = MI.getDebugLoc();
4042
4043 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
4044
4045 MachineOperand &Dest = MI.getOperand(0);
4046 MachineOperand &Src0 = MI.getOperand(1);
4047 MachineOperand &Src1 = MI.getOperand(2);
4048
4049 if (IsAdd && ST.hasLshlAddB64()) {
4050 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
4051 Dest.getReg())
4052 .add(Src0)
4053 .addImm(0)
4054 .add(Src1);
4055 TII->legalizeOperands(*Add);
4056 MI.eraseFromParent();
4057 return BB;
4058 }
4059
4060 const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
4061
4062 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4063 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4064
4065 Register CarryReg = MRI.createVirtualRegister(CarryRC);
4066 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
4067
4068 const TargetRegisterClass *Src0RC = Src0.isReg()
4069 ? MRI.getRegClass(Src0.getReg())
4070 : &AMDGPU::VReg_64RegClass;
4071 const TargetRegisterClass *Src1RC = Src1.isReg()
4072 ? MRI.getRegClass(Src1.getReg())
4073 : &AMDGPU::VReg_64RegClass;
4074
4075 const TargetRegisterClass *Src0SubRC =
4076 TRI->getSubRegClass(Src0RC, AMDGPU::sub0);
4077 const TargetRegisterClass *Src1SubRC =
4078 TRI->getSubRegClass(Src1RC, AMDGPU::sub1);
4079
4080 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
4081 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
4082 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
4083 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
4084
4085 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
4086 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
4087 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
4088 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
4089
4090 unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
4091 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
4092 .addReg(CarryReg, RegState::Define)
4093 .add(SrcReg0Sub0)
4094 .add(SrcReg1Sub0)
4095 .addImm(0); // clamp bit
4096
4097 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
4098 MachineInstr *HiHalf =
4099 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
4100 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
4101 .add(SrcReg0Sub1)
4102 .add(SrcReg1Sub1)
4103 .addReg(CarryReg, RegState::Kill)
4104 .addImm(0); // clamp bit
4105
4106 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
4107 .addReg(DestSub0)
4108 .addImm(AMDGPU::sub0)
4109 .addReg(DestSub1)
4110 .addImm(AMDGPU::sub1);
4111 TII->legalizeOperands(*LoHalf);
4112 TII->legalizeOperands(*HiHalf);
4113 MI.eraseFromParent();
4114 return BB;
4115 }
4116 case AMDGPU::S_ADD_CO_PSEUDO:
4117 case AMDGPU::S_SUB_CO_PSEUDO: {
4118 // This pseudo can only be selected from a uniform
4119 // add/subcarry node, so all of the VGPR operands
4120 // are assumed to be splat vectors.
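// Rough shape of the expansion below: splat VGPR operands are moved to SGPRs
// with v_readfirstlane_b32, the carry-in is compared against zero to set SCC,
// s_addc_u32 / s_subb_u32 produces the sum, and s_cselect materializes the
// carry-out.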
4121 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
4122 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4123 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4124 MachineBasicBlock::iterator MII = MI;
4125 const DebugLoc &DL = MI.getDebugLoc();
4126 MachineOperand &Dest = MI.getOperand(0);
4127 MachineOperand &CarryDest = MI.getOperand(1);
4128 MachineOperand &Src0 = MI.getOperand(2);
4129 MachineOperand &Src1 = MI.getOperand(3);
4130 MachineOperand &Src2 = MI.getOperand(4);
4131 unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
4132 ? AMDGPU::S_ADDC_U32
4133 : AMDGPU::S_SUBB_U32;
4134 if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
4135 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4136 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
4137 .addReg(Src0.getReg());
4138 Src0.setReg(RegOp0);
4139 }
4140 if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
4141 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4142 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
4143 .addReg(Src1.getReg());
4144 Src1.setReg(RegOp1);
4145 }
4146 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4147 if (TRI->isVectorRegister(MRI, Src2.getReg())) {
4148 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
4149 .addReg(Src2.getReg());
4150 Src2.setReg(RegOp2);
4151 }
4152
4153 const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
4154 unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
4155 assert(WaveSize == 64 || WaveSize == 32);
4156
4157 if (WaveSize == 64) {
4158 if (ST.hasScalarCompareEq64()) {
4159 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
4160 .addReg(Src2.getReg())
4161 .addImm(0);
4162 } else {
4163 const TargetRegisterClass *SubRC =
4164 TRI->getSubRegClass(Src2RC, AMDGPU::sub0);
4165 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
4166 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
4167 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
4168 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
4169 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4170
4171 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
4172 .add(Src2Sub0)
4173 .add(Src2Sub1);
4174
4175 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
4176 .addReg(Src2_32, RegState::Kill)
4177 .addImm(0);
4178 }
4179 } else {
4180 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMPK_LG_U32))
4181 .addReg(Src2.getReg())
4182 .addImm(0);
4183 }
4184
4185 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg()).add(Src0).add(Src1);
4186
4187 unsigned SelOpc =
4188 (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
4189
4190 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
4191 .addImm(-1)
4192 .addImm(0);
4193
4194 MI.eraseFromParent();
4195 return BB;
4196 }
4197 case AMDGPU::SI_INIT_M0: {
4198 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
4199 TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
4200 .add(MI.getOperand(0));
4201 MI.eraseFromParent();
4202 return BB;
4203 }
4204 case AMDGPU::GET_GROUPSTATICSIZE: {
4205 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
4206 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
4207 DebugLoc DL = MI.getDebugLoc();
4208 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
4209 .add(MI.getOperand(0))
4210 .addImm(MFI->getLDSSize());
4211 MI.eraseFromParent();
4212 return BB;
4213 }
4214 case AMDGPU::SI_INDIRECT_SRC_V1:
4215 case AMDGPU::SI_INDIRECT_SRC_V2:
4216 case AMDGPU::SI_INDIRECT_SRC_V4:
4217 case AMDGPU::SI_INDIRECT_SRC_V8:
4218 case AMDGPU::SI_INDIRECT_SRC_V16:
4219 case AMDGPU::SI_INDIRECT_SRC_V32:
4220 return emitIndirectSrc(MI, *BB, *getSubtarget());
4221 case AMDGPU::SI_INDIRECT_DST_V1:
4222 case AMDGPU::SI_INDIRECT_DST_V2:
4223 case AMDGPU::SI_INDIRECT_DST_V4:
4224 case AMDGPU::SI_INDIRECT_DST_V8:
4225 case AMDGPU::SI_INDIRECT_DST_V16:
4226 case AMDGPU::SI_INDIRECT_DST_V32:
4227 return emitIndirectDst(MI, *BB, *getSubtarget());
4228 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
4229 case AMDGPU::SI_KILL_I1_PSEUDO:
4230 return splitKillBlock(MI, BB);
4231 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
4232 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
4233 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4234 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4235
4236 Register Dst = MI.getOperand(0).getReg();
4237 Register Src0 = MI.getOperand(1).getReg();
4238 Register Src1 = MI.getOperand(2).getReg();
4239 const DebugLoc &DL = MI.getDebugLoc();
4240 Register SrcCond = MI.getOperand(3).getReg();
4241
4242 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4243 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4244 const auto *CondRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
4245 Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
4246
4247 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy)
4248 .addReg(SrcCond);
4249 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
4250 .addImm(0)
4251 .addReg(Src0, 0, AMDGPU::sub0)
4252 .addImm(0)
4253 .addReg(Src1, 0, AMDGPU::sub0)
4254 .addReg(SrcCondCopy);
4255 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
4256 .addImm(0)
4257 .addReg(Src0, 0, AMDGPU::sub1)
4258 .addImm(0)
4259 .addReg(Src1, 0, AMDGPU::sub1)
4260 .addReg(SrcCondCopy);
4261
4262 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
4263 .addReg(DstLo)
4264 .addImm(AMDGPU::sub0)
4265 .addReg(DstHi)
4266 .addImm(AMDGPU::sub1);
4267 MI.eraseFromParent();
4268 return BB;
4269 }
4270 case AMDGPU::SI_BR_UNDEF: {
4271 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4272 const DebugLoc &DL = MI.getDebugLoc();
4273 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
4274 .add(MI.getOperand(0));
4275 Br->getOperand(1).setIsUndef(true); // read undef SCC
4276 MI.eraseFromParent();
4277 return BB;
4278 }
4279 case AMDGPU::ADJCALLSTACKUP:
4280 case AMDGPU::ADJCALLSTACKDOWN: {
4281 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
4282 MachineInstrBuilder MIB(*MF, &MI);
4283 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
4284 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
4285 return BB;
4286 }
4287 case AMDGPU::SI_CALL_ISEL: {
4288 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
4289 const DebugLoc &DL = MI.getDebugLoc();
4290
4291 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
4292
4293 MachineInstrBuilder MIB;
4294 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
4295
4296 for (const MachineOperand &MO : MI.operands())
4297 MIB.add(MO);
4298
4299 MIB.cloneMemRefs(MI);
4300 MI.eraseFromParent();
4301 return BB;
4302 }
4303 case AMDGPU::V_ADD_CO_U32_e32:
4304 case AMDGPU::V_SUB_CO_U32_e32:
4305 case AMDGPU::V_SUBREV_CO_U32_e32: {
4306 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
4307 const DebugLoc &DL = MI.getDebugLoc();
4308 unsigned Opc = MI.getOpcode();
4309
4310 bool NeedClampOperand = false;
4311 if (TII->pseudoToMCOpcode(Opc) == -1) {
4312 Opc = AMDGPU::getVOPe64(Opc);
4313 NeedClampOperand = true;
4314 }
4315
4316 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
4317 if (TII->isVOP3(*I)) {
4318 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4319 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4320 I.addReg(TRI->getVCC(), RegState::Define);
4321 }
4322 I.add(MI.getOperand(1))
4323 .add(MI.getOperand(2));
4324 if (NeedClampOperand)
4325 I.addImm(0); // clamp bit for e64 encoding
4326
4327 TII->legalizeOperands(*I);
4328
4329 MI.eraseFromParent();
4330 return BB;
4331 }
4332 case AMDGPU::V_ADDC_U32_e32:
4333 case AMDGPU::V_SUBB_U32_e32:
4334 case AMDGPU::V_SUBBREV_U32_e32:
4335 // These instructions have an implicit use of vcc which counts towards the
4336 // constant bus limit.
4337 TII->legalizeOperands(MI);
4338 return BB;
4339 case AMDGPU::DS_GWS_INIT:
4340 case AMDGPU::DS_GWS_SEMA_BR:
4341 case AMDGPU::DS_GWS_BARRIER:
4342 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
4343 LLVM_FALLTHROUGH;
4344 case AMDGPU::DS_GWS_SEMA_V:
4345 case AMDGPU::DS_GWS_SEMA_P:
4346 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
4347 // An s_waitcnt 0 is required to be the instruction immediately following.
4348 if (getSubtarget()->hasGWSAutoReplay()) {
4349 bundleInstWithWaitcnt(MI);
4350 return BB;
4351 }
4352
4353 return emitGWSMemViolTestLoop(MI, BB);
4354 case AMDGPU::S_SETREG_B32: {
4355 // Try to optimize cases that only set the denormal mode or rounding mode.
4356 //
4357 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
4358 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
4359 // instead.
4360 //
4361 // FIXME: This could be predicated on the immediate, but tablegen doesn't
4362 // allow a no-side-effect instruction in the output of a side-effecting
4363 // pattern.
4364 unsigned ID, Offset, Width;
4365 AMDGPU::Hwreg::decodeHwreg(MI.getOperand(1).getImm(), ID, Offset, Width);
4366 if (ID != AMDGPU::Hwreg::ID_MODE)
4367 return BB;
4368
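// Compute the mask of MODE register bits written by this s_setreg.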
4369 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
4370 const unsigned SetMask = WidthMask << Offset;
4371
4372 if (getSubtarget()->hasDenormModeInst()) {
4373 unsigned SetDenormOp = 0;
4374 unsigned SetRoundOp = 0;
4375
4376 // The dedicated instructions can only set the whole denorm or round mode
4377 // at once, not a subset of bits in either.
4378 if (SetMask ==
4379 (AMDGPU::Hwreg::FP_ROUND_MASK | AMDGPU::Hwreg::FP_DENORM_MASK)) {
4380 // If this fully sets both the round and denorm mode, emit the two
4381 // dedicated instructions for these.
4382 SetRoundOp = AMDGPU::S_ROUND_MODE;
4383 SetDenormOp = AMDGPU::S_DENORM_MODE;
4384 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
4385 SetRoundOp = AMDGPU::S_ROUND_MODE;
4386 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
4387 SetDenormOp = AMDGPU::S_DENORM_MODE;
4388 }
4389
4390 if (SetRoundOp || SetDenormOp) {
4391 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
4392 MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
4393 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
4394 unsigned ImmVal = Def->getOperand(1).getImm();
4395 if (SetRoundOp) {
4396 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
4397 .addImm(ImmVal & 0xf);
4398
4399 // If we also have the denorm mode, get just the denorm mode bits.
4400 ImmVal >>= 4;
4401 }
4402
4403 if (SetDenormOp) {
4404 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
4405 .addImm(ImmVal & 0xf);
4406 }
4407
4408 MI.eraseFromParent();
4409 return BB;
4410 }
4411 }
4412 }
4413
4414 // If only FP bits are touched, use the no-side-effects pseudo.
4415 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
4416 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
4417 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
4418
4419 return BB;
4420 }
4421 default:
4422 return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
4423 }
4424 }
4425
4426 bool SITargetLowering::hasBitPreservingFPLogic(EVT VT) const {
4427 return isTypeLegal(VT.getScalarType());
4428 }
4429
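// Returns true if the subtarget has a returning (result-producing) atomic
// fadd instruction for this operation's result type.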
4430 bool SITargetLowering::hasAtomicFaddRtnForTy(SDValue &Op) const {
4431 switch (Op.getValue(0).getSimpleValueType().SimpleTy) {
4432 case MVT::f32:
4433 return Subtarget->hasAtomicFaddRtnInsts();
4434 case MVT::v2f16:
4435 case MVT::f64:
4436 return Subtarget->hasGFX90AInsts();
4437 default:
4438 return false;
4439 }
4440 }
4441
4442 bool SITargetLowering::enableAggressiveFMAFusion(EVT VT) const {
4443 // This currently forces unfolding various combinations of fsub into fma with
4444 // free fneg'd operands. As long as we have fast FMA (controlled by
4445 // isFMAFasterThanFMulAndFAdd), we should perform these.
4446
4447 // When fma is quarter rate, for f64 where add / sub are at best half rate,
4448 // most of these combines appear to be cycle neutral but save on instruction
4449 // count / code size.
4450 return true;
4451 }
4452
4453 bool SITargetLowering::enableAggressiveFMAFusion(LLT Ty) const { return true; }
4454
4455 EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
4456 EVT VT) const {
4457 if (!VT.isVector()) {
4458 return MVT::i1;
4459 }
4460 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
4461 }
4462
4463 MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
4464 // TODO: Should i16 be used always if legal? For now it would force VALU
4465 // shifts.
4466 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
4467 }
4468
4469 LLT SITargetLowering::getPreferredShiftAmountTy(LLT Ty) const {
4470 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
4471 ? Ty.changeElementSize(16)
4472 : Ty.changeElementSize(32);
4473 }
4474
4475 // Answering this is somewhat tricky and depends on the specific device, since
4476 // different devices have different rates for fma and for f64 operations.
4477 //
4478 // v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
4479 // regardless of which device (although the number of cycles differs between
4480 // devices), so it is always profitable for f64.
4481 //
4482 // v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
4483 // only on full rate devices. Normally, we should prefer selecting v_mad_f32
4484 // which we can always do even without fused FP ops since it returns the same
4485 // result as the separate operations and since it is always full
4486 // rate. Therefore, we lie and report that it is not faster for f32. However,
4487 // v_mad_f32 does not support denormals, so we do report fma as faster if we
4488 // have a fast fma device and require denormals.
4489 //
4490 bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
4491 EVT VT) const {
4492 VT = VT.getScalarType();
4493
4494 switch (VT.getSimpleVT().SimpleTy) {
4495 case MVT::f32: {
4496 // If mad is not available this depends only on if f32 fma is full rate.
4497 if (!Subtarget->hasMadMacF32Insts())
4498 return Subtarget->hasFastFMAF32();
4499
4500 // Otherwise f32 mad is always full rate and returns the same result as
4501 // the separate operations so should be preferred over fma.
4502 // However, it does not support denormals.
4503 if (hasFP32Denormals(MF))
4504 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
4505
4506 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
4507 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
4508 }
4509 case MVT::f64:
4510 return true;
4511 case MVT::f16:
4512 return Subtarget->has16BitInsts() && hasFP64FP16Denormals(MF);
4513 default:
4514 break;
4515 }
4516
4517 return false;
4518 }
4519
4520 bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
4521 LLT Ty) const {
4522 switch (Ty.getScalarSizeInBits()) {
4523 case 16:
4524 return isFMAFasterThanFMulAndFAdd(MF, MVT::f16);
4525 case 32:
4526 return isFMAFasterThanFMulAndFAdd(MF, MVT::f32);
4527 case 64:
4528 return isFMAFasterThanFMulAndFAdd(MF, MVT::f64);
4529 default:
4530 break;
4531 }
4532
4533 return false;
4534 }
4535
4536 bool SITargetLowering::isFMADLegal(const MachineInstr &MI, LLT Ty) const {
4537 if (!Ty.isScalar())
4538 return false;
4539
4540 if (Ty.getScalarSizeInBits() == 16)
4541 return Subtarget->hasMadF16() && !hasFP64FP16Denormals(*MI.getMF());
4542 if (Ty.getScalarSizeInBits() == 32)
4543 return Subtarget->hasMadMacF32Insts() && !hasFP32Denormals(*MI.getMF());
4544
4545 return false;
4546 }
4547
4548 bool SITargetLowering::isFMADLegal(const SelectionDAG &DAG,
4549 const SDNode *N) const {
4550 // TODO: Check future ftz flag
4551 // v_mad_f32/v_mac_f32 do not support denormals.
4552 EVT VT = N->getValueType(0);
4553 if (VT == MVT::f32)
4554 return Subtarget->hasMadMacF32Insts() &&
4555 !hasFP32Denormals(DAG.getMachineFunction());
4556 if (VT == MVT::f16) {
4557 return Subtarget->hasMadF16() &&
4558 !hasFP64FP16Denormals(DAG.getMachineFunction());
4559 }
4560
4561 return false;
4562 }
4563
4564 //===----------------------------------------------------------------------===//
4565 // Custom DAG Lowering Operations
4566 //===----------------------------------------------------------------------===//
4567
4568 // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
4569 // wider vector type is legal.
4570 SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
4571 SelectionDAG &DAG) const {
4572 unsigned Opc = Op.getOpcode();
4573 EVT VT = Op.getValueType();
4574 assert(VT == MVT::v4f16 || VT == MVT::v4i16);
4575
4576 SDValue Lo, Hi;
4577 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
4578
4579 SDLoc SL(Op);
4580 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo,
4581 Op->getFlags());
4582 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi,
4583 Op->getFlags());
4584
4585 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
4586 }
4587
4588 // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
4589 // wider vector type is legal.
4590 SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
4591 SelectionDAG &DAG) const {
4592 unsigned Opc = Op.getOpcode();
4593 EVT VT = Op.getValueType();
4594 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
4595 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
4596 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
4597 VT == MVT::v32f32);
4598
4599 SDValue Lo0, Hi0;
4600 std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
4601 SDValue Lo1, Hi1;
4602 std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
4603
4604 SDLoc SL(Op);
4605
4606 SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1,
4607 Op->getFlags());
4608 SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1,
4609 Op->getFlags());
4610
4611 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
4612 }
4613
4614 SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,
4615 SelectionDAG &DAG) const {
4616 unsigned Opc = Op.getOpcode();
4617 EVT VT = Op.getValueType();
4618 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
4619 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
4620 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
4621 VT == MVT::v32f32);
4622
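// Operand 0 may be a scalar; only split it if it is a vector, otherwise use
// the same value for both halves.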
4623 SDValue Lo0, Hi0;
4624 SDValue Op0 = Op.getOperand(0);
4625 std::tie(Lo0, Hi0) = Op0.getValueType().isVector()
4626 ? DAG.SplitVectorOperand(Op.getNode(), 0)
4627 : std::make_pair(Op0, Op0);
4628 SDValue Lo1, Hi1;
4629 std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
4630 SDValue Lo2, Hi2;
4631 std::tie(Lo2, Hi2) = DAG.SplitVectorOperand(Op.getNode(), 2);
4632
4633 SDLoc SL(Op);
4634 auto ResVT = DAG.GetSplitDestVTs(VT);
4635
4636 SDValue OpLo = DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2,
4637 Op->getFlags());
4638 SDValue OpHi = DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2,
4639 Op->getFlags());
4640
4641 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
4642 }
4643
4644
4645 SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
4646 switch (Op.getOpcode()) {
4647 default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
4648 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
4649 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
4650 case ISD::LOAD: {
4651 SDValue Result = LowerLOAD(Op, DAG);
4652 assert((!Result.getNode() ||
4653 Result.getNode()->getNumValues() == 2) &&
4654 "Load should return a value and a chain");
4655 return Result;
4656 }
4657
4658 case ISD::FSIN:
4659 case ISD::FCOS:
4660 return LowerTrig(Op, DAG);
4661 case ISD::SELECT: return LowerSELECT(Op, DAG);
4662 case ISD::FDIV: return LowerFDIV(Op, DAG);
4663 case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
4664 case ISD::STORE: return LowerSTORE(Op, DAG);
4665 case ISD::GlobalAddress: {
4666 MachineFunction &MF = DAG.getMachineFunction();
4667 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
4668 return LowerGlobalAddress(MFI, Op, DAG);
4669 }
4670 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
4671 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
4672 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
4673 case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
4674 case ISD::INSERT_SUBVECTOR:
4675 return lowerINSERT_SUBVECTOR(Op, DAG);
4676 case ISD::INSERT_VECTOR_ELT:
4677 return lowerINSERT_VECTOR_ELT(Op, DAG);
4678 case ISD::EXTRACT_VECTOR_ELT:
4679 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
4680 case ISD::VECTOR_SHUFFLE:
4681 return lowerVECTOR_SHUFFLE(Op, DAG);
4682 case ISD::SCALAR_TO_VECTOR:
4683 return lowerSCALAR_TO_VECTOR(Op, DAG);
4684 case ISD::BUILD_VECTOR:
4685 return lowerBUILD_VECTOR(Op, DAG);
4686 case ISD::FP_ROUND:
4687 return lowerFP_ROUND(Op, DAG);
4688 case ISD::FPTRUNC_ROUND: {
4689 unsigned Opc;
4690 SDLoc DL(Op);
4691
4692 if (Op.getOperand(0)->getValueType(0) != MVT::f32)
4693 return SDValue();
4694
4695 // Get the rounding mode from the last operand
4696 int RoundMode = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
4697 if (RoundMode == (int)RoundingMode::TowardPositive)
4698 Opc = AMDGPUISD::FPTRUNC_ROUND_UPWARD;
4699 else if (RoundMode == (int)RoundingMode::TowardNegative)
4700 Opc = AMDGPUISD::FPTRUNC_ROUND_DOWNWARD;
4701 else
4702 return SDValue();
4703
4704 return DAG.getNode(Opc, DL, Op.getNode()->getVTList(), Op->getOperand(0));
4705 }
4706 case ISD::TRAP:
4707 return lowerTRAP(Op, DAG);
4708 case ISD::DEBUGTRAP:
4709 return lowerDEBUGTRAP(Op, DAG);
4710 case ISD::FABS:
4711 case ISD::FNEG:
4712 case ISD::FCANONICALIZE:
4713 case ISD::BSWAP:
4714 return splitUnaryVectorOp(Op, DAG);
4715 case ISD::FMINNUM:
4716 case ISD::FMAXNUM:
4717 return lowerFMINNUM_FMAXNUM(Op, DAG);
4718 case ISD::FMA:
4719 return splitTernaryVectorOp(Op, DAG);
4720 case ISD::FP_TO_SINT:
4721 case ISD::FP_TO_UINT:
4722 return LowerFP_TO_INT(Op, DAG);
4723 case ISD::SHL:
4724 case ISD::SRA:
4725 case ISD::SRL:
4726 case ISD::ADD:
4727 case ISD::SUB:
4728 case ISD::MUL:
4729 case ISD::SMIN:
4730 case ISD::SMAX:
4731 case ISD::UMIN:
4732 case ISD::UMAX:
4733 case ISD::FADD:
4734 case ISD::FMUL:
4735 case ISD::FMINNUM_IEEE:
4736 case ISD::FMAXNUM_IEEE:
4737 case ISD::UADDSAT:
4738 case ISD::USUBSAT:
4739 case ISD::SADDSAT:
4740 case ISD::SSUBSAT:
4741 return splitBinaryVectorOp(Op, DAG);
4742 case ISD::SMULO:
4743 case ISD::UMULO:
4744 return lowerXMULO(Op, DAG);
4745 case ISD::SMUL_LOHI:
4746 case ISD::UMUL_LOHI:
4747 return lowerXMUL_LOHI(Op, DAG);
4748 case ISD::DYNAMIC_STACKALLOC:
4749 return LowerDYNAMIC_STACKALLOC(Op, DAG);
4750 }
4751 return SDValue();
4752 }
4753
4754 // Used for D16: casts the result of an instruction into the right vector and
4755 // packs values if loads return unpacked values.
4756 static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
4757 const SDLoc &DL,
4758 SelectionDAG &DAG, bool Unpacked) {
4759 if (!LoadVT.isVector())
4760 return Result;
4761
4762 // Cast back to the original packed type or to a larger type that is a
4763 // multiple of 32 bits for D16. Widening the return type is required for
4764 // legalization.
4765 EVT FittingLoadVT = LoadVT;
4766 if ((LoadVT.getVectorNumElements() % 2) == 1) {
4767 FittingLoadVT =
4768 EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
4769 LoadVT.getVectorNumElements() + 1);
4770 }
4771
4772 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
4773 // Truncate to v2i16/v4i16.
4774 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
4775
4776 // Work around the legalizer neither scalarizing the truncate after vector op
4777 // legalization nor creating an intermediate vector trunc.
4778 SmallVector<SDValue, 4> Elts;
4779 DAG.ExtractVectorElements(Result, Elts);
4780 for (SDValue &Elt : Elts)
4781 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
4782
4783 // Pad illegal v1i16/v3f16 to v4i16
4784 if ((LoadVT.getVectorNumElements() % 2) == 1)
4785 Elts.push_back(DAG.getUNDEF(MVT::i16));
4786
4787 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
4788
4789 // Bitcast to original type (v2f16/v4f16).
4790 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
4791 }
4792
4793 // Cast back to the original packed type.
4794 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
4795 }
4796
4797 SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
4798 MemSDNode *M,
4799 SelectionDAG &DAG,
4800 ArrayRef<SDValue> Ops,
4801 bool IsIntrinsic) const {
4802 SDLoc DL(M);
4803
4804 bool Unpacked = Subtarget->hasUnpackedD16VMem();
4805 EVT LoadVT = M->getValueType(0);
4806
4807 EVT EquivLoadVT = LoadVT;
4808 if (LoadVT.isVector()) {
4809 if (Unpacked) {
4810 EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
4811 LoadVT.getVectorNumElements());
4812 } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
4813 // Widen v3f16 to legal type
4814 EquivLoadVT =
4815 EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
4816 LoadVT.getVectorNumElements() + 1);
4817 }
4818 }
4819
4820 // Change from v4f16/v2f16 to EquivLoadVT.
4821 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
4822
4823 SDValue Load
4824 = DAG.getMemIntrinsicNode(
4825 IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL,
4826 VTList, Ops, M->getMemoryVT(),
4827 M->getMemOperand());
4828
4829 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
4830
4831 return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL);
4832 }
4833
4834 SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
4835 SelectionDAG &DAG,
4836 ArrayRef<SDValue> Ops) const {
4837 SDLoc DL(M);
4838 EVT LoadVT = M->getValueType(0);
4839 EVT EltType = LoadVT.getScalarType();
4840 EVT IntVT = LoadVT.changeTypeToInteger();
4841
4842 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
4843
4844 unsigned Opc =
4845 IsFormat ? AMDGPUISD::BUFFER_LOAD_FORMAT : AMDGPUISD::BUFFER_LOAD;
4846
4847 if (IsD16) {
4848 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
4849 }
4850
4851 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
4852 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
4853 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);
4854
4855 if (isTypeLegal(LoadVT)) {
4856 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
4857 M->getMemOperand(), DAG);
4858 }
4859
4860 EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
4861 SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
4862 SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
4863 M->getMemOperand(), DAG);
4864 return DAG.getMergeValues(
4865 {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
4866 DL);
4867 }
4868
4869 static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI,
4870 SDNode *N, SelectionDAG &DAG) {
4871 EVT VT = N->getValueType(0);
4872 const auto *CD = cast<ConstantSDNode>(N->getOperand(3));
4873 unsigned CondCode = CD->getZExtValue();
4874 if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
4875 return DAG.getUNDEF(VT);
4876
4877 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
4878
4879 SDValue LHS = N->getOperand(1);
4880 SDValue RHS = N->getOperand(2);
4881
4882 SDLoc DL(N);
4883
4884 EVT CmpVT = LHS.getValueType();
4885 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
4886 unsigned PromoteOp = ICmpInst::isSigned(IcInput) ?
4887 ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4888 LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
4889 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
4890 }
4891
4892 ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
4893
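// The comparison result is a lane mask, i.e. an integer as wide as the
// wavefront.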
4894 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
4895 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
4896
4897 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
4898 DAG.getCondCode(CCOpcode));
4899 if (VT.bitsEq(CCVT))
4900 return SetCC;
4901 return DAG.getZExtOrTrunc(SetCC, DL, VT);
4902 }
4903
4904 static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI,
4905 SDNode *N, SelectionDAG &DAG) {
4906 EVT VT = N->getValueType(0);
4907 const auto *CD = cast<ConstantSDNode>(N->getOperand(3));
4908
4909 unsigned CondCode = CD->getZExtValue();
4910 if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
4911 return DAG.getUNDEF(VT);
4912
4913 SDValue Src0 = N->getOperand(1);
4914 SDValue Src1 = N->getOperand(2);
4915 EVT CmpVT = Src0.getValueType();
4916 SDLoc SL(N);
4917
4918 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
4919 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
4920 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
4921 }
4922
4923 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
4924 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
4925 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
4926 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
4927 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0,
4928 Src1, DAG.getCondCode(CCOpcode));
4929 if (VT.bitsEq(CCVT))
4930 return SetCC;
4931 return DAG.getZExtOrTrunc(SetCC, SL, VT);
4932 }
4933
4934 static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N,
4935 SelectionDAG &DAG) {
4936 EVT VT = N->getValueType(0);
4937 SDValue Src = N->getOperand(1);
4938 SDLoc SL(N);
4939
4940 if (Src.getOpcode() == ISD::SETCC) {
4941 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
4942 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0),
4943 Src.getOperand(1), Src.getOperand(2));
4944 }
4945 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
4946 // (ballot 0) -> 0
4947 if (Arg->isZero())
4948 return DAG.getConstant(0, SL, VT);
4949
4950 // (ballot 1) -> EXEC/EXEC_LO
4951 if (Arg->isOne()) {
4952 Register Exec;
4953 if (VT.getScalarSizeInBits() == 32)
4954 Exec = AMDGPU::EXEC_LO;
4955 else if (VT.getScalarSizeInBits() == 64)
4956 Exec = AMDGPU::EXEC;
4957 else
4958 return SDValue();
4959
4960 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);
4961 }
4962 }
4963
4964 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
4965 // ISD::SETNE)
4966 return DAG.getNode(
4967 AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
4968 DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
4969 }
4970
4971 void SITargetLowering::ReplaceNodeResults(SDNode *N,
4972 SmallVectorImpl<SDValue> &Results,
4973 SelectionDAG &DAG) const {
4974 switch (N->getOpcode()) {
4975 case ISD::INSERT_VECTOR_ELT: {
4976 if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
4977 Results.push_back(Res);
4978 return;
4979 }
4980 case ISD::EXTRACT_VECTOR_ELT: {
4981 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
4982 Results.push_back(Res);
4983 return;
4984 }
4985 case ISD::INTRINSIC_WO_CHAIN: {
4986 unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
4987 switch (IID) {
4988 case Intrinsic::amdgcn_cvt_pkrtz: {
4989 SDValue Src0 = N->getOperand(1);
4990 SDValue Src1 = N->getOperand(2);
4991 SDLoc SL(N);
4992 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32,
4993 Src0, Src1);
4994 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
4995 return;
4996 }
4997 case Intrinsic::amdgcn_cvt_pknorm_i16:
4998 case Intrinsic::amdgcn_cvt_pknorm_u16:
4999 case Intrinsic::amdgcn_cvt_pk_i16:
5000 case Intrinsic::amdgcn_cvt_pk_u16: {
5001 SDValue Src0 = N->getOperand(1);
5002 SDValue Src1 = N->getOperand(2);
5003 SDLoc SL(N);
5004 unsigned Opcode;
5005
5006 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
5007 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
5008 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
5009 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
5010 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
5011 Opcode = AMDGPUISD::CVT_PK_I16_I32;
5012 else
5013 Opcode = AMDGPUISD::CVT_PK_U16_U32;
5014
5015 EVT VT = N->getValueType(0);
5016 if (isTypeLegal(VT))
5017 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
5018 else {
5019 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
5020 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
5021 }
5022 return;
5023 }
5024 }
5025 break;
5026 }
5027 case ISD::INTRINSIC_W_CHAIN: {
5028 if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
5029 if (Res.getOpcode() == ISD::MERGE_VALUES) {
5030 // FIXME: Hacky
5031 for (unsigned I = 0; I < Res.getNumOperands(); I++) {
5032 Results.push_back(Res.getOperand(I));
5033 }
5034 } else {
5035 Results.push_back(Res);
5036 Results.push_back(Res.getValue(1));
5037 }
5038 return;
5039 }
5040
5041 break;
5042 }
5043 case ISD::SELECT: {
5044 SDLoc SL(N);
5045 EVT VT = N->getValueType(0);
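// Legalize the select by bitcasting the operands to an equivalent type,
// widening to i32 if needed, selecting in that type, and casting back.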
5046 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
5047 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
5048 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
5049
5050 EVT SelectVT = NewVT;
5051 if (NewVT.bitsLT(MVT::i32)) {
5052 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
5053 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
5054 SelectVT = MVT::i32;
5055 }
5056
5057 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, SelectVT,
5058 N->getOperand(0), LHS, RHS);
5059
5060 if (NewVT != SelectVT)
5061 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
5062 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
5063 return;
5064 }
5065 case ISD::FNEG: {
5066 if (N->getValueType(0) != MVT::v2f16)
5067 break;
5068
5069 SDLoc SL(N);
5070 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
5071
5072 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32,
5073 BC,
5074 DAG.getConstant(0x80008000, SL, MVT::i32));
5075 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
5076 return;
5077 }
5078 case ISD::FABS: {
5079 if (N->getValueType(0) != MVT::v2f16)
5080 break;
5081
5082 SDLoc SL(N);
5083 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
5084
5085 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32,
5086 BC,
5087 DAG.getConstant(0x7fff7fff, SL, MVT::i32));
5088 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
5089 return;
5090 }
5091 default:
5092 break;
5093 }
5094 }
5095
5096 /// Helper function for LowerBRCOND
5097 static SDNode *findUser(SDValue Value, unsigned Opcode) {
5098
5099 SDNode *Parent = Value.getNode();
5100 for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
5101 I != E; ++I) {
5102
5103 if (I.getUse().get() != Value)
5104 continue;
5105
5106 if (I->getOpcode() == Opcode)
5107 return *I;
5108 }
5109 return nullptr;
5110 }
5111
5112 unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
5113 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
5114 switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
5115 case Intrinsic::amdgcn_if:
5116 return AMDGPUISD::IF;
5117 case Intrinsic::amdgcn_else:
5118 return AMDGPUISD::ELSE;
5119 case Intrinsic::amdgcn_loop:
5120 return AMDGPUISD::LOOP;
5121 case Intrinsic::amdgcn_end_cf:
5122 llvm_unreachable("should not occur");
5123 default:
5124 return 0;
5125 }
5126 }
5127
5128 // break, if_break, else_break are all only used as inputs to loop, not
5129 // directly as branch conditions.
5130 return 0;
5131 }
5132
5133 bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
5134 const Triple &TT = getTargetMachine().getTargetTriple();
5135 return (GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
5136 GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
5137 AMDGPU::shouldEmitConstantsToTextSection(TT);
5138 }
5139
5140 bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
5141 // FIXME: Either avoid relying on address space here or change the default
5142 // address space for functions to avoid the explicit check.
5143 return (GV->getValueType()->isFunctionTy() ||
5144 !isNonGlobalAddrSpace(GV->getAddressSpace())) &&
5145 !shouldEmitFixup(GV) &&
5146 !getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
5147 }
5148
5149 bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
5150 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
5151 }
5152
5153 bool SITargetLowering::shouldUseLDSConstAddress(const GlobalValue *GV) const {
5154 if (!GV->hasExternalLinkage())
5155 return true;
5156
5157 const auto OS = getTargetMachine().getTargetTriple().getOS();
5158 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
5159 }
5160
5161 /// This transforms the control flow intrinsics to get the branch destination as
5162 /// the last parameter, and also switches the branch target with BR if the need arises.
5163 SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
5164 SelectionDAG &DAG) const {
5165 SDLoc DL(BRCOND);
5166
5167 SDNode *Intr = BRCOND.getOperand(1).getNode();
5168 SDValue Target = BRCOND.getOperand(2);
5169 SDNode *BR = nullptr;
5170 SDNode *SetCC = nullptr;
5171
5172 if (Intr->getOpcode() == ISD::SETCC) {
5173 // As long as we negate the condition everything is fine
5174 SetCC = Intr;
5175 Intr = SetCC->getOperand(0).getNode();
5176
5177 } else {
5178 // Get the target from BR if we don't negate the condition
5179 BR = findUser(BRCOND, ISD::BR);
5180 assert(BR && "brcond missing unconditional branch user");
5181 Target = BR->getOperand(1);
5182 }
5183
5184 unsigned CFNode = isCFIntrinsic(Intr);
5185 if (CFNode == 0) {
5186 // This is a uniform branch so we don't need to legalize.
5187 return BRCOND;
5188 }
5189
5190 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
5191 Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
5192
5193 assert(!SetCC ||
5194 (SetCC->getConstantOperandVal(1) == 1 &&
5195 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
5196 ISD::SETNE));
5197
5198 // operands of the new intrinsic call
5199 SmallVector<SDValue, 4> Ops;
5200 if (HaveChain)
5201 Ops.push_back(BRCOND.getOperand(0));
5202
5203 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
5204 Ops.push_back(Target);
5205
5206 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
5207
5208 // build the new intrinsic call
5209 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
5210
5211 if (!HaveChain) {
5212 SDValue Ops[] = {
5213 SDValue(Result, 0),
5214 BRCOND.getOperand(0)
5215 };
5216
5217 Result = DAG.getMergeValues(Ops, DL).getNode();
5218 }
5219
5220 if (BR) {
5221 // Give the branch instruction our target
5222 SDValue Ops[] = {
5223 BR->getOperand(0),
5224 BRCOND.getOperand(2)
5225 };
5226 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
5227 DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
5228 }
5229
5230 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
5231
5232 // Copy the intrinsic results to registers
5233 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
5234 SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
5235 if (!CopyToReg)
5236 continue;
5237
5238 Chain = DAG.getCopyToReg(
5239 Chain, DL,
5240 CopyToReg->getOperand(1),
5241 SDValue(Result, i - 1),
5242 SDValue());
5243
5244 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
5245 }
5246
5247 // Remove the old intrinsic from the chain
5248 DAG.ReplaceAllUsesOfValueWith(
5249 SDValue(Intr, Intr->getNumValues() - 1),
5250 Intr->getOperand(0));
5251
5252 return Chain;
5253 }
5254
5255 SDValue SITargetLowering::LowerRETURNADDR(SDValue Op,
5256 SelectionDAG &DAG) const {
5257 MVT VT = Op.getSimpleValueType();
5258 SDLoc DL(Op);
5259 // Checking the depth
5260 if (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() != 0)
5261 return DAG.getConstant(0, DL, VT);
5262
5263 MachineFunction &MF = DAG.getMachineFunction();
5264 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
5265 // Check for kernel and shader functions
5266 if (Info->isEntryFunction())
5267 return DAG.getConstant(0, DL, VT);
5268
5269 MachineFrameInfo &MFI = MF.getFrameInfo();
5270 // There is a call to @llvm.returnaddress in this function
5271 MFI.setReturnAddressIsTaken(true);
5272
5273 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
5274 // Get the return address reg and mark it as an implicit live-in
5275 Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF), getRegClassFor(VT, Op.getNode()->isDivergent()));
5276
5277 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
5278 }
5279
5280 SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG,
5281 SDValue Op,
5282 const SDLoc &DL,
5283 EVT VT) const {
5284 return Op.getValueType().bitsLE(VT) ?
5285 DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) :
5286 DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
5287 DAG.getTargetConstant(0, DL, MVT::i32));
5288 }
5289
5290 SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
5291 assert(Op.getValueType() == MVT::f16 &&
5292 "Do not know how to custom lower FP_ROUND for non-f16 type");
5293
5294 SDValue Src = Op.getOperand(0);
5295 EVT SrcVT = Src.getValueType();
5296 if (SrcVT != MVT::f64)
5297 return Op;
5298
5299 SDLoc DL(Op);
5300
5301 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
5302 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
5303 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
5304 }
5305
5306 SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
5307 SelectionDAG &DAG) const {
5308 EVT VT = Op.getValueType();
5309 const MachineFunction &MF = DAG.getMachineFunction();
5310 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
5311 bool IsIEEEMode = Info->getMode().IEEE;
5312
5313 // FIXME: Assert during selection that this is only selected for
5314 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
5315 // mode functions, but this happens to be OK since it's only done in cases
5316 // where there is known no sNaN.
5317 if (IsIEEEMode)
5318 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
5319
5320 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16)
5321 return splitBinaryVectorOp(Op, DAG);
5322 return Op;
5323 }
5324
5325 SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
5326 EVT VT = Op.getValueType();
5327 SDLoc SL(Op);
5328 SDValue LHS = Op.getOperand(0);
5329 SDValue RHS = Op.getOperand(1);
5330 bool isSigned = Op.getOpcode() == ISD::SMULO;
5331
5332 if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
5333 const APInt &C = RHSC->getAPIntValue();
5334 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
5335 if (C.isPowerOf2()) {
5336 // smulo(x, signed_min) is the same as umulo(x, signed_min).
5337 bool UseArithShift = isSigned && !C.isMinSignedValue();
5338 SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
5339 SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
5340 SDValue Overflow = DAG.getSetCC(SL, MVT::i1,
5341 DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL,
5342 SL, VT, Result, ShiftAmt),
5343 LHS, ISD::SETNE);
5344 return DAG.getMergeValues({ Result, Overflow }, SL);
5345 }
5346 }
5347
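// General case: the low half is a plain multiply; overflow occurred if the
// high half differs from the sign-extension of the result (signed) or from
// zero (unsigned).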
5348 SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
5349 SDValue Top = DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU,
5350 SL, VT, LHS, RHS);
5351
5352 SDValue Sign = isSigned
5353 ? DAG.getNode(ISD::SRA, SL, VT, Result,
5354 DAG.getConstant(VT.getScalarSizeInBits() - 1, SL, MVT::i32))
5355 : DAG.getConstant(0, SL, VT);
5356 SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
5357
5358 return DAG.getMergeValues({ Result, Overflow }, SL);
5359 }
5360
5361 SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
5362 if (Op->isDivergent()) {
5363 // Select to V_MAD_[IU]64_[IU]32.
5364 return Op;
5365 }
5366 if (Subtarget->hasSMulHi()) {
5367 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
5368 return SDValue();
5369 }
5370 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
5371 // calculate the high part, so we might as well do the whole thing with
5372 // V_MAD_[IU]64_[IU]32.
5373 return Op;
5374 }
5375
5376 SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
5377 if (!Subtarget->isTrapHandlerEnabled() ||
5378 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
5379 return lowerTrapEndpgm(Op, DAG);
5380
5381 if (Optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(Subtarget)) {
5382 switch (*HsaAbiVer) {
5383 case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
5384 case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
5385 return lowerTrapHsaQueuePtr(Op, DAG);
5386 case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
5387 case ELF::ELFABIVERSION_AMDGPU_HSA_V5:
5388 return Subtarget->supportsGetDoorbellID() ?
5389 lowerTrapHsa(Op, DAG) : lowerTrapHsaQueuePtr(Op, DAG);
5390 }
5391 }
5392
5393 llvm_unreachable("Unknown trap handler");
5394 }
5395
5396 SDValue SITargetLowering::lowerTrapEndpgm(
5397 SDValue Op, SelectionDAG &DAG) const {
5398 SDLoc SL(Op);
5399 SDValue Chain = Op.getOperand(0);
5400 return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain);
5401 }
5402
5403 SDValue SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
5404 const SDLoc &DL, Align Alignment, ImplicitParameter Param) const {
5405 MachineFunction &MF = DAG.getMachineFunction();
5406 uint64_t Offset = getImplicitParameterOffset(MF, Param);
5407 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
5408 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
5409 return DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, PtrInfo, Alignment,
5410 MachineMemOperand::MODereferenceable |
5411 MachineMemOperand::MOInvariant);
5412 }
5413
5414 SDValue SITargetLowering::lowerTrapHsaQueuePtr(
5415 SDValue Op, SelectionDAG &DAG) const {
5416 SDLoc SL(Op);
5417 SDValue Chain = Op.getOperand(0);
5418
5419 SDValue QueuePtr;
5420 // For code object version 5, QueuePtr is passed through implicit kernarg.
5421 if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) {
5422 QueuePtr =
5423 loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
5424 } else {
5425 MachineFunction &MF = DAG.getMachineFunction();
5426 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
5427 Register UserSGPR = Info->getQueuePtrUserSGPR();
5428
5429 if (UserSGPR == AMDGPU::NoRegister) {
5430 // We probably are in a function incorrectly marked with
5431 // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
5432 // trap, so just use a null pointer.
5433 QueuePtr = DAG.getConstant(0, SL, MVT::i64);
5434 } else {
5435 QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
5436 MVT::i64);
5437 }
5438 }
5439
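// Pass the queue pointer to the trap handler in SGPR0_SGPR1, which is where
// the HSA trap handler ABI expects it.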
5440 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
5441 SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01,
5442 QueuePtr, SDValue());
5443
5444 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
5445 SDValue Ops[] = {
5446 ToReg,
5447 DAG.getTargetConstant(TrapID, SL, MVT::i16),
5448 SGPR01,
5449 ToReg.getValue(1)
5450 };
5451 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
5452 }
5453
5454 SDValue SITargetLowering::lowerTrapHsa(
5455 SDValue Op, SelectionDAG &DAG) const {
5456 SDLoc SL(Op);
5457 SDValue Chain = Op.getOperand(0);
5458
5459 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
5460 SDValue Ops[] = {
5461 Chain,
5462 DAG.getTargetConstant(TrapID, SL, MVT::i16)
5463 };
5464 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
5465 }
5466
5467 SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
5468 SDLoc SL(Op);
5469 SDValue Chain = Op.getOperand(0);
5470 MachineFunction &MF = DAG.getMachineFunction();
5471
5472 if (!Subtarget->isTrapHandlerEnabled() ||
5473 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
5474 DiagnosticInfoUnsupported NoTrap(MF.getFunction(),
5475 "debugtrap handler not supported",
5476 Op.getDebugLoc(),
5477 DS_Warning);
5478 LLVMContext &Ctx = MF.getFunction().getContext();
5479 Ctx.diagnose(NoTrap);
5480 return Chain;
5481 }
5482
5483 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap);
5484 SDValue Ops[] = {
5485 Chain,
5486 DAG.getTargetConstant(TrapID, SL, MVT::i16)
5487 };
5488 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
5489 }
5490
5491 SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
5492 SelectionDAG &DAG) const {
5493 // FIXME: Use inline constants (src_{shared, private}_base) instead.
5494 if (Subtarget->hasApertureRegs()) {
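// The MEM_BASES hardware register field holds the upper bits of the 32-bit
// aperture base; read it with s_getreg and shift the value back into position.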
5495 unsigned Offset = AS == AMDGPUAS::LOCAL_ADDRESS ?
5496 AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
5497 AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
5498 unsigned WidthM1 = AS == AMDGPUAS::LOCAL_ADDRESS ?
5499 AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
5500 AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
5501 unsigned Encoding =
5502 AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
5503 Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
5504 WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
5505
5506 SDValue EncodingImm = DAG.getTargetConstant(Encoding, DL, MVT::i16);
5507 SDValue ApertureReg = SDValue(
5508 DAG.getMachineNode(AMDGPU::S_GETREG_B32, DL, MVT::i32, EncodingImm), 0);
5509 SDValue ShiftAmount = DAG.getTargetConstant(WidthM1 + 1, DL, MVT::i32);
5510 return DAG.getNode(ISD::SHL, DL, MVT::i32, ApertureReg, ShiftAmount);
5511 }
5512
5513 // For code object version 5, private_base and shared_base are passed through
5514 // implicit kernargs.
5515 if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) {
5516 ImplicitParameter Param =
5517 (AS == AMDGPUAS::LOCAL_ADDRESS) ? SHARED_BASE : PRIVATE_BASE;
5518 return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
5519 }
5520
5521 MachineFunction &MF = DAG.getMachineFunction();
5522 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
5523 Register UserSGPR = Info->getQueuePtrUserSGPR();
5524 if (UserSGPR == AMDGPU::NoRegister) {
5525 // We probably are in a function incorrectly marked with
5526 // amdgpu-no-queue-ptr. This is undefined.
5527 return DAG.getUNDEF(MVT::i32);
5528 }
5529
5530 SDValue QueuePtr = CreateLiveInRegister(
5531 DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
5532
5533 // Offset into amd_queue_t for group_segment_aperture_base_hi /
5534 // private_segment_aperture_base_hi.
5535 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
5536
5537 SDValue Ptr =
5538 DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::Fixed(StructOffset));
5539
5540 // TODO: Use custom target PseudoSourceValue.
5541 // TODO: We should use the value from the IR intrinsic call, but it might not
5542 // be available, and it is not clear how we would get it.
5543 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
5544 return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
5545 commonAlignment(Align(64), StructOffset),
5546 MachineMemOperand::MODereferenceable |
5547 MachineMemOperand::MOInvariant);
5548 }
5549
5550 /// Return true if the value is a known valid address, such that a null check is
5551 /// not necessary.
5552 static bool isKnownNonNull(SDValue Val, SelectionDAG &DAG,
5553 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
5554 if (isa<FrameIndexSDNode>(Val) || isa<GlobalAddressSDNode>(Val) ||
5555 isa<BasicBlockSDNode>(Val))
5556 return true;
5557
5558 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
5559 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
5560
5561 // TODO: Search through arithmetic, handle arguments and loads
5562 // marked nonnull.
5563 return false;
5564 }
5565
5566 SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
5567 SelectionDAG &DAG) const {
5568 SDLoc SL(Op);
5569 const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op);
5570
5571 SDValue Src = ASC->getOperand(0);
5572 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
5573 unsigned SrcAS = ASC->getSrcAddressSpace();
5574
5575 const AMDGPUTargetMachine &TM =
5576 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
5577
5578 // flat -> local/private
5579 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
5580 unsigned DestAS = ASC->getDestAddressSpace();
5581
5582 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
5583 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
5584 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
5585
5586 if (isKnownNonNull(Src, DAG, TM, SrcAS))
5587 return Ptr;
5588
5589 unsigned NullVal = TM.getNullPointerValue(DestAS);
5590 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
5591 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
5592
5593 return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
5594 SegmentNullPtr);
5595 }
5596 }
5597
5598 // local/private -> flat
5599 if (ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
5600 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
5601 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
5602
5603 SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), SL, DAG);
5604 SDValue CvtPtr =
5605 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
5606 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
5607
5608 if (isKnownNonNull(Src, DAG, TM, SrcAS))
5609 return CvtPtr;
5610
5611 unsigned NullVal = TM.getNullPointerValue(SrcAS);
5612 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
5613
5614 SDValue NonNull
5615 = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
5616
5617 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
5618 FlatNullPtr);
5619 }
5620 }
5621
5622 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
5623 Op.getValueType() == MVT::i64) {
5624 const SIMachineFunctionInfo *Info =
5625 DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
5626 SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
5627 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
5628 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
5629 }
5630
5631 if (ASC->getDestAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
5632 Src.getValueType() == MVT::i64)
5633 return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
5634
5635 // global <-> flat are no-ops and never emitted.
5636
5637 const MachineFunction &MF = DAG.getMachineFunction();
5638 DiagnosticInfoUnsupported InvalidAddrSpaceCast(
5639 MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
5640 DAG.getContext()->diagnose(InvalidAddrSpaceCast);
5641
5642 return DAG.getUNDEF(ASC->getValueType(0));
5643 }
5644
5645 // This lowers an INSERT_SUBVECTOR by extracting the individual elements from
5646 // the small vector and inserting them into the big vector. That is better than
5647 // the default expansion of doing it via a stack slot. Even though the use of
5648 // the stack slot would be optimized away afterwards, the stack slot itself
5649 // remains.
5650 SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
5651 SelectionDAG &DAG) const {
5652 SDValue Vec = Op.getOperand(0);
5653 SDValue Ins = Op.getOperand(1);
5654 SDValue Idx = Op.getOperand(2);
5655 EVT VecVT = Vec.getValueType();
5656 EVT InsVT = Ins.getValueType();
5657 EVT EltVT = VecVT.getVectorElementType();
5658 unsigned InsNumElts = InsVT.getVectorNumElements();
5659 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
5660 SDLoc SL(Op);
5661
5662 for (unsigned I = 0; I != InsNumElts; ++I) {
5663 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
5664 DAG.getConstant(I, SL, MVT::i32));
5665 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
5666 DAG.getConstant(IdxVal + I, SL, MVT::i32));
5667 }
5668 return Vec;
5669 }
5670
5671 SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
5672 SelectionDAG &DAG) const {
5673 SDValue Vec = Op.getOperand(0);
5674 SDValue InsVal = Op.getOperand(1);
5675 SDValue Idx = Op.getOperand(2);
5676 EVT VecVT = Vec.getValueType();
5677 EVT EltVT = VecVT.getVectorElementType();
5678 unsigned VecSize = VecVT.getSizeInBits();
5679 unsigned EltSize = EltVT.getSizeInBits();
5680 SDLoc SL(Op);
5681
5682 // Specially handle the case of v4i16 with static indexing.
5683 unsigned NumElts = VecVT.getVectorNumElements();
5684 auto KIdx = dyn_cast<ConstantSDNode>(Idx);
5685 if (NumElts == 4 && EltSize == 16 && KIdx) {
5686 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
5687
5688 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
5689 DAG.getConstant(0, SL, MVT::i32));
5690 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
5691 DAG.getConstant(1, SL, MVT::i32));
5692
5693 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
5694 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
5695
5696 unsigned Idx = KIdx->getZExtValue();
5697 bool InsertLo = Idx < 2;
5698 SDValue InsHalf = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16,
5699 InsertLo ? LoVec : HiVec,
5700 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
5701 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
5702
5703 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
5704
5705 SDValue Concat = InsertLo ?
5706 DAG.getBuildVector(MVT::v2i32, SL, { InsHalf, HiHalf }) :
5707 DAG.getBuildVector(MVT::v2i32, SL, { LoHalf, InsHalf });
5708
5709 return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
5710 }
5711
5712 // Static indexing does not lower to stack access, and hence there is no need
5713 // for special custom lowering to avoid stack access.
5714 if (isa<ConstantSDNode>(Idx))
5715 return SDValue();
5716
5717 // Avoid stack access for dynamic indexing by custom lowering to
5718 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
5719
5720 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
5721
5722 MVT IntVT = MVT::getIntegerVT(VecSize);
5723
5724 // Convert vector index to bit-index and get the required bit mask.
5725 assert(isPowerOf2_32(EltSize));
5726 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
5727 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
5728 SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
5729 DAG.getConstant(0xffff, SL, IntVT),
5730 ScaledIdx);
5731
5732 // 1. Create a congruent vector with the target value in each element.
5733 SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
5734 DAG.getSplatBuildVector(VecVT, SL, InsVal));
5735
5736 // 2. Mask off all other indices except the required index within (1).
5737 SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
5738
5739 // 3. Mask off the required index within the target vector.
5740 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
5741 SDValue RHS = DAG.getNode(ISD::AND, SL, IntVT,
5742 DAG.getNOT(SL, BFM, IntVT), BCVec);
5743
5744 // 4. Get (2) and (3) ORed into the target vector.
5745 SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS);
5746
5747 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
5748 }
5749
5750 SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
5751 SelectionDAG &DAG) const {
5752 SDLoc SL(Op);
5753
5754 EVT ResultVT = Op.getValueType();
5755 SDValue Vec = Op.getOperand(0);
5756 SDValue Idx = Op.getOperand(1);
5757 EVT VecVT = Vec.getValueType();
5758 unsigned VecSize = VecVT.getSizeInBits();
5759 EVT EltVT = VecVT.getVectorElementType();
5760
5761 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
5762
5763 // Make sure we do any optimizations that will make it easier to fold
5764 // source modifiers before obscuring it with bit operations.
5765
5766 // XXX - Why doesn't this get called when vector_shuffle is expanded?
5767 if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
5768 return Combined;
5769
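// For 128-bit and 256-bit source vectors, split the vector in half, select
// the half that contains the element, and extract from that half with the
// index masked into range.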
5770 if (VecSize == 128 || VecSize == 256) {
5771 SDValue Lo, Hi;
5772 EVT LoVT, HiVT;
5773 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VecVT);
5774
5775 if (VecSize == 128) {
5776 SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
5777 Lo = DAG.getBitcast(LoVT,
5778 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
5779 DAG.getConstant(0, SL, MVT::i32)));
5780 Hi = DAG.getBitcast(HiVT,
5781 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
5782 DAG.getConstant(1, SL, MVT::i32)));
5783 } else {
5784 assert(VecSize == 256);
5785
5786 SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
5787 SDValue Parts[4];
5788 for (unsigned P = 0; P < 4; ++P) {
5789 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
5790 DAG.getConstant(P, SL, MVT::i32));
5791 }
5792
5793 Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
5794 Parts[0], Parts[1]));
5795 Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
5796 Parts[2], Parts[3]));
5797 }
5798
5799 EVT IdxVT = Idx.getValueType();
5800 unsigned NElem = VecVT.getVectorNumElements();
5801 assert(isPowerOf2_32(NElem));
5802 SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
5803 SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
5804 SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
5805 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
5806 }
5807
5808 assert(VecSize <= 64);
5809
5810 MVT IntVT = MVT::getIntegerVT(VecSize);
5811
5812 // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
5813 SDValue VecBC = peekThroughBitcasts(Vec);
5814 if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
5815 SDValue Src = VecBC.getOperand(0);
5816 Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
5817 Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
5818 }
5819
5820 unsigned EltSize = EltVT.getSizeInBits();
5821 assert(isPowerOf2_32(EltSize));
5822
5823 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
5824
5825 // Convert vector index to bit-index (* EltSize)
5826 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
5827
5828 SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
5829 SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
5830
5831 if (ResultVT == MVT::f16) {
5832 SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
5833 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
5834 }
5835
5836 return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
5837 }
5838
5839 static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
5840 assert(Elt % 2 == 0);
5841 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
5842 }
5843
5844 SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
5845 SelectionDAG &DAG) const {
5846 SDLoc SL(Op);
5847 EVT ResultVT = Op.getValueType();
5848 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
5849
5850 EVT PackVT = ResultVT.isInteger() ? MVT::v2i16 : MVT::v2f16;
5851 EVT EltVT = PackVT.getVectorElementType();
5852 int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
5853
5854 // vector_shuffle <0,1,6,7> lhs, rhs
5855 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
5856 //
5857 // vector_shuffle <6,7,2,3> lhs, rhs
5858 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
5859 //
5860 // vector_shuffle <6,7,0,1> lhs, rhs
5861 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
5862
5863 // Avoid scalarizing when both halves are reading from consecutive elements.
5864 SmallVector<SDValue, 4> Pieces;
5865 for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
5866 if (elementPairIsContiguous(SVN->getMask(), I)) {
5867 const int Idx = SVN->getMaskElt(I);
5868 int VecIdx = Idx < SrcNumElts ? 0 : 1;
5869 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
5870 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL,
5871 PackVT, SVN->getOperand(VecIdx),
5872 DAG.getConstant(EltIdx, SL, MVT::i32));
5873 Pieces.push_back(SubVec);
5874 } else {
5875 const int Idx0 = SVN->getMaskElt(I);
5876 const int Idx1 = SVN->getMaskElt(I + 1);
5877 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
5878 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
5879 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
5880 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
5881
5882 SDValue Vec0 = SVN->getOperand(VecIdx0);
5883 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
5884 Vec0, DAG.getConstant(EltIdx0, SL, MVT::i32));
5885
5886 SDValue Vec1 = SVN->getOperand(VecIdx1);
5887 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
5888 Vec1, DAG.getConstant(EltIdx1, SL, MVT::i32));
5889 Pieces.push_back(DAG.getBuildVector(PackVT, SL, { Elt0, Elt1 }));
5890 }
5891 }
5892
5893 return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
5894 }
5895
5896 SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
5897 SelectionDAG &DAG) const {
5898 SDValue SVal = Op.getOperand(0);
5899 EVT ResultVT = Op.getValueType();
5900 EVT SValVT = SVal.getValueType();
5901 SDValue UndefVal = DAG.getUNDEF(SValVT);
5902 SDLoc SL(Op);
5903
5904 SmallVector<SDValue, 8> VElts;
5905 VElts.push_back(SVal);
5906 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
5907 VElts.push_back(UndefVal);
5908
5909 return DAG.getBuildVector(ResultVT, SL, VElts);
5910 }
5911
5912 SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
5913 SelectionDAG &DAG) const {
5914 SDLoc SL(Op);
5915 EVT VT = Op.getValueType();
5916
5917 if (VT == MVT::v4i16 || VT == MVT::v4f16 ||
5918 VT == MVT::v8i16 || VT == MVT::v8f16) {
5919 EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(),
5920 VT.getVectorNumElements() / 2);
5921 MVT HalfIntVT = MVT::getIntegerVT(HalfVT.getSizeInBits());
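    // For example, v4f16 splits into HalfVT == v2f16 with HalfIntVT == i32,
    // and v8i16 splits into HalfVT == v4i16 with HalfIntVT == i64.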
5922
5923 // Turn into pair of packed build_vectors.
5924 // TODO: Special case for constants that can be materialized with s_mov_b64.
5925 SmallVector<SDValue, 4> LoOps, HiOps;
5926 for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I != E; ++I) {
5927 LoOps.push_back(Op.getOperand(I));
5928 HiOps.push_back(Op.getOperand(I + E));
5929 }
5930 SDValue Lo = DAG.getBuildVector(HalfVT, SL, LoOps);
5931 SDValue Hi = DAG.getBuildVector(HalfVT, SL, HiOps);
5932
5933 SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, HalfIntVT, Lo);
5934 SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, HalfIntVT, Hi);
5935
5936 SDValue Blend = DAG.getBuildVector(MVT::getVectorVT(HalfIntVT, 2), SL,
5937 { CastLo, CastHi });
5938 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
5939 }
5940
5941 if (VT == MVT::v16i16 || VT == MVT::v16f16) {
5942 EVT QuarterVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(),
5943 VT.getVectorNumElements() / 4);
5944 MVT QuarterIntVT = MVT::getIntegerVT(QuarterVT.getSizeInBits());
5945
5946 SmallVector<SDValue, 4> Parts[4];
5947 for (unsigned I = 0, E = VT.getVectorNumElements() / 4; I != E; ++I) {
5948 for (unsigned P = 0; P < 4; ++P)
5949 Parts[P].push_back(Op.getOperand(I + P * E));
5950 }
5951 SDValue Casts[4];
5952 for (unsigned P = 0; P < 4; ++P) {
5953 SDValue Vec = DAG.getBuildVector(QuarterVT, SL, Parts[P]);
5954 Casts[P] = DAG.getNode(ISD::BITCAST, SL, QuarterIntVT, Vec);
5955 }
5956
5957 SDValue Blend =
5958 DAG.getBuildVector(MVT::getVectorVT(QuarterIntVT, 4), SL, Casts);
5959 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
5960 }
5961
5962 assert(VT == MVT::v2f16 || VT == MVT::v2i16);
5963 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
5964
5965 SDValue Lo = Op.getOperand(0);
5966 SDValue Hi = Op.getOperand(1);
5967
5968 // Avoid adding defined bits with the zero_extend.
5969 if (Hi.isUndef()) {
5970 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
5971 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
5972 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
5973 }
5974
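  // Pack the two 16-bit halves into an i32 as (Hi << 16) | Lo, zero-extending
  // each half so the OR below cannot pick up stray high bits.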
5975 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
5976 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
5977
5978 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
5979 DAG.getConstant(16, SL, MVT::i32));
5980 if (Lo.isUndef())
5981 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
5982
5983 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
5984 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
5985
5986 SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi);
5987 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
5988 }
5989
5990 bool
5991 SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
5992 // We can fold offsets for anything that doesn't require a GOT relocation.
5993 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
5994 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
5995 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
5996 !shouldEmitGOTReloc(GA->getGlobal());
5997 }
5998
5999 static SDValue
6000 buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
6001 const SDLoc &DL, int64_t Offset, EVT PtrVT,
6002 unsigned GAFlags = SIInstrInfo::MO_NONE) {
6003 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
6004 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
6005 // lowered to the following code sequence:
6006 //
6007 // For constant address space:
6008 // s_getpc_b64 s[0:1]
6009 // s_add_u32 s0, s0, $symbol
6010 // s_addc_u32 s1, s1, 0
6011 //
6012 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
6013 // a fixup or relocation is emitted to replace $symbol with a literal
6014 // constant, which is a pc-relative offset from the encoding of the $symbol
6015 // operand to the global variable.
6016 //
6017 // For global address space:
6018 // s_getpc_b64 s[0:1]
6019 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
6020 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
6021 //
6022 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
6023 // fixups or relocations are emitted to replace $symbol@*@lo and
6024 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
6025 // which is a 64-bit pc-relative offset from the encoding of the $symbol
6026 // operand to the global variable.
6027 //
6028 // What we want here is an offset from the value returned by s_getpc
6029 // (which is the address of the s_add_u32 instruction) to the global
6030 // variable, but since the encoding of $symbol starts 4 bytes after the start
6031 // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
6032 // small. This requires us to add 4 to the global variable offset in order to
6033 // compute the correct address. Similarly for the s_addc_u32 instruction, the
6034 // encoding of $symbol starts 12 bytes after the start of the s_add_u32
6035 // instruction.
6036 SDValue PtrLo =
6037 DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, GAFlags);
6038 SDValue PtrHi;
6039 if (GAFlags == SIInstrInfo::MO_NONE) {
6040 PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
6041 } else {
6042 PtrHi =
6043 DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 12, GAFlags + 1);
6044 }
6045 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
6046 }
6047
6048 SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
6049 SDValue Op,
6050 SelectionDAG &DAG) const {
6051 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
6052 SDLoc DL(GSD);
6053 EVT PtrVT = Op.getValueType();
6054
6055 const GlobalValue *GV = GSD->getGlobal();
6056 if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
6057 shouldUseLDSConstAddress(GV)) ||
6058 GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
6059 GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
6060 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
6061 GV->hasExternalLinkage()) {
6062 Type *Ty = GV->getValueType();
6063 // HIP uses an unsized array `extern __shared__ T s[]` or a similar
6064 // zero-sized type in other languages to declare dynamic shared
6065 // memory whose size is not known at compile time. Such arrays are
6066 // allocated by the runtime and placed directly after the statically
6067 // allocated ones, and they all share the same offset.
6068 if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
6069 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
6070 // Adjust alignment for that dynamic shared memory array.
6071 MFI->setDynLDSAlign(DAG.getDataLayout(), *cast<GlobalVariable>(GV));
6072 return SDValue(
6073 DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
6074 }
6075 }
6076 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
6077 }
6078
6079 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
6080 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
6081 SIInstrInfo::MO_ABS32_LO);
6082 return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
6083 }
6084
6085 if (shouldEmitFixup(GV))
6086 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
6087 else if (shouldEmitPCReloc(GV))
6088 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
6089 SIInstrInfo::MO_REL32);
6090
6091 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
6092 SIInstrInfo::MO_GOTPCREL32);
6093
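  // For GOT-relocated globals, GOTAddr is the pc-relative address of the GOT
  // entry; the global's actual address is then loaded from that entry in
  // constant address space below.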
6094 Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
6095 PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
6096 const DataLayout &DataLayout = DAG.getDataLayout();
6097 Align Alignment = DataLayout.getABITypeAlign(PtrTy);
6098 MachinePointerInfo PtrInfo
6099 = MachinePointerInfo::getGOT(DAG.getMachineFunction());
6100
6101 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
6102 MachineMemOperand::MODereferenceable |
6103 MachineMemOperand::MOInvariant);
6104 }
6105
6106 SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
6107 const SDLoc &DL, SDValue V) const {
6108 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
6109 // the destination register.
6110 //
6111 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
6112 // so we will end up with redundant moves to m0.
6113 //
6114 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
6115
6116 // A Null SDValue creates a glue result.
6117 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
6118 V, Chain);
6119 return SDValue(M0, 0);
6120 }
6121
6122 SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
6123 SDValue Op,
6124 MVT VT,
6125 unsigned Offset) const {
6126 SDLoc SL(Op);
6127 SDValue Param = lowerKernargMemParameter(
6128 DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
6129 // The local size values will have the hi 16-bits as zero.
6130 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
6131 DAG.getValueType(VT));
6132 }
6133
6134 static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
6135 EVT VT) {
6136 DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
6137 "non-hsa intrinsic with hsa target",
6138 DL.getDebugLoc());
6139 DAG.getContext()->diagnose(BadIntrin);
6140 return DAG.getUNDEF(VT);
6141 }
6142
6143 static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
6144 EVT VT) {
6145 DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
6146 "intrinsic not supported on subtarget",
6147 DL.getDebugLoc());
6148 DAG.getContext()->diagnose(BadIntrin);
6149 return DAG.getUNDEF(VT);
6150 }
6151
6152 static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
6153 ArrayRef<SDValue> Elts) {
6154 assert(!Elts.empty());
6155 MVT Type;
6156 unsigned NumElts = Elts.size();
6157
6158 if (NumElts <= 8) {
6159 Type = MVT::getVectorVT(MVT::f32, NumElts);
6160 } else {
6161 assert(Elts.size() <= 16);
6162 Type = MVT::v16f32;
6163 NumElts = 16;
6164 }
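  // For example, 3 address dwords produce a v3f32 build_vector, while 9
  // address dwords are padded with undef up to v16f32. Non-f32 inputs are
  // bitcast to f32 first.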
6165
6166 SmallVector<SDValue, 16> VecElts(NumElts);
6167 for (unsigned i = 0; i < Elts.size(); ++i) {
6168 SDValue Elt = Elts[i];
6169 if (Elt.getValueType() != MVT::f32)
6170 Elt = DAG.getBitcast(MVT::f32, Elt);
6171 VecElts[i] = Elt;
6172 }
6173 for (unsigned i = Elts.size(); i < NumElts; ++i)
6174 VecElts[i] = DAG.getUNDEF(MVT::f32);
6175
6176 if (NumElts == 1)
6177 return VecElts[0];
6178 return DAG.getBuildVector(Type, DL, VecElts);
6179 }
6180
6181 static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
6182 SDValue Src, int ExtraElts) {
6183 EVT SrcVT = Src.getValueType();
6184
6185 SmallVector<SDValue, 8> Elts;
6186
6187 if (SrcVT.isVector())
6188 DAG.ExtractVectorElements(Src, Elts);
6189 else
6190 Elts.push_back(Src);
6191
6192 SDValue Undef = DAG.getUNDEF(SrcVT.getScalarType());
6193 while (ExtraElts--)
6194 Elts.push_back(Undef);
6195
6196 return DAG.getBuildVector(CastVT, DL, Elts);
6197 }
6198
6199 // Reconstruct the required return value for an image load intrinsic.
6200 // This is more complicated due to the optional use of TexFailCtrl, which
6201 // means the required return type is an aggregate.
6202 static SDValue constructRetValue(SelectionDAG &DAG,
6203 MachineSDNode *Result,
6204 ArrayRef<EVT> ResultTypes,
6205 bool IsTexFail, bool Unpacked, bool IsD16,
6206 int DMaskPop, int NumVDataDwords,
6207 const SDLoc &DL) {
6208 // Determine the required return type. This is the same regardless of the IsTexFail flag.
6209 EVT ReqRetVT = ResultTypes[0];
6210 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
6211 int NumDataDwords = (!IsD16 || (IsD16 && Unpacked)) ?
6212 ReqRetNumElts : (ReqRetNumElts + 1) / 2;
6213
6214 int MaskPopDwords = (!IsD16 || (IsD16 && Unpacked)) ?
6215 DMaskPop : (DMaskPop + 1) / 2;
6216
6217 MVT DataDwordVT = NumDataDwords == 1 ?
6218 MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
6219
6220 MVT MaskPopVT = MaskPopDwords == 1 ?
6221 MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
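  // For example, with packed D16 data (IsD16 && !Unpacked) and DMaskPop == 3,
  // MaskPopDwords == (3 + 1) / 2 == 2, since two 16-bit components share a
  // dword.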
6222
6223 SDValue Data(Result, 0);
6224 SDValue TexFail;
6225
6226 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
6227 SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
6228 if (MaskPopVT.isVector()) {
6229 Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
6230 SDValue(Result, 0), ZeroIdx);
6231 } else {
6232 Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
6233 SDValue(Result, 0), ZeroIdx);
6234 }
6235 }
6236
6237 if (DataDwordVT.isVector())
6238 Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
6239 NumDataDwords - MaskPopDwords);
6240
6241 if (IsD16)
6242 Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
6243
6244 EVT LegalReqRetVT = ReqRetVT;
6245 if (!ReqRetVT.isVector()) {
6246 if (!Data.getValueType().isInteger())
6247 Data = DAG.getNode(ISD::BITCAST, DL,
6248 Data.getValueType().changeTypeToInteger(), Data);
6249 Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
6250 } else {
6251 // We need to widen the return vector to a legal type
6252 if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
6253 ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
6254 LegalReqRetVT =
6255 EVT::getVectorVT(*DAG.getContext(), ReqRetVT.getVectorElementType(),
6256 ReqRetVT.getVectorNumElements() + 1);
6257 }
6258 }
6259 Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);
6260
6261 if (IsTexFail) {
6262 TexFail =
6263 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
6264 DAG.getConstant(MaskPopDwords, DL, MVT::i32));
6265
6266 return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
6267 }
6268
6269 if (Result->getNumValues() == 1)
6270 return Data;
6271
6272 return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
6273 }
6274
6275 static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
6276 SDValue *LWE, bool &IsTexFail) {
6277 auto TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
6278
6279 uint64_t Value = TexFailCtrlConst->getZExtValue();
6280 if (Value) {
6281 IsTexFail = true;
6282 }
6283
6284 SDLoc DL(TexFailCtrlConst);
6285 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
6286 Value &= ~(uint64_t)0x1;
6287 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
6288 Value &= ~(uint64_t)0x2;
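  // Bit 0 of the texfailctrl immediate enables TFE and bit 1 enables LWE; if
  // any other bit is set, the function returns false and the caller bails.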
6289
6290 return Value == 0;
6291 }
6292
6293 static void packImage16bitOpsToDwords(SelectionDAG &DAG, SDValue Op,
6294 MVT PackVectorVT,
6295 SmallVectorImpl<SDValue> &PackedAddrs,
6296 unsigned DimIdx, unsigned EndIdx,
6297 unsigned NumGradients) {
6298 SDLoc DL(Op);
6299 for (unsigned I = DimIdx; I < EndIdx; I++) {
6300 SDValue Addr = Op.getOperand(I);
6301
6302 // Gradients are packed with undef for each coordinate.
6303 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
6304 // 1D: undef,dx/dh; undef,dx/dv
6305 // 2D: dy/dh,dx/dh; dy/dv,dx/dv
6306 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
6307 if (((I + 1) >= EndIdx) ||
6308 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
6309 I == DimIdx + NumGradients - 1))) {
6310 if (Addr.getValueType() != MVT::i16)
6311 Addr = DAG.getBitcast(MVT::i16, Addr);
6312 Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
6313 } else {
6314 Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
6315 I++;
6316 }
6317 Addr = DAG.getBitcast(MVT::f32, Addr);
6318 PackedAddrs.push_back(Addr);
6319 }
6320 }
6321
6322 SDValue SITargetLowering::lowerImage(SDValue Op,
6323 const AMDGPU::ImageDimIntrinsicInfo *Intr,
6324 SelectionDAG &DAG, bool WithChain) const {
6325 SDLoc DL(Op);
6326 MachineFunction &MF = DAG.getMachineFunction();
6327 const GCNSubtarget* ST = &MF.getSubtarget<GCNSubtarget>();
6328 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
6329 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
6330 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
6331 unsigned IntrOpcode = Intr->BaseOpcode;
6332 bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
6333 bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
6334
6335 SmallVector<EVT, 3> ResultTypes(Op->values());
6336 SmallVector<EVT, 3> OrigResultTypes(Op->values());
6337 bool IsD16 = false;
6338 bool IsG16 = false;
6339 bool IsA16 = false;
6340 SDValue VData;
6341 int NumVDataDwords;
6342 bool AdjustRetType = false;
6343
6344 // Offset of intrinsic arguments
6345 const unsigned ArgOffset = WithChain ? 2 : 1;
6346
6347 unsigned DMask;
6348 unsigned DMaskLanes = 0;
6349
6350 if (BaseOpcode->Atomic) {
6351 VData = Op.getOperand(2);
6352
6353 bool Is64Bit = VData.getValueType() == MVT::i64;
6354 if (BaseOpcode->AtomicX2) {
6355 SDValue VData2 = Op.getOperand(3);
6356 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
6357 {VData, VData2});
6358 if (Is64Bit)
6359 VData = DAG.getBitcast(MVT::v4i32, VData);
6360
6361 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
6362 DMask = Is64Bit ? 0xf : 0x3;
6363 NumVDataDwords = Is64Bit ? 4 : 2;
6364 } else {
6365 DMask = Is64Bit ? 0x3 : 0x1;
6366 NumVDataDwords = Is64Bit ? 2 : 1;
6367 }
6368 } else {
6369 auto *DMaskConst =
6370 cast<ConstantSDNode>(Op.getOperand(ArgOffset + Intr->DMaskIndex));
6371 DMask = DMaskConst->getZExtValue();
6372 DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);
6373
6374 if (BaseOpcode->Store) {
6375 VData = Op.getOperand(2);
6376
6377 MVT StoreVT = VData.getSimpleValueType();
6378 if (StoreVT.getScalarType() == MVT::f16) {
6379 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
6380 return Op; // D16 is unsupported for this instruction
6381
6382 IsD16 = true;
6383 VData = handleD16VData(VData, DAG, true);
6384 }
6385
6386 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
6387 } else {
6388 // Work out the num dwords based on the dmask popcount and underlying type
6389 // and whether packing is supported.
6390 MVT LoadVT = ResultTypes[0].getSimpleVT();
6391 if (LoadVT.getScalarType() == MVT::f16) {
6392 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
6393 return Op; // D16 is unsupported for this instruction
6394
6395 IsD16 = true;
6396 }
6397
6398 // Confirm that the return type is large enough for the dmask specified
6399 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
6400 (!LoadVT.isVector() && DMaskLanes > 1))
6401 return Op;
6402
6403 // The sq block of gfx8 and gfx9 does not estimate register use correctly
6404 // for d16 image_gather4, image_gather4_l, and image_gather4_lz
6405 // instructions.
6406 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
6407 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
6408 NumVDataDwords = (DMaskLanes + 1) / 2;
6409 else
6410 NumVDataDwords = DMaskLanes;
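      // For example, with packed D16 and dmask 0b0111, the return value needs
      // (3 + 1) / 2 == 2 dwords, whereas unpacked D16 would need 3.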
6411
6412 AdjustRetType = true;
6413 }
6414 }
6415
6416 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
6417 SmallVector<SDValue, 4> VAddrs;
6418
6419 // Check for 16 bit addresses or derivatives and pack if true.
6420 MVT VAddrVT =
6421 Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
6422 MVT VAddrScalarVT = VAddrVT.getScalarType();
6423 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
6424 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
6425
6426 VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
6427 VAddrScalarVT = VAddrVT.getScalarType();
6428 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
6429 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
6430
6431 // Push back extra arguments.
6432 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
6433 if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
6434 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
6435 // Special handling of bias when A16 is on. Bias is of type half but
6436 // occupies a full 32-bit dword.
6437 SDValue Bias = DAG.getBuildVector(
6438 MVT::v2f16, DL,
6439 {Op.getOperand(ArgOffset + I), DAG.getUNDEF(MVT::f16)});
6440 VAddrs.push_back(Bias);
6441 } else {
6442 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
6443 "Bias needs to be converted to 16 bit in A16 mode");
6444 VAddrs.push_back(Op.getOperand(ArgOffset + I));
6445 }
6446 }
6447
6448 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
6449 // 16 bit gradients are supported, but are tied to the A16 control,
6450 // so both gradients and addresses must be 16 bit.
6451 LLVM_DEBUG(
6452 dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
6453 "require 16 bit args for both gradients and addresses");
6454 return Op;
6455 }
6456
6457 if (IsA16) {
6458 if (!ST->hasA16()) {
6459 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
6460 "support 16 bit addresses\n");
6461 return Op;
6462 }
6463 }
6464
6465 // We've dealt with incorrect input, so we know that if IsA16 or IsG16
6466 // is set then we have to compress/pack operands (either address,
6467 // gradient, or both).
6468 // In the case where a16 and gradients are tied (no G16 support), we have
6469 // already verified that both IsA16 and IsG16 are true.
6470 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
6471 // Activate g16
6472 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
6473 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
6474 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
6475 }
6476
6477 // Add gradients (packed or unpacked)
6478 if (IsG16) {
6479 // Pack the gradients
6480 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
6481 packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
6482 ArgOffset + Intr->GradientStart,
6483 ArgOffset + Intr->CoordStart, Intr->NumGradients);
6484 } else {
6485 for (unsigned I = ArgOffset + Intr->GradientStart;
6486 I < ArgOffset + Intr->CoordStart; I++)
6487 VAddrs.push_back(Op.getOperand(I));
6488 }
6489
6490 // Add addresses (packed or unpacked)
6491 if (IsA16) {
6492 packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
6493 ArgOffset + Intr->CoordStart, VAddrEnd,
6494 0 /* No gradients */);
6495 } else {
6496 // Add uncompressed address
6497 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
6498 VAddrs.push_back(Op.getOperand(I));
6499 }
6500
6501 // If the register allocator cannot place the address registers contiguously
6502 // without introducing moves, then using the non-sequential address encoding
6503 // is always preferable, since it saves VALU instructions and is usually a
6504 // wash in terms of code size or even better.
6505 //
6506 // However, we currently have no way of hinting to the register allocator that
6507 // MIMG addresses should be placed contiguously when it is possible to do so,
6508 // so force non-NSA for the common 2-address case as a heuristic.
6509 //
6510 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
6511 // allocation when possible.
6512 //
6513 // TODO: we can actually allow partial NSA where the final register is a
6514 // contiguous set of the remaining addresses.
6515 // This could help where there are more addresses than supported.
6516 bool UseNSA = ST->hasFeature(AMDGPU::FeatureNSAEncoding) &&
6517 VAddrs.size() >= 3 &&
6518 VAddrs.size() <= (unsigned)ST->getNSAMaxSize();
6519 SDValue VAddr;
6520 if (!UseNSA)
6521 VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
6522
6523 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
6524 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
6525 SDValue Unorm;
6526 if (!BaseOpcode->Sampler) {
6527 Unorm = True;
6528 } else {
6529 auto UnormConst =
6530 cast<ConstantSDNode>(Op.getOperand(ArgOffset + Intr->UnormIndex));
6531
6532 Unorm = UnormConst->getZExtValue() ? True : False;
6533 }
6534
6535 SDValue TFE;
6536 SDValue LWE;
6537 SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
6538 bool IsTexFail = false;
6539 if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
6540 return Op;
6541
6542 if (IsTexFail) {
6543 if (!DMaskLanes) {
6544 // We expect to get an error flag since TFC is on and dmask is 0.
6545 // Force dmask to be at least 1, otherwise the instruction will fail.
6546 DMask = 0x1;
6547 DMaskLanes = 1;
6548 NumVDataDwords = 1;
6549 }
6550 NumVDataDwords += 1;
6551 AdjustRetType = true;
6552 }
6553
6554 // Something earlier tagged that the return type needs adjusting.
6555 // This happens if the instruction is a load or has set TexFailCtrl flags.
6556 if (AdjustRetType) {
6557 // NumVDataDwords reflects the true number of dwords required in the return type
6558 if (DMaskLanes == 0 && !BaseOpcode->Store) {
6559 // This is a no-op load. This can be eliminated
6560 SDValue Undef = DAG.getUNDEF(Op.getValueType());
6561 if (isa<MemSDNode>(Op))
6562 return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
6563 return Undef;
6564 }
6565
6566 EVT NewVT = NumVDataDwords > 1 ?
6567 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumVDataDwords)
6568 : MVT::i32;
6569
6570 ResultTypes[0] = NewVT;
6571 if (ResultTypes.size() == 3) {
6572 // The original result was an aggregate type used for TexFailCtrl results.
6573 // The actual instruction returns a vector type, which has now been
6574 // created. Remove the aggregate result.
6575 ResultTypes.erase(&ResultTypes[1]);
6576 }
6577 }
6578
6579 unsigned CPol = cast<ConstantSDNode>(
6580 Op.getOperand(ArgOffset + Intr->CachePolicyIndex))->getZExtValue();
6581 if (BaseOpcode->Atomic)
6582 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
6583 if (CPol & ~AMDGPU::CPol::ALL)
6584 return Op;
6585
6586 SmallVector<SDValue, 26> Ops;
6587 if (BaseOpcode->Store || BaseOpcode->Atomic)
6588 Ops.push_back(VData); // vdata
6589 if (UseNSA)
6590 append_range(Ops, VAddrs);
6591 else
6592 Ops.push_back(VAddr);
6593 Ops.push_back(Op.getOperand(ArgOffset + Intr->RsrcIndex));
6594 if (BaseOpcode->Sampler)
6595 Ops.push_back(Op.getOperand(ArgOffset + Intr->SampIndex));
6596 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
6597 if (IsGFX10Plus)
6598 Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
6599 Ops.push_back(Unorm);
6600 Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
6601 Ops.push_back(IsA16 && // r128, a16 for gfx9
6602 ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
6603 if (IsGFX10Plus)
6604 Ops.push_back(IsA16 ? True : False);
6605 if (!Subtarget->hasGFX90AInsts()) {
6606 Ops.push_back(TFE); //tfe
6607 } else if (cast<ConstantSDNode>(TFE)->getZExtValue()) {
6608 report_fatal_error("TFE is not supported on this GPU");
6609 }
6610 Ops.push_back(LWE); // lwe
6611 if (!IsGFX10Plus)
6612 Ops.push_back(DimInfo->DA ? True : False);
6613 if (BaseOpcode->HasD16)
6614 Ops.push_back(IsD16 ? True : False);
6615 if (isa<MemSDNode>(Op))
6616 Ops.push_back(Op.getOperand(0)); // chain
6617
6618 int NumVAddrDwords =
6619 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
6620 int Opcode = -1;
6621
6622 if (IsGFX11Plus) {
6623 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
6624 UseNSA ? AMDGPU::MIMGEncGfx11NSA
6625 : AMDGPU::MIMGEncGfx11Default,
6626 NumVDataDwords, NumVAddrDwords);
6627 } else if (IsGFX10Plus) {
6628 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
6629 UseNSA ? AMDGPU::MIMGEncGfx10NSA
6630 : AMDGPU::MIMGEncGfx10Default,
6631 NumVDataDwords, NumVAddrDwords);
6632 } else {
6633 if (Subtarget->hasGFX90AInsts()) {
6634 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
6635 NumVDataDwords, NumVAddrDwords);
6636 if (Opcode == -1)
6637 report_fatal_error(
6638 "requested image instruction is not supported on this GPU");
6639 }
6640 if (Opcode == -1 &&
6641 Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
6642 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
6643 NumVDataDwords, NumVAddrDwords);
6644 if (Opcode == -1)
6645 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
6646 NumVDataDwords, NumVAddrDwords);
6647 }
6648 assert(Opcode != -1);
6649
6650 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
6651 if (auto MemOp = dyn_cast<MemSDNode>(Op)) {
6652 MachineMemOperand *MemRef = MemOp->getMemOperand();
6653 DAG.setNodeMemRefs(NewNode, {MemRef});
6654 }
6655
6656 if (BaseOpcode->AtomicX2) {
6657 SmallVector<SDValue, 1> Elt;
6658 DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
6659 return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
6660 }
6661 if (BaseOpcode->Store)
6662 return SDValue(NewNode, 0);
6663 return constructRetValue(DAG, NewNode,
6664 OrigResultTypes, IsTexFail,
6665 Subtarget->hasUnpackedD16VMem(), IsD16,
6666 DMaskLanes, NumVDataDwords, DL);
6667 }
6668
6669 SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
6670 SDValue Offset, SDValue CachePolicy,
6671 SelectionDAG &DAG) const {
6672 MachineFunction &MF = DAG.getMachineFunction();
6673
6674 const DataLayout &DataLayout = DAG.getDataLayout();
6675 Align Alignment =
6676 DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
6677
6678 MachineMemOperand *MMO = MF.getMachineMemOperand(
6679 MachinePointerInfo(),
6680 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
6681 MachineMemOperand::MOInvariant,
6682 VT.getStoreSize(), Alignment);
6683
6684 if (!Offset->isDivergent()) {
6685 SDValue Ops[] = {
6686 Rsrc,
6687 Offset, // Offset
6688 CachePolicy
6689 };
6690
6691 // Widen vec3 load to vec4.
6692 if (VT.isVector() && VT.getVectorNumElements() == 3) {
6693 EVT WidenedVT =
6694 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
6695 auto WidenedOp = DAG.getMemIntrinsicNode(
6696 AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
6697 MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
6698 auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
6699 DAG.getVectorIdxConstant(0, DL));
6700 return Subvector;
6701 }
6702
6703 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
6704 DAG.getVTList(VT), Ops, VT, MMO);
6705 }
6706
6707 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
6708 // assume that the buffer is unswizzled.
6709 SmallVector<SDValue, 4> Loads;
6710 unsigned NumLoads = 1;
6711 MVT LoadVT = VT.getSimpleVT();
6712 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
6713 assert((LoadVT.getScalarType() == MVT::i32 ||
6714 LoadVT.getScalarType() == MVT::f32));
6715
6716 if (NumElts == 8 || NumElts == 16) {
6717 NumLoads = NumElts / 4;
6718 LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
6719 }
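  // For example, a divergent v8f32 s_buffer_load becomes two v4f32 buffer
  // loads 16 bytes apart, concatenated back together at the end.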
6720
6721 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});
6722 SDValue Ops[] = {
6723 DAG.getEntryNode(), // Chain
6724 Rsrc, // rsrc
6725 DAG.getConstant(0, DL, MVT::i32), // vindex
6726 {}, // voffset
6727 {}, // soffset
6728 {}, // offset
6729 CachePolicy, // cachepolicy
6730 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
6731 };
6732
6733 // Use the alignment to ensure that the required offsets will fit into the
6734 // immediate offsets.
6735 setBufferOffsets(Offset, DAG, &Ops[3],
6736 NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
6737
6738 uint64_t InstOffset = cast<ConstantSDNode>(Ops[5])->getZExtValue();
6739 for (unsigned i = 0; i < NumLoads; ++i) {
6740 Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
6741 Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
6742 LoadVT, MMO, DAG));
6743 }
6744
6745 if (NumElts == 8 || NumElts == 16)
6746 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
6747
6748 return Loads[0];
6749 }
6750
6751 SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
6752 unsigned Dim,
6753 const ArgDescriptor &Arg) const {
6754 SDLoc SL(Op);
6755 MachineFunction &MF = DAG.getMachineFunction();
6756 unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
6757 if (MaxID == 0)
6758 return DAG.getConstant(0, SL, MVT::i32);
6759
6760 SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
6761 SDLoc(DAG.getEntryNode()), Arg);
6762
6763 // Don't bother inserting AssertZext for packed IDs since we're emitting the
6764 // masking operations anyway.
6765 //
6766 // TODO: We could assert the top bit is 0 for the source copy.
6767 if (Arg.isMasked())
6768 return Val;
6769
6770 // Preserve the known bits after expansion to a copy.
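  // For example, with MaxID == 1023 the ID fits in 10 bits, so we assert a
  // zero-extension from i10 (32 - countLeadingZeros(1023) == 10).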
6771 EVT SmallVT =
6772 EVT::getIntegerVT(*DAG.getContext(), 32 - countLeadingZeros(MaxID));
6773 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
6774 DAG.getValueType(SmallVT));
6775 }
6776
6777 SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
6778 SelectionDAG &DAG) const {
6779 MachineFunction &MF = DAG.getMachineFunction();
6780 auto MFI = MF.getInfo<SIMachineFunctionInfo>();
6781
6782 EVT VT = Op.getValueType();
6783 SDLoc DL(Op);
6784 unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
6785
6786 // TODO: Should this propagate fast-math-flags?
6787
6788 switch (IntrinsicID) {
6789 case Intrinsic::amdgcn_implicit_buffer_ptr: {
6790 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
6791 return emitNonHSAIntrinsicError(DAG, DL, VT);
6792 return getPreloadedValue(DAG, *MFI, VT,
6793 AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
6794 }
6795 case Intrinsic::amdgcn_dispatch_ptr:
6796 case Intrinsic::amdgcn_queue_ptr: {
6797 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
6798 DiagnosticInfoUnsupported BadIntrin(
6799 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
6800 DL.getDebugLoc());
6801 DAG.getContext()->diagnose(BadIntrin);
6802 return DAG.getUNDEF(VT);
6803 }
6804
6805 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
6806 AMDGPUFunctionArgInfo::DISPATCH_PTR : AMDGPUFunctionArgInfo::QUEUE_PTR;
6807 return getPreloadedValue(DAG, *MFI, VT, RegID);
6808 }
6809 case Intrinsic::amdgcn_implicitarg_ptr: {
6810 if (MFI->isEntryFunction())
6811 return getImplicitArgPtr(DAG, DL);
6812 return getPreloadedValue(DAG, *MFI, VT,
6813 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
6814 }
6815 case Intrinsic::amdgcn_kernarg_segment_ptr: {
6816 if (!AMDGPU::isKernel(MF.getFunction().getCallingConv())) {
6817 // This only makes sense to call in a kernel, so just lower to null.
6818 return DAG.getConstant(0, DL, VT);
6819 }
6820
6821 return getPreloadedValue(DAG, *MFI, VT,
6822 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
6823 }
6824 case Intrinsic::amdgcn_dispatch_id: {
6825 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
6826 }
6827 case Intrinsic::amdgcn_rcp:
6828 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
6829 case Intrinsic::amdgcn_rsq:
6830 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
6831 case Intrinsic::amdgcn_rsq_legacy:
6832 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
6833 return emitRemovedIntrinsicError(DAG, DL, VT);
6834 return SDValue();
6835 case Intrinsic::amdgcn_rcp_legacy:
6836 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
6837 return emitRemovedIntrinsicError(DAG, DL, VT);
6838 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
6839 case Intrinsic::amdgcn_rsq_clamp: {
6840 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
6841 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
6842
6843 Type *Type = VT.getTypeForEVT(*DAG.getContext());
6844 APFloat Max = APFloat::getLargest(Type->getFltSemantics());
6845 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
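    // On targets without the clamped rsq instruction, emulate it: clamp the
    // rsq result into [-FLT_MAX, +FLT_MAX] with fminnum/fmaxnum so infinities
    // become the largest finite values.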
6846
6847 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
6848 SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq,
6849 DAG.getConstantFP(Max, DL, VT));
6850 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
6851 DAG.getConstantFP(Min, DL, VT));
6852 }
6853 case Intrinsic::r600_read_ngroups_x:
6854 if (Subtarget->isAmdHsaOS())
6855 return emitNonHSAIntrinsicError(DAG, DL, VT);
6856
6857 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
6858 SI::KernelInputOffsets::NGROUPS_X, Align(4),
6859 false);
6860 case Intrinsic::r600_read_ngroups_y:
6861 if (Subtarget->isAmdHsaOS())
6862 return emitNonHSAIntrinsicError(DAG, DL, VT);
6863
6864 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
6865 SI::KernelInputOffsets::NGROUPS_Y, Align(4),
6866 false);
6867 case Intrinsic::r600_read_ngroups_z:
6868 if (Subtarget->isAmdHsaOS())
6869 return emitNonHSAIntrinsicError(DAG, DL, VT);
6870
6871 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
6872 SI::KernelInputOffsets::NGROUPS_Z, Align(4),
6873 false);
6874 case Intrinsic::r600_read_global_size_x:
6875 if (Subtarget->isAmdHsaOS())
6876 return emitNonHSAIntrinsicError(DAG, DL, VT);
6877
6878 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
6879 SI::KernelInputOffsets::GLOBAL_SIZE_X,
6880 Align(4), false);
6881 case Intrinsic::r600_read_global_size_y:
6882 if (Subtarget->isAmdHsaOS())
6883 return emitNonHSAIntrinsicError(DAG, DL, VT);
6884
6885 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
6886 SI::KernelInputOffsets::GLOBAL_SIZE_Y,
6887 Align(4), false);
6888 case Intrinsic::r600_read_global_size_z:
6889 if (Subtarget->isAmdHsaOS())
6890 return emitNonHSAIntrinsicError(DAG, DL, VT);
6891
6892 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
6893 SI::KernelInputOffsets::GLOBAL_SIZE_Z,
6894 Align(4), false);
6895 case Intrinsic::r600_read_local_size_x:
6896 if (Subtarget->isAmdHsaOS())
6897 return emitNonHSAIntrinsicError(DAG, DL, VT);
6898
6899 return lowerImplicitZextParam(DAG, Op, MVT::i16,
6900 SI::KernelInputOffsets::LOCAL_SIZE_X);
6901 case Intrinsic::r600_read_local_size_y:
6902 if (Subtarget->isAmdHsaOS())
6903 return emitNonHSAIntrinsicError(DAG, DL, VT);
6904
6905 return lowerImplicitZextParam(DAG, Op, MVT::i16,
6906 SI::KernelInputOffsets::LOCAL_SIZE_Y);
6907 case Intrinsic::r600_read_local_size_z:
6908 if (Subtarget->isAmdHsaOS())
6909 return emitNonHSAIntrinsicError(DAG, DL, VT);
6910
6911 return lowerImplicitZextParam(DAG, Op, MVT::i16,
6912 SI::KernelInputOffsets::LOCAL_SIZE_Z);
6913 case Intrinsic::amdgcn_workgroup_id_x:
6914 return getPreloadedValue(DAG, *MFI, VT,
6915 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
6916 case Intrinsic::amdgcn_workgroup_id_y:
6917 return getPreloadedValue(DAG, *MFI, VT,
6918 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
6919 case Intrinsic::amdgcn_workgroup_id_z:
6920 return getPreloadedValue(DAG, *MFI, VT,
6921 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
6922 case Intrinsic::amdgcn_lds_kernel_id: {
6923 if (MFI->isEntryFunction())
6924 return getLDSKernelId(DAG, DL);
6925 return getPreloadedValue(DAG, *MFI, VT,
6926 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
6927 }
6928 case Intrinsic::amdgcn_workitem_id_x:
6929 return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
6930 case Intrinsic::amdgcn_workitem_id_y:
6931 return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
6932 case Intrinsic::amdgcn_workitem_id_z:
6933 return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
6934 case Intrinsic::amdgcn_wavefrontsize:
6935 return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
6936 SDLoc(Op), MVT::i32);
6937 case Intrinsic::amdgcn_s_buffer_load: {
6938 unsigned CPol = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
6939 if (CPol & ~AMDGPU::CPol::ALL)
6940 return Op;
6941 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
6942 DAG);
6943 }
6944 case Intrinsic::amdgcn_fdiv_fast:
6945 return lowerFDIV_FAST(Op, DAG);
6946 case Intrinsic::amdgcn_sin:
6947 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
6948
6949 case Intrinsic::amdgcn_cos:
6950 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
6951
6952 case Intrinsic::amdgcn_mul_u24:
6953 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1), Op.getOperand(2));
6954 case Intrinsic::amdgcn_mul_i24:
6955 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1), Op.getOperand(2));
6956
6957 case Intrinsic::amdgcn_log_clamp: {
6958 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
6959 return SDValue();
6960
6961 return emitRemovedIntrinsicError(DAG, DL, VT);
6962 }
6963 case Intrinsic::amdgcn_ldexp:
6964 return DAG.getNode(AMDGPUISD::LDEXP, DL, VT,
6965 Op.getOperand(1), Op.getOperand(2));
6966
6967 case Intrinsic::amdgcn_fract:
6968 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
6969
6970 case Intrinsic::amdgcn_class:
6971 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
6972 Op.getOperand(1), Op.getOperand(2));
6973 case Intrinsic::amdgcn_div_fmas:
6974 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
6975 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
6976 Op.getOperand(4));
6977
6978 case Intrinsic::amdgcn_div_fixup:
6979 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
6980 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
6981
6982 case Intrinsic::amdgcn_div_scale: {
6983 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
6984
6985 // Translate to the operands expected by the machine instruction. The
6986 // first parameter must be the same as the first instruction.
6987 SDValue Numerator = Op.getOperand(1);
6988 SDValue Denominator = Op.getOperand(2);
6989
6990 // Note this order is opposite of the machine instruction's operations,
6991 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
6992 // intrinsic has the numerator as the first operand to match a normal
6993 // division operation.
6994
6995 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
6996
6997 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
6998 Denominator, Numerator);
6999 }
7000 case Intrinsic::amdgcn_icmp: {
7001 // There is a Pat that handles this variant, so return it as-is.
7002 if (Op.getOperand(1).getValueType() == MVT::i1 &&
7003 Op.getConstantOperandVal(2) == 0 &&
7004 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
7005 return Op;
7006 return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
7007 }
7008 case Intrinsic::amdgcn_fcmp: {
7009 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
7010 }
7011 case Intrinsic::amdgcn_ballot:
7012 return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
7013 case Intrinsic::amdgcn_fmed3:
7014 return DAG.getNode(AMDGPUISD::FMED3, DL, VT,
7015 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
7016 case Intrinsic::amdgcn_fdot2:
7017 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT,
7018 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
7019 Op.getOperand(4));
7020 case Intrinsic::amdgcn_fmul_legacy:
7021 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
7022 Op.getOperand(1), Op.getOperand(2));
7023 case Intrinsic::amdgcn_sffbh:
7024 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
7025 case Intrinsic::amdgcn_sbfe:
7026 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
7027 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
7028 case Intrinsic::amdgcn_ubfe:
7029 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
7030 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
7031 case Intrinsic::amdgcn_cvt_pkrtz:
7032 case Intrinsic::amdgcn_cvt_pknorm_i16:
7033 case Intrinsic::amdgcn_cvt_pknorm_u16:
7034 case Intrinsic::amdgcn_cvt_pk_i16:
7035 case Intrinsic::amdgcn_cvt_pk_u16: {
7036 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
7037 EVT VT = Op.getValueType();
7038 unsigned Opcode;
7039
7040 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
7041 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
7042 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
7043 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
7044 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
7045 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
7046 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
7047 Opcode = AMDGPUISD::CVT_PK_I16_I32;
7048 else
7049 Opcode = AMDGPUISD::CVT_PK_U16_U32;
7050
7051 if (isTypeLegal(VT))
7052 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
7053
7054 SDValue Node = DAG.getNode(Opcode, DL, MVT::i32,
7055 Op.getOperand(1), Op.getOperand(2));
7056 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
7057 }
7058 case Intrinsic::amdgcn_fmad_ftz:
7059 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
7060 Op.getOperand(2), Op.getOperand(3));
7061
7062 case Intrinsic::amdgcn_if_break:
7063 return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
7064 Op->getOperand(1), Op->getOperand(2)), 0);
7065
7066 case Intrinsic::amdgcn_groupstaticsize: {
7067 Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
7068 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
7069 return Op;
7070
7071 const Module *M = MF.getFunction().getParent();
7072 const GlobalValue *GV =
7073 M->getNamedValue(Intrinsic::getName(Intrinsic::amdgcn_groupstaticsize));
7074 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
7075 SIInstrInfo::MO_ABS32_LO);
7076 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
7077 }
7078 case Intrinsic::amdgcn_is_shared:
7079 case Intrinsic::amdgcn_is_private: {
7080 SDLoc SL(Op);
7081 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared) ?
7082 AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
7083 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
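    // A flat pointer points into LDS (is_shared) or scratch (is_private)
    // exactly when the high 32 bits of the 64-bit address match the
    // corresponding segment aperture base, so compare the high dword.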
7084 SDValue SrcVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32,
7085 Op.getOperand(1));
7086
7087 SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
7088 DAG.getConstant(1, SL, MVT::i32));
7089 return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
7090 }
7091 case Intrinsic::amdgcn_perm:
7092 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
7093 Op.getOperand(2), Op.getOperand(3));
7094 case Intrinsic::amdgcn_reloc_constant: {
7095 Module *M = const_cast<Module *>(MF.getFunction().getParent());
7096 const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
7097 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
7098 auto RelocSymbol = cast<GlobalVariable>(
7099 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
7100 SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
7101 SIInstrInfo::MO_ABS32_LO);
7102 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
7103 }
7104 default:
7105 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
7106 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
7107 return lowerImage(Op, ImageDimIntr, DAG, false);
7108
7109 return Op;
7110 }
7111 }
7112
7113 /// Update \p MMO based on the offset inputs to an intrinsic.
7114 static void updateBufferMMO(MachineMemOperand *MMO, SDValue VOffset,
7115 SDValue SOffset, SDValue Offset,
7116 SDValue VIndex = SDValue()) {
7117 if (!isa<ConstantSDNode>(VOffset) || !isa<ConstantSDNode>(SOffset) ||
7118 !isa<ConstantSDNode>(Offset)) {
7119 // The combined offset is not known to be constant, so we cannot represent
7120 // it in the MMO. Give up.
7121 MMO->setValue((Value *)nullptr);
7122 return;
7123 }
7124
7125 if (VIndex && (!isa<ConstantSDNode>(VIndex) ||
7126 !cast<ConstantSDNode>(VIndex)->isZero())) {
7127 // The strided index component of the address is not known to be zero, so we
7128 // cannot represent it in the MMO. Give up.
7129 MMO->setValue((Value *)nullptr);
7130 return;
7131 }
7132
7133 MMO->setOffset(cast<ConstantSDNode>(VOffset)->getSExtValue() +
7134 cast<ConstantSDNode>(SOffset)->getSExtValue() +
7135 cast<ConstantSDNode>(Offset)->getSExtValue());
7136 }
7137
7138 SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
7139 SelectionDAG &DAG,
7140 unsigned NewOpcode) const {
7141 SDLoc DL(Op);
7142
7143 SDValue VData = Op.getOperand(2);
7144 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
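  // splitBufferOffsets splits the single offset operand into a variable
  // voffset part and an immediate offset part that fits in the instruction
  // encoding (Offsets.first / Offsets.second below).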
7145 SDValue Ops[] = {
7146 Op.getOperand(0), // Chain
7147 VData, // vdata
7148 Op.getOperand(3), // rsrc
7149 DAG.getConstant(0, DL, MVT::i32), // vindex
7150 Offsets.first, // voffset
7151 Op.getOperand(5), // soffset
7152 Offsets.second, // offset
7153 Op.getOperand(6), // cachepolicy
7154 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
7155 };
7156
7157 auto *M = cast<MemSDNode>(Op);
7158 updateBufferMMO(M->getMemOperand(), Ops[4], Ops[5], Ops[6]);
7159
7160 EVT MemVT = VData.getValueType();
7161 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
7162 M->getMemOperand());
7163 }
7164
7165 // Return a value to use for the idxen operand by examining the vindex operand.
7166 static unsigned getIdxEn(SDValue VIndex) {
7167 if (auto VIndexC = dyn_cast<ConstantSDNode>(VIndex))
7168 // No need to set idxen if vindex is known to be zero.
7169 return VIndexC->getZExtValue() != 0;
7170 return 1;
7171 }
7172
7173 SDValue
7174 SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
7175 unsigned NewOpcode) const {
7176 SDLoc DL(Op);
7177
7178 SDValue VData = Op.getOperand(2);
7179 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
7180 SDValue Ops[] = {
7181 Op.getOperand(0), // Chain
7182 VData, // vdata
7183 Op.getOperand(3), // rsrc
7184 Op.getOperand(4), // vindex
7185 Offsets.first, // voffset
7186 Op.getOperand(6), // soffset
7187 Offsets.second, // offset
7188 Op.getOperand(7), // cachepolicy
7189 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
7190 };
7191
7192 auto *M = cast<MemSDNode>(Op);
7193 updateBufferMMO(M->getMemOperand(), Ops[4], Ops[5], Ops[6], Ops[3]);
7194
7195 EVT MemVT = VData.getValueType();
7196 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
7197 M->getMemOperand());
7198 }
7199
7200 SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
7201 SelectionDAG &DAG) const {
7202 unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
7203 SDLoc DL(Op);
7204
7205 switch (IntrID) {
7206 case Intrinsic::amdgcn_ds_ordered_add:
7207 case Intrinsic::amdgcn_ds_ordered_swap: {
7208 MemSDNode *M = cast<MemSDNode>(Op);
7209 SDValue Chain = M->getOperand(0);
7210 SDValue M0 = M->getOperand(2);
7211 SDValue Value = M->getOperand(3);
7212 unsigned IndexOperand = M->getConstantOperandVal(7);
7213 unsigned WaveRelease = M->getConstantOperandVal(8);
7214 unsigned WaveDone = M->getConstantOperandVal(9);
7215
7216 unsigned OrderedCountIndex = IndexOperand & 0x3f;
7217 IndexOperand &= ~0x3f;
7218 unsigned CountDw = 0;
7219
7220 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
7221 CountDw = (IndexOperand >> 24) & 0xf;
7222 IndexOperand &= ~(0xf << 24);
7223
7224 if (CountDw < 1 || CountDw > 4) {
7225 report_fatal_error(
7226 "ds_ordered_count: dword count must be between 1 and 4");
7227 }
7228 }
7229
7230 if (IndexOperand)
7231 report_fatal_error("ds_ordered_count: bad index operand");
7232
7233 if (WaveDone && !WaveRelease)
7234 report_fatal_error("ds_ordered_count: wave_done requires wave_release");
7235
7236 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
7237 unsigned ShaderType =
7238 SIInstrInfo::getDSShaderTypeValue(DAG.getMachineFunction());
7239 unsigned Offset0 = OrderedCountIndex << 2;
7240 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
7241
7242 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
7243 Offset1 |= (CountDw - 1) << 6;
7244
7245 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
7246 Offset1 |= ShaderType << 2;
7247
7248 unsigned Offset = Offset0 | (Offset1 << 8);
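    // The 16-bit offset field packs the operands: Offset0 holds the
    // ordered-count index in bits [7:2]; Offset1 (shifted into the high byte)
    // holds wave_release in bit 0, wave_done in bit 1, the instruction kind in
    // bit 4, plus the shader type (pre-GFX11) and dword count - 1 (GFX10+).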
7249
7250 SDValue Ops[] = {
7251 Chain,
7252 Value,
7253 DAG.getTargetConstant(Offset, DL, MVT::i16),
7254 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
7255 };
7256 return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
7257 M->getVTList(), Ops, M->getMemoryVT(),
7258 M->getMemOperand());
7259 }
7260 case Intrinsic::amdgcn_ds_fadd: {
7261 MemSDNode *M = cast<MemSDNode>(Op);
7262 unsigned Opc;
7263 switch (IntrID) {
7264 case Intrinsic::amdgcn_ds_fadd:
7265 Opc = ISD::ATOMIC_LOAD_FADD;
7266 break;
7267 }
7268
7269 return DAG.getAtomic(Opc, SDLoc(Op), M->getMemoryVT(),
7270 M->getOperand(0), M->getOperand(2), M->getOperand(3),
7271 M->getMemOperand());
7272 }
7273 case Intrinsic::amdgcn_atomic_inc:
7274 case Intrinsic::amdgcn_atomic_dec:
7275 case Intrinsic::amdgcn_ds_fmin:
7276 case Intrinsic::amdgcn_ds_fmax: {
7277 MemSDNode *M = cast<MemSDNode>(Op);
7278 unsigned Opc;
7279 switch (IntrID) {
7280 case Intrinsic::amdgcn_atomic_inc:
7281 Opc = AMDGPUISD::ATOMIC_INC;
7282 break;
7283 case Intrinsic::amdgcn_atomic_dec:
7284 Opc = AMDGPUISD::ATOMIC_DEC;
7285 break;
7286 case Intrinsic::amdgcn_ds_fmin:
7287 Opc = AMDGPUISD::ATOMIC_LOAD_FMIN;
7288 break;
7289 case Intrinsic::amdgcn_ds_fmax:
7290 Opc = AMDGPUISD::ATOMIC_LOAD_FMAX;
7291 break;
7292 default:
7293 llvm_unreachable("Unknown intrinsic!");
7294 }
7295 SDValue Ops[] = {
7296 M->getOperand(0), // Chain
7297 M->getOperand(2), // Ptr
7298 M->getOperand(3) // Value
7299 };
7300
7301 return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops,
7302 M->getMemoryVT(), M->getMemOperand());
7303 }
7304 case Intrinsic::amdgcn_buffer_load:
7305 case Intrinsic::amdgcn_buffer_load_format: {
7306 unsigned Glc = cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue();
7307 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
7308 unsigned IdxEn = getIdxEn(Op.getOperand(3));
7309 SDValue Ops[] = {
7310 Op.getOperand(0), // Chain
7311 Op.getOperand(2), // rsrc
7312 Op.getOperand(3), // vindex
7313 SDValue(), // voffset -- will be set by setBufferOffsets
7314 SDValue(), // soffset -- will be set by setBufferOffsets
7315 SDValue(), // offset -- will be set by setBufferOffsets
7316 DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
7317 DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
7318 };
7319 setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]);
7320
7321 unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
7322 AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
7323
7324 EVT VT = Op.getValueType();
7325 EVT IntVT = VT.changeTypeToInteger();
7326 auto *M = cast<MemSDNode>(Op);
7327 updateBufferMMO(M->getMemOperand(), Ops[3], Ops[4], Ops[5], Ops[2]);
7328 EVT LoadVT = Op.getValueType();
7329
7330 if (LoadVT.getScalarType() == MVT::f16)
7331 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
7332 M, DAG, Ops);
7333
7334 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
7335 if (LoadVT.getScalarType() == MVT::i8 ||
7336 LoadVT.getScalarType() == MVT::i16)
7337 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);
7338
7339 return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
7340 M->getMemOperand(), DAG);
7341 }
7342 case Intrinsic::amdgcn_raw_buffer_load:
7343 case Intrinsic::amdgcn_raw_buffer_load_format: {
7344 const bool IsFormat = IntrID == Intrinsic::amdgcn_raw_buffer_load_format;
7345
7346 auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
7347 SDValue Ops[] = {
7348 Op.getOperand(0), // Chain
7349 Op.getOperand(2), // rsrc
7350 DAG.getConstant(0, DL, MVT::i32), // vindex
7351 Offsets.first, // voffset
7352 Op.getOperand(4), // soffset
7353 Offsets.second, // offset
7354 Op.getOperand(5), // cachepolicy, swizzled buffer
7355 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
7356 };
7357
7358 auto *M = cast<MemSDNode>(Op);
7359 updateBufferMMO(M->getMemOperand(), Ops[3], Ops[4], Ops[5]);
7360 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
7361 }
7362 case Intrinsic::amdgcn_struct_buffer_load:
7363 case Intrinsic::amdgcn_struct_buffer_load_format: {
7364 const bool IsFormat = IntrID == Intrinsic::amdgcn_struct_buffer_load_format;
7365
7366 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
7367 SDValue Ops[] = {
7368 Op.getOperand(0), // Chain
7369 Op.getOperand(2), // rsrc
7370 Op.getOperand(3), // vindex
7371 Offsets.first, // voffset
7372 Op.getOperand(5), // soffset
7373 Offsets.second, // offset
7374 Op.getOperand(6), // cachepolicy, swizzled buffer
7375 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
7376 };
7377
7378 auto *M = cast<MemSDNode>(Op);
7379 updateBufferMMO(M->getMemOperand(), Ops[3], Ops[4], Ops[5], Ops[2]);
7380 return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
7381 }
7382 case Intrinsic::amdgcn_tbuffer_load: {
7383 MemSDNode *M = cast<MemSDNode>(Op);
7384 EVT LoadVT = Op.getValueType();
7385
7386 unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
7387 unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
7388 unsigned Glc = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
7389 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue();
7390 unsigned IdxEn = getIdxEn(Op.getOperand(3));
7391 SDValue Ops[] = {
7392 Op.getOperand(0), // Chain
7393 Op.getOperand(2), // rsrc
7394 Op.getOperand(3), // vindex
7395 Op.getOperand(4), // voffset
7396 Op.getOperand(5), // soffset
7397 Op.getOperand(6), // offset
7398 DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
7399 DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
7400 DAG.getTargetConstant(IdxEn, DL, MVT::i1) // idxen
7401 };
7402
7403 if (LoadVT.getScalarType() == MVT::f16)
7404 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
7405 M, DAG, Ops);
7406 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
7407 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
7408 DAG);
7409 }
7410 case Intrinsic::amdgcn_raw_tbuffer_load: {
7411 MemSDNode *M = cast<MemSDNode>(Op);
7412 EVT LoadVT = Op.getValueType();
7413 auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
7414
7415 SDValue Ops[] = {
7416 Op.getOperand(0), // Chain
7417 Op.getOperand(2), // rsrc
7418 DAG.getConstant(0, DL, MVT::i32), // vindex
7419 Offsets.first, // voffset
7420 Op.getOperand(4), // soffset
7421 Offsets.second, // offset
7422 Op.getOperand(5), // format
7423 Op.getOperand(6), // cachepolicy, swizzled buffer
7424 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
7425 };
7426
7427 if (LoadVT.getScalarType() == MVT::f16)
7428 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
7429 M, DAG, Ops);
7430 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
7431 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
7432 DAG);
7433 }
7434 case Intrinsic::amdgcn_struct_tbuffer_load: {
7435 MemSDNode *M = cast<MemSDNode>(Op);
7436 EVT LoadVT = Op.getValueType();
7437 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
7438
7439 SDValue Ops[] = {
7440 Op.getOperand(0), // Chain
7441 Op.getOperand(2), // rsrc
7442 Op.getOperand(3), // vindex
7443 Offsets.first, // voffset
7444 Op.getOperand(5), // soffset
7445 Offsets.second, // offset
7446 Op.getOperand(6), // format
7447 Op.getOperand(7), // cachepolicy, swizzled buffer
7448 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
7449 };
7450
7451 if (LoadVT.getScalarType() == MVT::f16)
7452 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
7453 M, DAG, Ops);
7454 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
7455 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
7456 DAG);
7457 }
7458 case Intrinsic::amdgcn_buffer_atomic_swap:
7459 case Intrinsic::amdgcn_buffer_atomic_add:
7460 case Intrinsic::amdgcn_buffer_atomic_sub:
7461 case Intrinsic::amdgcn_buffer_atomic_csub:
7462 case Intrinsic::amdgcn_buffer_atomic_smin:
7463 case Intrinsic::amdgcn_buffer_atomic_umin:
7464 case Intrinsic::amdgcn_buffer_atomic_smax:
7465 case Intrinsic::amdgcn_buffer_atomic_umax:
7466 case Intrinsic::amdgcn_buffer_atomic_and:
7467 case Intrinsic::amdgcn_buffer_atomic_or:
7468 case Intrinsic::amdgcn_buffer_atomic_xor:
7469 case Intrinsic::amdgcn_buffer_atomic_fadd: {
7470 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
7471 unsigned IdxEn = getIdxEn(Op.getOperand(4));
7472 SDValue Ops[] = {
7473 Op.getOperand(0), // Chain
7474 Op.getOperand(2), // vdata
7475 Op.getOperand(3), // rsrc
7476 Op.getOperand(4), // vindex
7477 SDValue(), // voffset -- will be set by setBufferOffsets
7478 SDValue(), // soffset -- will be set by setBufferOffsets
7479 SDValue(), // offset -- will be set by setBufferOffsets
7480 DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy
7481 DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
7482 };
7483 setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
7484
7485 EVT VT = Op.getValueType();
7486
7487 auto *M = cast<MemSDNode>(Op);
7488 updateBufferMMO(M->getMemOperand(), Ops[4], Ops[5], Ops[6], Ops[3]);
7489 unsigned Opcode = 0;
7490
7491 switch (IntrID) {
7492 case Intrinsic::amdgcn_buffer_atomic_swap:
7493 Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
7494 break;
7495 case Intrinsic::amdgcn_buffer_atomic_add:
7496 Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
7497 break;
7498 case Intrinsic::amdgcn_buffer_atomic_sub:
7499 Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
7500 break;
7501 case Intrinsic::amdgcn_buffer_atomic_csub:
7502 Opcode = AMDGPUISD::BUFFER_ATOMIC_CSUB;
7503 break;
7504 case Intrinsic::amdgcn_buffer_atomic_smin:
7505 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
7506 break;
7507 case Intrinsic::amdgcn_buffer_atomic_umin:
7508 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
7509 break;
7510 case Intrinsic::amdgcn_buffer_atomic_smax:
7511 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
7512 break;
7513 case Intrinsic::amdgcn_buffer_atomic_umax:
7514 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
7515 break;
7516 case Intrinsic::amdgcn_buffer_atomic_and:
7517 Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
7518 break;
7519 case Intrinsic::amdgcn_buffer_atomic_or:
7520 Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
7521 break;
7522 case Intrinsic::amdgcn_buffer_atomic_xor:
7523 Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
7524 break;
7525 case Intrinsic::amdgcn_buffer_atomic_fadd:
7526 if (!Op.getValue(0).use_empty() && !hasAtomicFaddRtnForTy(Op)) {
7527 DiagnosticInfoUnsupported
7528 NoFpRet(DAG.getMachineFunction().getFunction(),
7529 "return versions of fp atomics not supported",
7530 DL.getDebugLoc(), DS_Error);
7531 DAG.getContext()->diagnose(NoFpRet);
7532 return SDValue();
7533 }
7534 Opcode = AMDGPUISD::BUFFER_ATOMIC_FADD;
7535 break;
7536 default:
7537 llvm_unreachable("unhandled atomic opcode");
7538 }
7539
7540 return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
7541 M->getMemOperand());
7542 }
7543 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
7544 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
7545 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
7546 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
7547 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
7548 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
7549 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
7550 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
7551 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
7552 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
7553 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
7554 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
7555 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
7556 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
7557 case Intrinsic::amdgcn_raw_buffer_atomic_add:
7558 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
7559 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
7560 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
7561 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
7562 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
7563 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
7564 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
7565 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
7566 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
7567 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
7568 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
7569 case Intrinsic::amdgcn_raw_buffer_atomic_and:
7570 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
7571 case Intrinsic::amdgcn_raw_buffer_atomic_or:
7572 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
7573 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
7574 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
7575 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
7576 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
7577 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
7578 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
7579 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
7580 return lowerStructBufferAtomicIntrin(Op, DAG,
7581 AMDGPUISD::BUFFER_ATOMIC_SWAP);
7582 case Intrinsic::amdgcn_struct_buffer_atomic_add:
7583 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
7584 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
7585 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
7586 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
7587 return lowerStructBufferAtomicIntrin(Op, DAG,
7588 AMDGPUISD::BUFFER_ATOMIC_SMIN);
7589 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
7590 return lowerStructBufferAtomicIntrin(Op, DAG,
7591 AMDGPUISD::BUFFER_ATOMIC_UMIN);
7592 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
7593 return lowerStructBufferAtomicIntrin(Op, DAG,
7594 AMDGPUISD::BUFFER_ATOMIC_SMAX);
7595 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
7596 return lowerStructBufferAtomicIntrin(Op, DAG,
7597 AMDGPUISD::BUFFER_ATOMIC_UMAX);
7598 case Intrinsic::amdgcn_struct_buffer_atomic_and:
7599 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
7600 case Intrinsic::amdgcn_struct_buffer_atomic_or:
7601 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
7602 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
7603 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
7604 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
7605 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
7606 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
7607 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
7608
7609 case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
7610 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
7611 unsigned IdxEn = getIdxEn(Op.getOperand(5));
7612 SDValue Ops[] = {
7613 Op.getOperand(0), // Chain
7614 Op.getOperand(2), // src
7615 Op.getOperand(3), // cmp
7616 Op.getOperand(4), // rsrc
7617 Op.getOperand(5), // vindex
7618 SDValue(), // voffset -- will be set by setBufferOffsets
7619 SDValue(), // soffset -- will be set by setBufferOffsets
7620 SDValue(), // offset -- will be set by setBufferOffsets
7621 DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy
7622 DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
7623 };
7624 setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]);
7625
7626 EVT VT = Op.getValueType();
7627 auto *M = cast<MemSDNode>(Op);
7628 updateBufferMMO(M->getMemOperand(), Ops[5], Ops[6], Ops[7], Ops[4]);
7629
7630 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
7631 Op->getVTList(), Ops, VT, M->getMemOperand());
7632 }
7633 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: {
7634 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
7635 SDValue Ops[] = {
7636 Op.getOperand(0), // Chain
7637 Op.getOperand(2), // src
7638 Op.getOperand(3), // cmp
7639 Op.getOperand(4), // rsrc
7640 DAG.getConstant(0, DL, MVT::i32), // vindex
7641 Offsets.first, // voffset
7642 Op.getOperand(6), // soffset
7643 Offsets.second, // offset
7644 Op.getOperand(7), // cachepolicy
7645 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
7646 };
7647 EVT VT = Op.getValueType();
7648 auto *M = cast<MemSDNode>(Op);
7649 updateBufferMMO(M->getMemOperand(), Ops[5], Ops[6], Ops[7]);
7650
7651 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
7652 Op->getVTList(), Ops, VT, M->getMemOperand());
7653 }
7654 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: {
7655 auto Offsets = splitBufferOffsets(Op.getOperand(6), DAG);
7656 SDValue Ops[] = {
7657 Op.getOperand(0), // Chain
7658 Op.getOperand(2), // src
7659 Op.getOperand(3), // cmp
7660 Op.getOperand(4), // rsrc
7661 Op.getOperand(5), // vindex
7662 Offsets.first, // voffset
7663 Op.getOperand(7), // soffset
7664 Offsets.second, // offset
7665 Op.getOperand(8), // cachepolicy
7666 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
7667 };
7668 EVT VT = Op.getValueType();
7669 auto *M = cast<MemSDNode>(Op);
7670 updateBufferMMO(M->getMemOperand(), Ops[5], Ops[6], Ops[7], Ops[4]);
7671
7672 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
7673 Op->getVTList(), Ops, VT, M->getMemOperand());
7674 }
7675 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
7676 MemSDNode *M = cast<MemSDNode>(Op);
7677 SDValue NodePtr = M->getOperand(2);
7678 SDValue RayExtent = M->getOperand(3);
7679 SDValue RayOrigin = M->getOperand(4);
7680 SDValue RayDir = M->getOperand(5);
7681 SDValue RayInvDir = M->getOperand(6);
7682 SDValue TDescr = M->getOperand(7);
7683
7684 assert(NodePtr.getValueType() == MVT::i32 ||
7685 NodePtr.getValueType() == MVT::i64);
7686 assert(RayDir.getValueType() == MVT::v3f16 ||
7687 RayDir.getValueType() == MVT::v3f32);
7688
7689 if (!Subtarget->hasGFX10_AEncoding()) {
7690 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
7691 return SDValue();
7692 }
7693
7694 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
7695 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
7696 const bool Is64 = NodePtr.getValueType() == MVT::i64;
7697 const unsigned NumVDataDwords = 4;
7698 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
7699 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
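// Address dwords: node_ptr (1 or 2) + ray_extent (1) + ray_origin (3) +
// ray_dir and ray_inv_dir (3 + 3 as f32, or 3 dwords of packed v2f16 when
// a16), i.e. 11/12 dwords for f32 rays and 8/9 for a16 rays.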
7700 const bool UseNSA =
7701 Subtarget->hasNSAEncoding() && NumVAddrs <= Subtarget->getNSAMaxSize();
7702 const unsigned BaseOpcodes[2][2] = {
7703 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
7704 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
7705 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
7706 int Opcode;
7707 if (UseNSA) {
7708 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
7709 IsGFX11Plus ? AMDGPU::MIMGEncGfx11NSA
7710 : AMDGPU::MIMGEncGfx10NSA,
7711 NumVDataDwords, NumVAddrDwords);
7712 } else {
7713 Opcode =
7714 AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
7715 IsGFX11Plus ? AMDGPU::MIMGEncGfx11Default
7716 : AMDGPU::MIMGEncGfx10Default,
7717 NumVDataDwords, PowerOf2Ceil(NumVAddrDwords));
7718 }
7719 assert(Opcode != -1);
7720
7721 SmallVector<SDValue, 16> Ops;
7722
7723 auto packLanes = [&DAG, &Ops, &DL] (SDValue Op, bool IsAligned) {
7724 SmallVector<SDValue, 3> Lanes;
7725 DAG.ExtractVectorElements(Op, Lanes, 0, 3);
7726 if (Lanes[0].getValueSizeInBits() == 32) {
7727 for (unsigned I = 0; I < 3; ++I)
7728 Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
7729 } else {
7730 if (IsAligned) {
7731 Ops.push_back(
7732 DAG.getBitcast(MVT::i32,
7733 DAG.getBuildVector(MVT::v2f16, DL,
7734 { Lanes[0], Lanes[1] })));
7735 Ops.push_back(Lanes[2]);
7736 } else {
7737 SDValue Elt0 = Ops.pop_back_val();
7738 Ops.push_back(
7739 DAG.getBitcast(MVT::i32,
7740 DAG.getBuildVector(MVT::v2f16, DL,
7741 { Elt0, Lanes[0] })));
7742 Ops.push_back(
7743 DAG.getBitcast(MVT::i32,
7744 DAG.getBuildVector(MVT::v2f16, DL,
7745 { Lanes[1], Lanes[2] })));
7746 }
7747 }
7748 };
7749
7750 if (UseNSA && IsGFX11Plus) {
7751 Ops.push_back(NodePtr);
7752 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
7753 Ops.push_back(RayOrigin);
7754 if (IsA16) {
7755 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
7756 DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3);
7757 DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3);
7758 for (unsigned I = 0; I < 3; ++I) {
7759 MergedLanes.push_back(DAG.getBitcast(
7760 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
7761 {DirLanes[I], InvDirLanes[I]})));
7762 }
7763 Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
7764 } else {
7765 Ops.push_back(RayDir);
7766 Ops.push_back(RayInvDir);
7767 }
7768 } else {
7769 if (Is64)
7770 DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
7771 2);
7772 else
7773 Ops.push_back(NodePtr);
7774
7775 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
7776 packLanes(RayOrigin, true);
7777 packLanes(RayDir, true);
7778 packLanes(RayInvDir, false);
7779 }
7780
7781 if (!UseNSA) {
7782 // Build a single vector containing all the operands so far prepared.
7783 if (NumVAddrDwords > 8) {
7784 SDValue Undef = DAG.getUNDEF(MVT::i32);
7785 Ops.append(16 - Ops.size(), Undef);
7786 }
7787 assert(Ops.size() == 8 || Ops.size() == 16);
7788 SDValue MergedOps = DAG.getBuildVector(
7789 Ops.size() == 16 ? MVT::v16i32 : MVT::v8i32, DL, Ops);
7790 Ops.clear();
7791 Ops.push_back(MergedOps);
7792 }
7793
7794 Ops.push_back(TDescr);
7795 if (IsA16)
7796 Ops.push_back(DAG.getTargetConstant(1, DL, MVT::i1));
7797 Ops.push_back(M->getChain());
7798
7799 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
7800 MachineMemOperand *MemRef = M->getMemOperand();
7801 DAG.setNodeMemRefs(NewNode, {MemRef});
7802 return SDValue(NewNode, 0);
7803 }
7804 case Intrinsic::amdgcn_global_atomic_fadd:
7805 if (!Op.getValue(0).use_empty() && !Subtarget->hasGFX90AInsts()) {
7806 DiagnosticInfoUnsupported
7807 NoFpRet(DAG.getMachineFunction().getFunction(),
7808 "return versions of fp atomics not supported",
7809 DL.getDebugLoc(), DS_Error);
7810 DAG.getContext()->diagnose(NoFpRet);
7811 return SDValue();
7812 }
7813 LLVM_FALLTHROUGH;
7814 case Intrinsic::amdgcn_global_atomic_fmin:
7815 case Intrinsic::amdgcn_global_atomic_fmax:
7816 case Intrinsic::amdgcn_flat_atomic_fadd:
7817 case Intrinsic::amdgcn_flat_atomic_fmin:
7818 case Intrinsic::amdgcn_flat_atomic_fmax: {
7819 MemSDNode *M = cast<MemSDNode>(Op);
7820 SDValue Ops[] = {
7821 M->getOperand(0), // Chain
7822 M->getOperand(2), // Ptr
7823 M->getOperand(3) // Value
7824 };
7825 unsigned Opcode = 0;
7826 switch (IntrID) {
7827 case Intrinsic::amdgcn_global_atomic_fadd:
7828 case Intrinsic::amdgcn_flat_atomic_fadd: {
7829 EVT VT = Op.getOperand(3).getValueType();
7830 return DAG.getAtomic(ISD::ATOMIC_LOAD_FADD, DL, VT,
7831 DAG.getVTList(VT, MVT::Other), Ops,
7832 M->getMemOperand());
7833 }
7834 case Intrinsic::amdgcn_global_atomic_fmin:
7835 case Intrinsic::amdgcn_flat_atomic_fmin: {
7836 Opcode = AMDGPUISD::ATOMIC_LOAD_FMIN;
7837 break;
7838 }
7839 case Intrinsic::amdgcn_global_atomic_fmax:
7840 case Intrinsic::amdgcn_flat_atomic_fmax: {
7841 Opcode = AMDGPUISD::ATOMIC_LOAD_FMAX;
7842 break;
7843 }
7844 default:
7845 llvm_unreachable("unhandled atomic opcode");
7846 }
7847 return DAG.getMemIntrinsicNode(Opcode, SDLoc(Op),
7848 M->getVTList(), Ops, M->getMemoryVT(),
7849 M->getMemOperand());
7850 }
7851 default:
7852
7853 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
7854 AMDGPU::getImageDimIntrinsicInfo(IntrID))
7855 return lowerImage(Op, ImageDimIntr, DAG, true);
7856
7857 return SDValue();
7858 }
7859 }
7860
7861 // Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
7862 // dwordx4 if on SI.
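// For example, on such subtargets a v3f32 buffer/tbuffer load is emitted as a
// v4f32 load and the extra lane is dropped again via EXTRACT_SUBVECTOR below.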
7863 SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
7864 SDVTList VTList,
7865 ArrayRef<SDValue> Ops, EVT MemVT,
7866 MachineMemOperand *MMO,
7867 SelectionDAG &DAG) const {
7868 EVT VT = VTList.VTs[0];
7869 EVT WidenedVT = VT;
7870 EVT WidenedMemVT = MemVT;
7871 if (!Subtarget->hasDwordx3LoadStores() &&
7872 (WidenedVT == MVT::v3i32 || WidenedVT == MVT::v3f32)) {
7873 WidenedVT = EVT::getVectorVT(*DAG.getContext(),
7874 WidenedVT.getVectorElementType(), 4);
7875 WidenedMemVT = EVT::getVectorVT(*DAG.getContext(),
7876 WidenedMemVT.getVectorElementType(), 4);
7877 MMO = DAG.getMachineFunction().getMachineMemOperand(MMO, 0, 16);
7878 }
7879
7880 assert(VTList.NumVTs == 2);
7881 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
7882
7883 auto NewOp = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
7884 WidenedMemVT, MMO);
7885 if (WidenedVT != VT) {
7886 auto Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, NewOp,
7887 DAG.getVectorIdxConstant(0, DL));
7888 NewOp = DAG.getMergeValues({ Extract, SDValue(NewOp.getNode(), 1) }, DL);
7889 }
7890 return NewOp;
7891 }
7892
7893 SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
7894 bool ImageStore) const {
7895 EVT StoreVT = VData.getValueType();
7896
7897 // No change for f16 and legal vector D16 types.
7898 if (!StoreVT.isVector())
7899 return VData;
7900
7901 SDLoc DL(VData);
7902 unsigned NumElements = StoreVT.getVectorNumElements();
7903
7904 if (Subtarget->hasUnpackedD16VMem()) {
7905 // We need to unpack the packed data to store.
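// e.g. a v4f16 value is bitcast to v4i16 and zero-extended to v4i32, so each
// 16-bit element ends up in its own dword as the unpacked layout expects.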
7906 EVT IntStoreVT = StoreVT.changeTypeToInteger();
7907 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
7908
7909 EVT EquivStoreVT =
7910 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
7911 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
7912 return DAG.UnrollVectorOp(ZExt.getNode());
7913 }
7914
7915 // The sq block of gfx8.1 does not estimate register use correctly for d16
7916 // image store instructions. The data operand is computed as if it were not a
7917 // d16 image instruction.
7918 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
7919 // Bitcast to i16
7920 EVT IntStoreVT = StoreVT.changeTypeToInteger();
7921 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
7922
7923 // Decompose into scalars
7924 SmallVector<SDValue, 4> Elts;
7925 DAG.ExtractVectorElements(IntVData, Elts);
7926
7927 // Group pairs of i16 into v2i16 and bitcast to i32
7928 SmallVector<SDValue, 4> PackedElts;
7929 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
7930 SDValue Pair =
7931 DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
7932 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
7933 PackedElts.push_back(IntPair);
7934 }
7935 if ((NumElements % 2) == 1) {
7936 // Handle v3i16
7937 unsigned I = Elts.size() / 2;
7938 SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
7939 {Elts[I * 2], DAG.getUNDEF(MVT::i16)});
7940 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
7941 PackedElts.push_back(IntPair);
7942 }
7943
7944 // Pad using UNDEF
7945 PackedElts.resize(Elts.size(), DAG.getUNDEF(MVT::i32));
7946
7947 // Build final vector
7948 EVT VecVT =
7949 EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
7950 return DAG.getBuildVector(VecVT, DL, PackedElts);
7951 }
7952
7953 if (NumElements == 3) {
7954 EVT IntStoreVT =
7955 EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits());
7956 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
7957
7958 EVT WidenedStoreVT = EVT::getVectorVT(
7959 *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
7960 EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
7961 WidenedStoreVT.getStoreSizeInBits());
7962 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
7963 return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
7964 }
7965
7966 assert(isTypeLegal(StoreVT));
7967 return VData;
7968 }
7969
7970 SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
7971 SelectionDAG &DAG) const {
7972 SDLoc DL(Op);
7973 SDValue Chain = Op.getOperand(0);
7974 unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
7975 MachineFunction &MF = DAG.getMachineFunction();
7976
7977 switch (IntrinsicID) {
7978 case Intrinsic::amdgcn_exp_compr: {
7979 if (!Subtarget->hasCompressedExport()) {
7980 DiagnosticInfoUnsupported BadIntrin(
7981 DAG.getMachineFunction().getFunction(),
7982 "intrinsic not supported on subtarget", DL.getDebugLoc());
7983 DAG.getContext()->diagnose(BadIntrin);
7984 }
7985 SDValue Src0 = Op.getOperand(4);
7986 SDValue Src1 = Op.getOperand(5);
7987 // Hack around illegal type on SI by directly selecting it.
7988 if (isTypeLegal(Src0.getValueType()))
7989 return SDValue();
7990
7991 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
7992 SDValue Undef = DAG.getUNDEF(MVT::f32);
7993 const SDValue Ops[] = {
7994 Op.getOperand(2), // tgt
7995 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
7996 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
7997 Undef, // src2
7998 Undef, // src3
7999 Op.getOperand(7), // vm
8000 DAG.getTargetConstant(1, DL, MVT::i1), // compr
8001 Op.getOperand(3), // en
8002 Op.getOperand(0) // Chain
8003 };
8004
8005 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
8006 return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
8007 }
8008 case Intrinsic::amdgcn_s_barrier: {
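// A barrier is presumably redundant for a workgroup that fits in a single
// wave, so at -O1 and above it can be relaxed to WAVE_BARRIER, which only
// constrains scheduling.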
8009 if (getTargetMachine().getOptLevel() > CodeGenOpt::None) {
8010 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
8011 unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
8012 if (WGSize <= ST.getWavefrontSize())
8013 return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
8014 Op.getOperand(0)), 0);
8015 }
8016 return SDValue();
8017 };
8018 case Intrinsic::amdgcn_tbuffer_store: {
8019 SDValue VData = Op.getOperand(2);
8020 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
8021 if (IsD16)
8022 VData = handleD16VData(VData, DAG);
8023 unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
8024 unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
8025 unsigned Glc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue();
8026 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(11))->getZExtValue();
8027 unsigned IdxEn = getIdxEn(Op.getOperand(4));
8028 SDValue Ops[] = {
8029 Chain,
8030 VData, // vdata
8031 Op.getOperand(3), // rsrc
8032 Op.getOperand(4), // vindex
8033 Op.getOperand(5), // voffset
8034 Op.getOperand(6), // soffset
8035 Op.getOperand(7), // offset
8036 DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
8037 DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
8038 DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
8039 };
8040 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
8041 AMDGPUISD::TBUFFER_STORE_FORMAT;
8042 MemSDNode *M = cast<MemSDNode>(Op);
8043 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
8044 M->getMemoryVT(), M->getMemOperand());
8045 }
8046
8047 case Intrinsic::amdgcn_struct_tbuffer_store: {
8048 SDValue VData = Op.getOperand(2);
8049 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
8050 if (IsD16)
8051 VData = handleD16VData(VData, DAG);
8052 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
8053 SDValue Ops[] = {
8054 Chain,
8055 VData, // vdata
8056 Op.getOperand(3), // rsrc
8057 Op.getOperand(4), // vindex
8058 Offsets.first, // voffset
8059 Op.getOperand(6), // soffset
8060 Offsets.second, // offset
8061 Op.getOperand(7), // format
8062 Op.getOperand(8), // cachepolicy, swizzled buffer
8063 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
8064 };
8065 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
8066 AMDGPUISD::TBUFFER_STORE_FORMAT;
8067 MemSDNode *M = cast<MemSDNode>(Op);
8068 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
8069 M->getMemoryVT(), M->getMemOperand());
8070 }
8071
8072 case Intrinsic::amdgcn_raw_tbuffer_store: {
8073 SDValue VData = Op.getOperand(2);
8074 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
8075 if (IsD16)
8076 VData = handleD16VData(VData, DAG);
8077 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
8078 SDValue Ops[] = {
8079 Chain,
8080 VData, // vdata
8081 Op.getOperand(3), // rsrc
8082 DAG.getConstant(0, DL, MVT::i32), // vindex
8083 Offsets.first, // voffset
8084 Op.getOperand(5), // soffset
8085 Offsets.second, // offset
8086 Op.getOperand(6), // format
8087 Op.getOperand(7), // cachepolicy, swizzled buffer
8088 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8089 };
8090 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
8091 AMDGPUISD::TBUFFER_STORE_FORMAT;
8092 MemSDNode *M = cast<MemSDNode>(Op);
8093 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
8094 M->getMemoryVT(), M->getMemOperand());
8095 }
8096
8097 case Intrinsic::amdgcn_buffer_store:
8098 case Intrinsic::amdgcn_buffer_store_format: {
8099 SDValue VData = Op.getOperand(2);
8100 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
8101 if (IsD16)
8102 VData = handleD16VData(VData, DAG);
8103 unsigned Glc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue();
8104 unsigned Slc = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
8105 unsigned IdxEn = getIdxEn(Op.getOperand(4));
8106 SDValue Ops[] = {
8107 Chain,
8108 VData,
8109 Op.getOperand(3), // rsrc
8110 Op.getOperand(4), // vindex
8111 SDValue(), // voffset -- will be set by setBufferOffsets
8112 SDValue(), // soffset -- will be set by setBufferOffsets
8113 SDValue(), // offset -- will be set by setBufferOffsets
8114 DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
8115 DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
8116 };
8117 setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
8118
8119 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
8120 AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
8121 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
8122 MemSDNode *M = cast<MemSDNode>(Op);
8123 updateBufferMMO(M->getMemOperand(), Ops[4], Ops[5], Ops[6], Ops[3]);
8124
8125 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
8126 EVT VDataType = VData.getValueType().getScalarType();
8127 if (VDataType == MVT::i8 || VDataType == MVT::i16)
8128 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
8129
8130 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
8131 M->getMemoryVT(), M->getMemOperand());
8132 }
8133
8134 case Intrinsic::amdgcn_raw_buffer_store:
8135 case Intrinsic::amdgcn_raw_buffer_store_format: {
8136 const bool IsFormat =
8137 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format;
8138
8139 SDValue VData = Op.getOperand(2);
8140 EVT VDataVT = VData.getValueType();
8141 EVT EltType = VDataVT.getScalarType();
8142 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
8143 if (IsD16) {
8144 VData = handleD16VData(VData, DAG);
8145 VDataVT = VData.getValueType();
8146 }
8147
8148 if (!isTypeLegal(VDataVT)) {
8149 VData =
8150 DAG.getNode(ISD::BITCAST, DL,
8151 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
8152 }
8153
8154 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
8155 SDValue Ops[] = {
8156 Chain,
8157 VData,
8158 Op.getOperand(3), // rsrc
8159 DAG.getConstant(0, DL, MVT::i32), // vindex
8160 Offsets.first, // voffset
8161 Op.getOperand(5), // soffset
8162 Offsets.second, // offset
8163 Op.getOperand(6), // cachepolicy, swizzled buffer
8164 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8165 };
8166 unsigned Opc =
8167 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
8168 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
8169 MemSDNode *M = cast<MemSDNode>(Op);
8170 updateBufferMMO(M->getMemOperand(), Ops[4], Ops[5], Ops[6]);
8171
8172 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
8173 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
8174 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
8175
8176 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
8177 M->getMemoryVT(), M->getMemOperand());
8178 }
8179
8180 case Intrinsic::amdgcn_struct_buffer_store:
8181 case Intrinsic::amdgcn_struct_buffer_store_format: {
8182 const bool IsFormat =
8183 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format;
8184
8185 SDValue VData = Op.getOperand(2);
8186 EVT VDataVT = VData.getValueType();
8187 EVT EltType = VDataVT.getScalarType();
8188 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
8189
8190 if (IsD16) {
8191 VData = handleD16VData(VData, DAG);
8192 VDataVT = VData.getValueType();
8193 }
8194
8195 if (!isTypeLegal(VDataVT)) {
8196 VData =
8197 DAG.getNode(ISD::BITCAST, DL,
8198 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
8199 }
8200
8201 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
8202 SDValue Ops[] = {
8203 Chain,
8204 VData,
8205 Op.getOperand(3), // rsrc
8206 Op.getOperand(4), // vindex
8207 Offsets.first, // voffset
8208 Op.getOperand(6), // soffset
8209 Offsets.second, // offset
8210 Op.getOperand(7), // cachepolicy, swizzled buffer
8211 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
8212 };
8213 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_struct_buffer_store ?
8214 AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
8215 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
8216 MemSDNode *M = cast<MemSDNode>(Op);
8217 updateBufferMMO(M->getMemOperand(), Ops[4], Ops[5], Ops[6], Ops[3]);
8218
8219 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
8220 EVT VDataType = VData.getValueType().getScalarType();
8221 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
8222 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
8223
8224 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
8225 M->getMemoryVT(), M->getMemOperand());
8226 }
8227 case Intrinsic::amdgcn_raw_buffer_load_lds:
8228 case Intrinsic::amdgcn_struct_buffer_load_lds: {
8229 unsigned Opc;
8230 bool HasVIndex = IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds;
8231 unsigned OpOffset = HasVIndex ? 1 : 0;
8232 SDValue VOffset = Op.getOperand(5 + OpOffset);
8233 auto CVOffset = dyn_cast<ConstantSDNode>(VOffset);
8234 bool HasVOffset = !CVOffset || !CVOffset->isZero();
8235 unsigned Size = Op->getConstantOperandVal(4);
8236
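// Pick the LDS-DMA MUBUF opcode from the element size (1/2/4 bytes) and from
// whether a vindex and/or a non-zero voffset operand is present
// (BOTHEN/IDXEN/OFFEN/OFFSET addressing).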
8237 switch (Size) {
8238 default:
8239 return SDValue();
8240 case 1:
8241 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
8242 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
8243 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
8244 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
8245 break;
8246 case 2:
8247 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
8248 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
8249 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
8250 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
8251 break;
8252 case 4:
8253 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
8254 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
8255 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
8256 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
8257 break;
8258 }
8259
8260 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
8261
8262 SmallVector<SDValue, 8> Ops;
8263
8264 if (HasVIndex && HasVOffset)
8265 Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
8266 { Op.getOperand(5), // VIndex
8267 VOffset }));
8268 else if (HasVIndex)
8269 Ops.push_back(Op.getOperand(5));
8270 else if (HasVOffset)
8271 Ops.push_back(VOffset);
8272
8273 Ops.push_back(Op.getOperand(2)); // rsrc
8274 Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
8275 Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
8276 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
8277 Ops.push_back(
8278 DAG.getTargetConstant(Aux & AMDGPU::CPol::ALL, DL, MVT::i8)); // cpol
8279 Ops.push_back(
8280 DAG.getTargetConstant((Aux >> 3) & 1, DL, MVT::i8)); // swz
8281 Ops.push_back(M0Val.getValue(0)); // Chain
8282 Ops.push_back(M0Val.getValue(1)); // Glue
8283
8284 auto *M = cast<MemSDNode>(Op);
8285 MachineMemOperand *LoadMMO = M->getMemOperand();
8286 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
8287 LoadPtrI.Offset = Op->getConstantOperandVal(7 + OpOffset);
8288 MachinePointerInfo StorePtrI = LoadPtrI;
8289 StorePtrI.V = nullptr;
8290 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
8291
8292 auto F = LoadMMO->getFlags() &
8293 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
8294 LoadMMO = MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
8295 Size, LoadMMO->getBaseAlign());
8296
8297 MachineMemOperand *StoreMMO =
8298 MF.getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
8299 sizeof(int32_t), LoadMMO->getBaseAlign());
8300
8301 auto Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
8302 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
8303
8304 return SDValue(Load, 0);
8305 }
8306 case Intrinsic::amdgcn_global_load_lds: {
8307 unsigned Opc;
8308 unsigned Size = Op->getConstantOperandVal(4);
8309 switch (Size) {
8310 default:
8311 return SDValue();
8312 case 1:
8313 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
8314 break;
8315 case 2:
8316 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
8317 break;
8318 case 4:
8319 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
8320 break;
8321 }
8322
8323 auto *M = cast<MemSDNode>(Op);
8324 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
8325
8326 SmallVector<SDValue, 6> Ops;
8327
8328 SDValue Addr = Op.getOperand(2); // Global ptr
8329 SDValue VOffset;
8330 // Try to split SAddr and VOffset. Global and LDS pointers share the same
8331 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
8332 if (Addr->isDivergent() && Addr.getOpcode() == ISD::ADD) {
8333 SDValue LHS = Addr.getOperand(0);
8334 SDValue RHS = Addr.getOperand(1);
8335
8336 if (LHS->isDivergent())
8337 std::swap(LHS, RHS);
8338
8339 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
8340 RHS.getOperand(0).getValueType() == MVT::i32) {
8341 // add (i64 sgpr), (zero_extend (i32 vgpr))
8342 Addr = LHS;
8343 VOffset = RHS.getOperand(0);
8344 }
8345 }
8346
8347 Ops.push_back(Addr);
8348 if (!Addr->isDivergent()) {
8349 Opc = AMDGPU::getGlobalSaddrOp(Opc);
8350 if (!VOffset)
8351 VOffset = SDValue(
8352 DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
8353 DAG.getTargetConstant(0, DL, MVT::i32)), 0);
8354 Ops.push_back(VOffset);
8355 }
8356
8357 Ops.push_back(Op.getOperand(5)); // Offset
8358 Ops.push_back(Op.getOperand(6)); // CPol
8359 Ops.push_back(M0Val.getValue(0)); // Chain
8360 Ops.push_back(M0Val.getValue(1)); // Glue
8361
8362 MachineMemOperand *LoadMMO = M->getMemOperand();
8363 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
8364 LoadPtrI.Offset = Op->getConstantOperandVal(5);
8365 MachinePointerInfo StorePtrI = LoadPtrI;
8366 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
8367 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
8368 auto F = LoadMMO->getFlags() &
8369 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
8370 LoadMMO = MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
8371 Size, LoadMMO->getBaseAlign());
8372 MachineMemOperand *StoreMMO =
8373 MF.getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
8374 sizeof(int32_t), Align(4));
8375
8376 auto Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
8377 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
8378
8379 return SDValue(Load, 0);
8380 }
8381 case Intrinsic::amdgcn_end_cf:
8382 return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
8383 Op->getOperand(2), Chain), 0);
8384
8385 default: {
8386 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
8387 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
8388 return lowerImage(Op, ImageDimIntr, DAG, true);
8389
8390 return Op;
8391 }
8392 }
8393 }
8394
8395 // The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
8396 // offset (the offset that is included in bounds checking and swizzling, to be
8397 // split between the instruction's voffset and immoffset fields) and soffset
8398 // (the offset that is excluded from bounds checking and swizzling, to go in
8399 // the instruction's soffset field). This function takes the first kind of
8400 // offset and figures out how to split it between voffset and immoffset.
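// For example, a combined offset of 4100 is returned as a voffset of 4096 (to
// be copied/added into a VGPR) plus an immoffset of 4.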
8401 std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
8402 SDValue Offset, SelectionDAG &DAG) const {
8403 SDLoc DL(Offset);
8404 const unsigned MaxImm = 4095;
8405 SDValue N0 = Offset;
8406 ConstantSDNode *C1 = nullptr;
8407
8408 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
8409 N0 = SDValue();
8410 else if (DAG.isBaseWithConstantOffset(N0)) {
8411 C1 = cast<ConstantSDNode>(N0.getOperand(1));
8412 N0 = N0.getOperand(0);
8413 }
8414
8415 if (C1) {
8416 unsigned ImmOffset = C1->getZExtValue();
8417 // If the immediate value is too big for the immoffset field, keep only its
8418 // low 12 bits in the immoffset field and fold the remainder (a multiple of
8419 // 4096) into the value that is copied/added for the voffset field, so that
8420 // it stands more chance of being CSEd with the copy/add for another similar
8421 // load/store. However, do not do that rounding down to a multiple of 4096
8422 // if that remainder is a negative number, as it appears to be illegal to have
8423 // a negative offset in the vgpr, even if adding the immediate offset makes it positive.
8424 unsigned Overflow = ImmOffset & ~MaxImm;
8425 ImmOffset -= Overflow;
8426 if ((int32_t)Overflow < 0) {
8427 Overflow += ImmOffset;
8428 ImmOffset = 0;
8429 }
8430 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
8431 if (Overflow) {
8432 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
8433 if (!N0)
8434 N0 = OverflowVal;
8435 else {
8436 SDValue Ops[] = { N0, OverflowVal };
8437 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
8438 }
8439 }
8440 }
8441 if (!N0)
8442 N0 = DAG.getConstant(0, DL, MVT::i32);
8443 if (!C1)
8444 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
8445 return {N0, SDValue(C1, 0)};
8446 }
8447
8448 // Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
8449 // three offsets (voffset, soffset and instoffset) into the SDValue[3] array
8450 // pointed to by Offsets.
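// A fully constant combined offset is split between soffset and instoffset
// when AMDGPU::splitMUBUFOffset can encode it; a base-plus-constant offset
// keeps the base as voffset and splits only the constant; anything else goes
// entirely into voffset.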
8451 void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
8452 SelectionDAG &DAG, SDValue *Offsets,
8453 Align Alignment) const {
8454 SDLoc DL(CombinedOffset);
8455 if (auto C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
8456 uint32_t Imm = C->getZExtValue();
8457 uint32_t SOffset, ImmOffset;
8458 if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget,
8459 Alignment)) {
8460 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
8461 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
8462 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
8463 return;
8464 }
8465 }
8466 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
8467 SDValue N0 = CombinedOffset.getOperand(0);
8468 SDValue N1 = CombinedOffset.getOperand(1);
8469 uint32_t SOffset, ImmOffset;
8470 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
8471 if (Offset >= 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
8472 Subtarget, Alignment)) {
8473 Offsets[0] = N0;
8474 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
8475 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
8476 return;
8477 }
8478 }
8479 Offsets[0] = CombinedOffset;
8480 Offsets[1] = DAG.getConstant(0, DL, MVT::i32);
8481 Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
8482 }
8483
8484 // Handle 8 bit and 16 bit buffer loads
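// These are emitted as BUFFER_LOAD_UBYTE/USHORT producing an i32 result,
// which is then truncated (and bitcast for f16) back to the requested type.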
8485 SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
8486 EVT LoadVT, SDLoc DL,
8487 ArrayRef<SDValue> Ops,
8488 MemSDNode *M) const {
8489 EVT IntVT = LoadVT.changeTypeToInteger();
8490 unsigned Opc = (LoadVT.getScalarType() == MVT::i8) ?
8491 AMDGPUISD::BUFFER_LOAD_UBYTE : AMDGPUISD::BUFFER_LOAD_USHORT;
8492
8493 SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
8494 SDValue BufferLoad = DAG.getMemIntrinsicNode(Opc, DL, ResList,
8495 Ops, IntVT,
8496 M->getMemOperand());
8497 SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
8498 LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
8499
8500 return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
8501 }
8502
8503 // Handle 8 bit and 16 bit buffer stores
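// The data operand is any-extended to i32 (after a bitcast for f16) and the
// store is emitted as BUFFER_STORE_BYTE or BUFFER_STORE_SHORT.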
8504 SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
8505 EVT VDataType, SDLoc DL,
8506 SDValue Ops[],
8507 MemSDNode *M) const {
8508 if (VDataType == MVT::f16)
8509 Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
8510
8511 SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
8512 Ops[1] = BufferStoreExt;
8513 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE :
8514 AMDGPUISD::BUFFER_STORE_SHORT;
8515 ArrayRef<SDValue> OpsRef = makeArrayRef(&Ops[0], 9);
8516 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
8517 M->getMemOperand());
8518 }
8519
8520 static SDValue getLoadExtOrTrunc(SelectionDAG &DAG,
8521 ISD::LoadExtType ExtType, SDValue Op,
8522 const SDLoc &SL, EVT VT) {
8523 if (VT.bitsLT(Op.getValueType()))
8524 return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
8525
8526 switch (ExtType) {
8527 case ISD::SEXTLOAD:
8528 return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
8529 case ISD::ZEXTLOAD:
8530 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
8531 case ISD::EXTLOAD:
8532 return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
8533 case ISD::NON_EXTLOAD:
8534 return Op;
8535 }
8536
8537 llvm_unreachable("invalid ext type");
8538 }
8539
8540 SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const {
8541 SelectionDAG &DAG = DCI.DAG;
8542 if (Ld->getAlign() < Align(4) || Ld->isDivergent())
8543 return SDValue();
8544
8545 // FIXME: Constant loads should all be marked invariant.
8546 unsigned AS = Ld->getAddressSpace();
8547 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
8548 AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
8549 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
8550 return SDValue();
8551
8552 // Don't do this early, since it may interfere with adjacent load merging for
8553 // illegal types. We can avoid losing alignment information for exotic types
8554 // pre-legalize.
8555 EVT MemVT = Ld->getMemoryVT();
8556 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
8557 MemVT.getSizeInBits() >= 32)
8558 return SDValue();
8559
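// At this point we have a uniform, sufficiently aligned sub-dword load from
// constant (or invariant global) memory; widen it to a full i32 load and
// re-extend/truncate the result below.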
8560 SDLoc SL(Ld);
8561
8562 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
8563 "unexpected vector extload");
8564
8565 // TODO: Drop only high part of range.
8566 SDValue Ptr = Ld->getBasePtr();
8567 SDValue NewLoad = DAG.getLoad(
8568 ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
8569 Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
8570 Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
8571 nullptr); // Drop ranges
8572
8573 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
8574 if (MemVT.isFloatingPoint()) {
8575 assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
8576 "unexpected fp extload");
8577 TruncVT = MemVT.changeTypeToInteger();
8578 }
8579
8580 SDValue Cvt = NewLoad;
8581 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
8582 Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
8583 DAG.getValueType(TruncVT));
8584 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
8585 Ld->getExtensionType() == ISD::NON_EXTLOAD) {
8586 Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
8587 } else {
8588 assert(Ld->getExtensionType() == ISD::EXTLOAD);
8589 }
8590
8591 EVT VT = Ld->getValueType(0);
8592 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
8593
8594 DCI.AddToWorklist(Cvt.getNode());
8595
8596 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
8597 // the appropriate extension from the 32-bit load.
8598 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
8599 DCI.AddToWorklist(Cvt.getNode());
8600
8601 // Handle conversion back to floating point if necessary.
8602 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
8603
8604 return DAG.getMergeValues({ Cvt, NewLoad.getValue(1) }, SL);
8605 }
8606
8607 SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
8608 SDLoc DL(Op);
8609 LoadSDNode *Load = cast<LoadSDNode>(Op);
8610 ISD::LoadExtType ExtType = Load->getExtensionType();
8611 EVT MemVT = Load->getMemoryVT();
8612
8613 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
8614 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
8615 return SDValue();
8616
8617 // FIXME: Copied from PPC
8618 // First, load into 32 bits, then truncate to 1 bit.
8619
8620 SDValue Chain = Load->getChain();
8621 SDValue BasePtr = Load->getBasePtr();
8622 MachineMemOperand *MMO = Load->getMemOperand();
8623
8624 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
8625
8626 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
8627 BasePtr, RealMemVT, MMO);
8628
8629 if (!MemVT.isVector()) {
8630 SDValue Ops[] = {
8631 DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
8632 NewLD.getValue(1)
8633 };
8634
8635 return DAG.getMergeValues(Ops, DL);
8636 }
8637
8638 SmallVector<SDValue, 3> Elts;
8639 for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
8640 SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
8641 DAG.getConstant(I, DL, MVT::i32));
8642
8643 Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
8644 }
8645
8646 SDValue Ops[] = {
8647 DAG.getBuildVector(MemVT, DL, Elts),
8648 NewLD.getValue(1)
8649 };
8650
8651 return DAG.getMergeValues(Ops, DL);
8652 }
8653
8654 if (!MemVT.isVector())
8655 return SDValue();
8656
8657 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
8658 "Custom lowering for non-i32 vectors hasn't been implemented.");
8659
8660 Align Alignment = Load->getAlign();
8661 unsigned AS = Load->getAddressSpace();
8662 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
8663 Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
8664 return SplitVectorLoad(Op, DAG);
8665 }
8666
8667 MachineFunction &MF = DAG.getMachineFunction();
8668 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
8669 // If there is a possibility that flat instruction access scratch memory
8670 // then we need to use the same legalization rules we use for private.
8671 if (AS == AMDGPUAS::FLAT_ADDRESS &&
8672 !Subtarget->hasMultiDwordFlatScratchAddressing())
8673 AS = MFI->hasFlatScratchInit() ?
8674 AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
8675
8676 unsigned NumElements = MemVT.getVectorNumElements();
8677
8678 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
8679 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
8680 if (!Op->isDivergent() && Alignment >= Align(4) && NumElements < 32) {
8681 if (MemVT.isPow2VectorType())
8682 return SDValue();
8683 return WidenOrSplitVectorLoad(Op, DAG);
8684 }
8685 // Non-uniform loads will be selected to MUBUF instructions, so they
8686 // have the same legalization requirements as global and private
8687 // loads.
8688 //
8689 }
8690
8691 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
8692 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
8693 AS == AMDGPUAS::GLOBAL_ADDRESS) {
8694 if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
8695 Load->isSimple() && isMemOpHasNoClobberedMemOperand(Load) &&
8696 Alignment >= Align(4) && NumElements < 32) {
8697 if (MemVT.isPow2VectorType())
8698 return SDValue();
8699 return WidenOrSplitVectorLoad(Op, DAG);
8700 }
8701 // Non-uniform loads will be selected to MUBUF instructions, so they
8702 // have the same legalization requirements as global and private
8703 // loads.
8704 //
8705 }
8706 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
8707 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
8708 AS == AMDGPUAS::GLOBAL_ADDRESS ||
8709 AS == AMDGPUAS::FLAT_ADDRESS) {
8710 if (NumElements > 4)
8711 return SplitVectorLoad(Op, DAG);
8712 // v3 loads not supported on SI.
8713 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
8714 return WidenOrSplitVectorLoad(Op, DAG);
8715
8716 // v3 and v4 loads are supported for private and global memory.
8717 return SDValue();
8718 }
8719 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
8720 // Depending on the setting of the private_element_size field in the
8721 // resource descriptor, we can only make private accesses up to a certain
8722 // size.
8723 switch (Subtarget->getMaxPrivateElementSize()) {
8724 case 4: {
8725 SDValue Ops[2];
8726 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
8727 return DAG.getMergeValues(Ops, DL);
8728 }
8729 case 8:
8730 if (NumElements > 2)
8731 return SplitVectorLoad(Op, DAG);
8732 return SDValue();
8733 case 16:
8734 // Same as global/flat
8735 if (NumElements > 4)
8736 return SplitVectorLoad(Op, DAG);
8737 // v3 loads not supported on SI.
8738 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
8739 return WidenOrSplitVectorLoad(Op, DAG);
8740
8741 return SDValue();
8742 default:
8743 llvm_unreachable("unsupported private_element_size");
8744 }
8745 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
8746 bool Fast = false;
8747 auto Flags = Load->getMemOperand()->getFlags();
8748 if (allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,
8749 Load->getAlign(), Flags, &Fast) &&
8750 Fast)
8751 return SDValue();
8752
8753 if (MemVT.isVector())
8754 return SplitVectorLoad(Op, DAG);
8755 }
8756
8757 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
8758 MemVT, *Load->getMemOperand())) {
8759 SDValue Ops[2];
8760 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
8761 return DAG.getMergeValues(Ops, DL);
8762 }
8763
8764 return SDValue();
8765 }
8766
8767 SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
8768 EVT VT = Op.getValueType();
8769 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256)
8770 return splitTernaryVectorOp(Op, DAG);
8771
8772 assert(VT.getSizeInBits() == 64);
8773
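// Sketch of the split performed below (for reference): the 64-bit select is
// bitcast to v2i32 halves and turned into two 32-bit selects, e.g.
//   (select c, i64:a, i64:b)
//     -> lo = select c, a.lo, b.lo;  hi = select c, a.hi, b.hi
//     -> bitcast (build_vector lo, hi) back to the original 64-bit type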
8774 SDLoc DL(Op);
8775 SDValue Cond = Op.getOperand(0);
8776
8777 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
8778 SDValue One = DAG.getConstant(1, DL, MVT::i32);
8779
8780 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
8781 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
8782
8783 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
8784 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
8785
8786 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
8787
8788 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
8789 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
8790
8791 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
8792
8793 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
8794 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
8795 }
8796
8797 // Catch division cases where we can use shortcuts with rcp and rsq
8798 // instructions.
8799 SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
8800 SelectionDAG &DAG) const {
8801 SDLoc SL(Op);
8802 SDValue LHS = Op.getOperand(0);
8803 SDValue RHS = Op.getOperand(1);
8804 EVT VT = Op.getValueType();
8805 const SDNodeFlags Flags = Op->getFlags();
8806
8807 bool AllowInaccurateRcp = Flags.hasApproximateFuncs();
8808
8809 // Without !fpmath accuracy information, we can't do more because we don't
8810 // know exactly whether rcp is accurate enough to meet the !fpmath requirement.
8811 if (!AllowInaccurateRcp)
8812 return SDValue();
8813
8814 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
8815 if (CLHS->isExactlyValue(1.0)) {
8816 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
8817 // the CI documentation they have a worst-case error of 1 ulp.
8818 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
8819 // use them as long as we aren't trying to use denormals.
8820 //
8821 // v_rcp_f16 and v_rsq_f16 DO support denormals.
8822
8823 // 1.0 / sqrt(x) -> rsq(x)
8824
8825 // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
8826 // error seems really high at 2^29 ULP.
8827 if (RHS.getOpcode() == ISD::FSQRT)
8828 return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0));
8829
8830 // 1.0 / x -> rcp(x)
8831 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
8832 }
8833
8834 // Same as for 1.0, but expand the sign out of the constant.
8835 if (CLHS->isExactlyValue(-1.0)) {
8836 // -1.0 / x -> rcp (fneg x)
8837 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
8838 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
8839 }
8840 }
8841
8842 // Turn into multiply by the reciprocal.
8843 // x / y -> x * (1.0 / y)
8844 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
8845 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
8846 }
8847
8848 SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
8849 SelectionDAG &DAG) const {
8850 SDLoc SL(Op);
8851 SDValue X = Op.getOperand(0);
8852 SDValue Y = Op.getOperand(1);
8853 EVT VT = Op.getValueType();
8854 const SDNodeFlags Flags = Op->getFlags();
8855
8856 bool AllowInaccurateDiv = Flags.hasApproximateFuncs() ||
8857 DAG.getTarget().Options.UnsafeFPMath;
8858 if (!AllowInaccurateDiv)
8859 return SDValue();
8860
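// The sequence below is a Newton-Raphson refinement of the hardware
// reciprocal (a sketch of the math, for reference):
//   r0 = rcp(y)
//   r1 = r0 + r0*(1 - y*r0)     ; == fma(fma(-y, r0, 1), r0, r0)
//   r2 = r1 + r1*(1 - y*r1)
//   q  = x * r2
//   q' = q + r2*(x - y*q)       ; == fma(fma(-y, q, x), r2, q)
// Each refinement step roughly doubles the number of correct bits.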
8861 SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
8862 SDValue One = DAG.getConstantFP(1.0, SL, VT);
8863
8864 SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
8865 SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
8866
8867 R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
8868 SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
8869 R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
8870 SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
8871 SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
8872 return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
8873 }
8874
8875 static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
8876 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
8877 SDNodeFlags Flags) {
8878 if (GlueChain->getNumValues() <= 1) {
8879 return DAG.getNode(Opcode, SL, VT, A, B, Flags);
8880 }
8881
8882 assert(GlueChain->getNumValues() == 3);
8883
8884 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
8885 switch (Opcode) {
8886 default: llvm_unreachable("no chain equivalent for opcode");
8887 case ISD::FMUL:
8888 Opcode = AMDGPUISD::FMUL_W_CHAIN;
8889 break;
8890 }
8891
8892 return DAG.getNode(Opcode, SL, VTList,
8893 {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
8894 Flags);
8895 }
8896
8897 static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
8898 EVT VT, SDValue A, SDValue B, SDValue C,
8899 SDValue GlueChain, SDNodeFlags Flags) {
8900 if (GlueChain->getNumValues() <= 1) {
8901 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
8902 }
8903
8904 assert(GlueChain->getNumValues() == 3);
8905
8906 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
8907 switch (Opcode) {
8908 default: llvm_unreachable("no chain equivalent for opcode");
8909 case ISD::FMA:
8910 Opcode = AMDGPUISD::FMA_W_CHAIN;
8911 break;
8912 }
8913
8914 return DAG.getNode(Opcode, SL, VTList,
8915 {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
8916 Flags);
8917 }
8918
8919 SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
8920 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
8921 return FastLowered;
8922
8923 SDLoc SL(Op);
8924 SDValue Src0 = Op.getOperand(0);
8925 SDValue Src1 = Op.getOperand(1);
8926
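// A sketch of the f16 path implemented below: do the division at f32
// precision using the hardware reciprocal, round back to f16, and let
// DIV_FIXUP patch up the special cases (infinities, NaNs, zeros):
//   q = fpround(fpext(a) * rcp(fpext(b)));  result = div_fixup(q, b, a)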
8927 SDValue CvtSrc0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
8928 SDValue CvtSrc1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
8929
8930 SDValue RcpSrc1 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, CvtSrc1);
8931 SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1);
8932
8933 SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32);
8934 SDValue BestQuot = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag);
8935
8936 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0);
8937 }
8938
8939 // Faster 2.5 ULP division that does not support denormals.
8940 SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
8941 SDLoc SL(Op);
8942 SDValue LHS = Op.getOperand(1);
8943 SDValue RHS = Op.getOperand(2);
8944
8945 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);
8946
8947 const APFloat K0Val(BitsToFloat(0x6f800000));
8948 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
8949
8950 const APFloat K1Val(BitsToFloat(0x2f800000));
8951 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
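// For reference: K0 = 0x6f800000 is 2^96 and K1 = 0x2f800000 is 2^-32 as f32
// bit patterns. The scaling trick sketched below keeps rcp from overflowing
// for very large denominators:
//   s = (|rhs| > 2^96) ? 2^-32 : 1.0
//   result = s * (lhs * rcp(rhs * s))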
8952
8953 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
8954
8955 EVT SetCCVT =
8956 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
8957
8958 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
8959
8960 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
8961
8962 // TODO: Should this propagate fast-math-flags?
8963 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
8964
8965 // rcp does not support denormals.
8966 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
8967
8968 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);
8969
8970 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
8971 }
8972
8973 // Returns immediate value for setting the F32 denorm mode when using the
8974 // S_DENORM_MODE instruction.
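// A sketch of the immediate layout assumed here: bits [1:0] hold the FP32
// denorm mode and bits [3:2] hold the shared FP64/FP16 mode, so the value is
// SPDenormMode | (DPDenormModeDefault << 2); e.g. enabling FP32 denormals
// while FP64/FP16 flush gives FP_DENORM_FLUSH_NONE | (FP_DENORM_FLUSH_IN_FLUSH_OUT << 2).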
8975 static SDValue getSPDenormModeValue(int SPDenormMode, SelectionDAG &DAG,
8976 const SDLoc &SL, const GCNSubtarget *ST) {
8977 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
8978 int DPDenormModeDefault = hasFP64FP16Denormals(DAG.getMachineFunction())
8979 ? FP_DENORM_FLUSH_NONE
8980 : FP_DENORM_FLUSH_IN_FLUSH_OUT;
8981
8982 int Mode = SPDenormMode | (DPDenormModeDefault << 2);
8983 return DAG.getTargetConstant(Mode, SL, MVT::i32);
8984 }
8985
8986 SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
8987 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
8988 return FastLowered;
8989
8990 // The selection matcher assumes that anything with a chain will select to a
8991 // mayRaiseFPException machine instruction. Since we're introducing a chain
8992 // here, we need to explicitly report nofpexcept for the regular fdiv
8993 // lowering.
8994 SDNodeFlags Flags = Op->getFlags();
8995 Flags.setNoFPExcept(true);
8996
8997 SDLoc SL(Op);
8998 SDValue LHS = Op.getOperand(0);
8999 SDValue RHS = Op.getOperand(1);
9000
9001 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
9002
9003 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
9004
9005 SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
9006 {RHS, RHS, LHS}, Flags);
9007 SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
9008 {LHS, RHS, LHS}, Flags);
9009
9010 // Denominator is scaled to not be denormal, so using rcp is ok.
9011 SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32,
9012 DenominatorScaled, Flags);
9013 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32,
9014 DenominatorScaled, Flags);
9015
9016 const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE |
9017 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
9018 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
9019 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
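// For reference (a sketch of the encoding built above): this is
// hwreg(HW_REG_MODE, 4, 2), i.e. the 2-bit FP32 denormal-control field at bit
// offset 4 of the MODE register, since OFFSET = 4 and WIDTH_M1 = 1.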
9020
9021 const bool HasFP32Denormals = hasFP32Denormals(DAG.getMachineFunction());
9022
9023 if (!HasFP32Denormals) {
9024 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
9025 // lowering. The chain dependence is insufficient, and we need glue. We do
9026 // not need the glue variants in a strictfp function.
9027
9028 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
9029
9030 SDNode *EnableDenorm;
9031 if (Subtarget->hasDenormModeInst()) {
9032 const SDValue EnableDenormValue =
9033 getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, SL, Subtarget);
9034
9035 EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
9036 DAG.getEntryNode(), EnableDenormValue).getNode();
9037 } else {
9038 const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
9039 SL, MVT::i32);
9040 EnableDenorm =
9041 DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
9042 {EnableDenormValue, BitField, DAG.getEntryNode()});
9043 }
9044
9045 SDValue Ops[3] = {
9046 NegDivScale0,
9047 SDValue(EnableDenorm, 0),
9048 SDValue(EnableDenorm, 1)
9049 };
9050
9051 NegDivScale0 = DAG.getMergeValues(Ops, SL);
9052 }
9053
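// The chain of FMAs below is a Newton-Raphson style refinement (sketched
// here for reference), operating on the div_scale'd operands:
//   e    = fma(-d, r, 1)       ; Fma0: error of the initial rcp
//   r'   = fma(e, r, r)        ; Fma1: refined reciprocal
//   q    = n * r'              ; Mul:  initial quotient
//   err  = fma(-d, q, n)       ; Fma2: quotient error
//   q'   = fma(err, r', q)     ; Fma3: refined quotient
//   err' = fma(-d, q', n)      ; Fma4: residual fed into div_fmas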
9054 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
9055 ApproxRcp, One, NegDivScale0, Flags);
9056
9057 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
9058 ApproxRcp, Fma0, Flags);
9059
9060 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled,
9061 Fma1, Fma1, Flags);
9062
9063 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
9064 NumeratorScaled, Mul, Flags);
9065
9066 SDValue Fma3 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32,
9067 Fma2, Fma1, Mul, Fma2, Flags);
9068
9069 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
9070 NumeratorScaled, Fma3, Flags);
9071
9072 if (!HasFP32Denormals) {
9073 SDNode *DisableDenorm;
9074 if (Subtarget->hasDenormModeInst()) {
9075 const SDValue DisableDenormValue =
9076 getSPDenormModeValue(FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, SL, Subtarget);
9077
9078 DisableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, MVT::Other,
9079 Fma4.getValue(1), DisableDenormValue,
9080 Fma4.getValue(2)).getNode();
9081 } else {
9082 const SDValue DisableDenormValue =
9083 DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
9084
9085 DisableDenorm = DAG.getMachineNode(
9086 AMDGPU::S_SETREG_B32, SL, MVT::Other,
9087 {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
9088 }
9089
9090 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
9091 SDValue(DisableDenorm, 0), DAG.getRoot());
9092 DAG.setRoot(OutputChain);
9093 }
9094
9095 SDValue Scale = NumeratorScaled.getValue(1);
9096 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
9097 {Fma4, Fma1, Fma3, Scale}, Flags);
9098
9099 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
9100 }
9101
9102 SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
9103 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
9104 return FastLowered;
9105
9106 SDLoc SL(Op);
9107 SDValue X = Op.getOperand(0);
9108 SDValue Y = Op.getOperand(1);
9109
9110 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
9111
9112 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
9113
9114 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
9115
9116 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
9117
9118 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
9119
9120 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
9121
9122 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
9123
9124 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
9125
9126 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
9127
9128 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
9129 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
9130
9131 SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64,
9132 NegDivScale0, Mul, DivScale1);
9133
9134 SDValue Scale;
9135
9136 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
9137 // Work around a hardware bug on SI where the condition output from div_scale
9138 // is not usable.
9139
9140 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
9141
9142 // Figure out which scale to use for div_fmas.
9143 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
9144 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
9145 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
9146 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
9147
9148 SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
9149 SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
9150
9151 SDValue Scale0Hi
9152 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
9153 SDValue Scale1Hi
9154 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
9155
9156 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
9157 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
9158 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
9159 } else {
9160 Scale = DivScale1.getValue(1);
9161 }
9162
9163 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64,
9164 Fma4, Fma3, Mul, Scale);
9165
9166 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
9167 }
9168
9169 SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
9170 EVT VT = Op.getValueType();
9171
9172 if (VT == MVT::f32)
9173 return LowerFDIV32(Op, DAG);
9174
9175 if (VT == MVT::f64)
9176 return LowerFDIV64(Op, DAG);
9177
9178 if (VT == MVT::f16)
9179 return LowerFDIV16(Op, DAG);
9180
9181 llvm_unreachable("Unexpected type for fdiv");
9182 }
9183
9184 SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
9185 SDLoc DL(Op);
9186 StoreSDNode *Store = cast<StoreSDNode>(Op);
9187 EVT VT = Store->getMemoryVT();
9188
9189 if (VT == MVT::i1) {
9190 return DAG.getTruncStore(Store->getChain(), DL,
9191 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
9192 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
9193 }
9194
9195 assert(VT.isVector() &&
9196 Store->getValue().getValueType().getScalarType() == MVT::i32);
9197
9198 unsigned AS = Store->getAddressSpace();
9199 if (Subtarget->hasLDSMisalignedBug() &&
9200 AS == AMDGPUAS::FLAT_ADDRESS &&
9201 Store->getAlign().value() < VT.getStoreSize() && VT.getSizeInBits() > 32) {
9202 return SplitVectorStore(Op, DAG);
9203 }
9204
9205 MachineFunction &MF = DAG.getMachineFunction();
9206 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
9207 // If there is a possibility that flat instructions access scratch memory,
9208 // then we need to use the same legalization rules we use for private.
9209 if (AS == AMDGPUAS::FLAT_ADDRESS &&
9210 !Subtarget->hasMultiDwordFlatScratchAddressing())
9211 AS = MFI->hasFlatScratchInit() ?
9212 AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
9213
9214 unsigned NumElements = VT.getVectorNumElements();
9215 if (AS == AMDGPUAS::GLOBAL_ADDRESS ||
9216 AS == AMDGPUAS::FLAT_ADDRESS) {
9217 if (NumElements > 4)
9218 return SplitVectorStore(Op, DAG);
9219 // v3 stores not supported on SI.
9220 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
9221 return SplitVectorStore(Op, DAG);
9222
9223 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
9224 VT, *Store->getMemOperand()))
9225 return expandUnalignedStore(Store, DAG);
9226
9227 return SDValue();
9228 } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
9229 switch (Subtarget->getMaxPrivateElementSize()) {
9230 case 4:
9231 return scalarizeVectorStore(Store, DAG);
9232 case 8:
9233 if (NumElements > 2)
9234 return SplitVectorStore(Op, DAG);
9235 return SDValue();
9236 case 16:
9237 if (NumElements > 4 ||
9238 (NumElements == 3 && !Subtarget->enableFlatScratch()))
9239 return SplitVectorStore(Op, DAG);
9240 return SDValue();
9241 default:
9242 llvm_unreachable("unsupported private_element_size");
9243 }
9244 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
9245 bool Fast = false;
9246 auto Flags = Store->getMemOperand()->getFlags();
9247 if (allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS,
9248 Store->getAlign(), Flags, &Fast) &&
9249 Fast)
9250 return SDValue();
9251
9252 if (VT.isVector())
9253 return SplitVectorStore(Op, DAG);
9254
9255 return expandUnalignedStore(Store, DAG);
9256 }
9257
9258 // Probably an invalid store. If so we'll end up emitting a selection error.
9259 return SDValue();
9260 }
9261
9262 SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
9263 SDLoc DL(Op);
9264 EVT VT = Op.getValueType();
9265 SDValue Arg = Op.getOperand(0);
9266 SDValue TrigVal;
9267
9268 // Propagate fast-math flags so that the multiply we introduce can be folded
9269 // if Arg is already the result of a multiply by constant.
9270 auto Flags = Op->getFlags();
9271
9272 SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
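// A sketch of the lowering: the hardware SIN/COS take an argument scaled by
// 1/(2*pi) (i.e. in revolutions rather than radians), so we emit
//   sin(x) -> SIN_HW(x * 1/(2*pi))
// and on subtargets with a reduced valid input range the scaled value is
// additionally wrapped into [0, 1) with FRACT first.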
9273
9274 if (Subtarget->hasTrigReducedRange()) {
9275 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
9276 TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags);
9277 } else {
9278 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
9279 }
9280
9281 switch (Op.getOpcode()) {
9282 case ISD::FCOS:
9283 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
9284 case ISD::FSIN:
9285 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
9286 default:
9287 llvm_unreachable("Wrong trig opcode");
9288 }
9289 }
9290
9291 SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
9292 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
9293 assert(AtomicNode->isCompareAndSwap());
9294 unsigned AS = AtomicNode->getAddressSpace();
9295
9296 // No custom lowering required for local address space
9297 if (!AMDGPU::isFlatGlobalAddrSpace(AS))
9298 return Op;
9299
9300 // Non-local address spaces require custom lowering for atomic compare and
9301 // swap; the cmp and swap values are packed into a v2i32 (or v2i64 for _X2).
9302 SDLoc DL(Op);
9303 SDValue ChainIn = Op.getOperand(0);
9304 SDValue Addr = Op.getOperand(1);
9305 SDValue Old = Op.getOperand(2);
9306 SDValue New = Op.getOperand(3);
9307 EVT VT = Op.getValueType();
9308 MVT SimpleVT = VT.getSimpleVT();
9309 MVT VecType = MVT::getVectorVT(SimpleVT, 2);
9310
9311 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
9312 SDValue Ops[] = { ChainIn, Addr, NewOld };
9313
9314 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, Op->getVTList(),
9315 Ops, VT, AtomicNode->getMemOperand());
9316 }
9317
9318 //===----------------------------------------------------------------------===//
9319 // Custom DAG optimizations
9320 //===----------------------------------------------------------------------===//
9321
9322 SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
9323 DAGCombinerInfo &DCI) const {
9324 EVT VT = N->getValueType(0);
9325 EVT ScalarVT = VT.getScalarType();
9326 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
9327 return SDValue();
9328
9329 SelectionDAG &DAG = DCI.DAG;
9330 SDLoc DL(N);
9331
9332 SDValue Src = N->getOperand(0);
9333 EVT SrcVT = Src.getValueType();
9334
9335 // TODO: We could try to match extracting the higher bytes, which would be
9336 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
9337 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
9338 // about in practice.
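// Illustrative example of the combine below (a sketch): if only the low byte
// of the i32 source can be non-zero, e.g.
//   (f32 (uint_to_fp (and i32:x, 0xff)))
// then it becomes (CVT_F32_UBYTE0 x), and for f16 results the f32 value is
// folded back down with an fp_round.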
9339 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
9340 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
9341 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
9342 DCI.AddToWorklist(Cvt.getNode());
9343
9344 // For the f16 case, fold to a cast to f32 and then cast back to f16.
9345 if (ScalarVT != MVT::f32) {
9346 Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
9347 DAG.getTargetConstant(0, DL, MVT::i32));
9348 }
9349 return Cvt;
9350 }
9351 }
9352
9353 return SDValue();
9354 }
9355
9356 // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
9357
9358 // This is a variant of
9359 // (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
9360 //
9361 // The normal DAG combiner will do this, but only if the add has one use, since
9362 // otherwise it would increase the number of instructions.
9363 //
9364 // This prevents us from seeing a constant offset that can be folded into a
9365 // memory instruction's addressing mode. If we know the resulting add offset of
9366 // a pointer can be folded into an addressing offset, we can replace the pointer
9367 // operand with the add of the new constant offset. This eliminates one of the
9368 // uses, and may allow the remaining use to also be simplified.
9369 //
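// A small worked example (illustrative only):
//   (shl (add x, 4), 2)  ->  (add (shl x, 2), 16)
// keeping x as the base pointer, so the constant 16 can later be folded into
// the instruction's immediate offset field if the addressing mode allows it.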
9370 SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
9371 unsigned AddrSpace,
9372 EVT MemVT,
9373 DAGCombinerInfo &DCI) const {
9374 SDValue N0 = N->getOperand(0);
9375 SDValue N1 = N->getOperand(1);
9376
9377 // We only do this to handle cases where it's profitable when there are
9378 // multiple uses of the add, so defer to the standard combine.
9379 if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
9380 N0->hasOneUse())
9381 return SDValue();
9382
9383 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
9384 if (!CN1)
9385 return SDValue();
9386
9387 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
9388 if (!CAdd)
9389 return SDValue();
9390
9391 // If the resulting offset is too large, we can't fold it into the addressing
9392 // mode offset.
9393 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
9394 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
9395
9396 AddrMode AM;
9397 AM.HasBaseReg = true;
9398 AM.BaseOffs = Offset.getSExtValue();
9399 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
9400 return SDValue();
9401
9402 SelectionDAG &DAG = DCI.DAG;
9403 SDLoc SL(N);
9404 EVT VT = N->getValueType(0);
9405
9406 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
9407 SDValue COffset = DAG.getConstant(Offset, SL, VT);
9408
9409 SDNodeFlags Flags;
9410 Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap() &&
9411 (N0.getOpcode() == ISD::OR ||
9412 N0->getFlags().hasNoUnsignedWrap()));
9413
9414 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
9415 }
9416
9417 /// MemSDNode::getBasePtr() does not work for intrinsics, which need to be offset
9418 /// by the chain and the intrinsic ID. Theoretically we would also need to check
9419 /// the specific intrinsic, but they all place the pointer operand first.
9420 static unsigned getBasePtrIndex(const MemSDNode *N) {
9421 switch (N->getOpcode()) {
9422 case ISD::STORE:
9423 case ISD::INTRINSIC_W_CHAIN:
9424 case ISD::INTRINSIC_VOID:
9425 return 2;
9426 default:
9427 return 1;
9428 }
9429 }
9430
9431 SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
9432 DAGCombinerInfo &DCI) const {
9433 SelectionDAG &DAG = DCI.DAG;
9434 SDLoc SL(N);
9435
9436 unsigned PtrIdx = getBasePtrIndex(N);
9437 SDValue Ptr = N->getOperand(PtrIdx);
9438
9439 // TODO: We could also do this for multiplies.
9440 if (Ptr.getOpcode() == ISD::SHL) {
9441 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
9442 N->getMemoryVT(), DCI);
9443 if (NewPtr) {
9444 SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());
9445
9446 NewOps[PtrIdx] = NewPtr;
9447 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
9448 }
9449 }
9450
9451 return SDValue();
9452 }
9453
9454 static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
9455 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
9456 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
9457 (Opc == ISD::XOR && Val == 0);
9458 }
9459
9460 // Break up a 64-bit bit operation with a constant into two 32-bit and/or/xor
9461 // operations. This will typically happen anyway for a VALU 64-bit and, and it
9462 // exposes other 32-bit integer combine opportunities since most 64-bit
9463 // operations are decomposed this way. TODO: We won't want this for SALU,
9464 // especially if the constant is an inline immediate.
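// For example (sketch): (and i64:x, 0x00000000ffffffff) splits into
//   lo = (and lo_32(x), 0xffffffff)   ; folds to lo_32(x)
//   hi = (and hi_32(x), 0x00000000)   ; folds to 0
// which then simplifies to rebuilding the 64-bit value from (lo_32(x), 0).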
9465 SDValue SITargetLowering::splitBinaryBitConstantOp(
9466 DAGCombinerInfo &DCI,
9467 const SDLoc &SL,
9468 unsigned Opc, SDValue LHS,
9469 const ConstantSDNode *CRHS) const {
9470 uint64_t Val = CRHS->getZExtValue();
9471 uint32_t ValLo = Lo_32(Val);
9472 uint32_t ValHi = Hi_32(Val);
9473 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
9474
9475 if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
9476 bitOpWithConstantIsReducible(Opc, ValHi)) ||
9477 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
9478 // If we need to materialize a 64-bit immediate, it will be split up later
9479 // anyway. Avoid creating the harder to understand 64-bit immediate
9480 // materialization.
9481 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
9482 }
9483
9484 return SDValue();
9485 }
9486
9487 // Returns true if the argument is a boolean value which is not serialized into
9488 // memory or an argument, and so does not require v_cndmask_b32 to be deserialized.
9489 static bool isBoolSGPR(SDValue V) {
9490 if (V.getValueType() != MVT::i1)
9491 return false;
9492 switch (V.getOpcode()) {
9493 default:
9494 break;
9495 case ISD::SETCC:
9496 case AMDGPUISD::FP_CLASS:
9497 return true;
9498 case ISD::AND:
9499 case ISD::OR:
9500 case ISD::XOR:
9501 return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
9502 }
9503 return false;
9504 }
9505
9506 // If a constant has all zeroes or all ones within each byte return it.
9507 // Otherwise return 0.
9508 static uint32_t getConstantPermuteMask(uint32_t C) {
9509 // 0xff for any zero byte in the mask
9510 uint32_t ZeroByteMask = 0;
9511 if (!(C & 0x000000ff)) ZeroByteMask |= 0x000000ff;
9512 if (!(C & 0x0000ff00)) ZeroByteMask |= 0x0000ff00;
9513 if (!(C & 0x00ff0000)) ZeroByteMask |= 0x00ff0000;
9514 if (!(C & 0xff000000)) ZeroByteMask |= 0xff000000;
9515 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
9516 if ((NonZeroByteMask & C) != NonZeroByteMask)
9517 return 0; // Partial bytes selected.
9518 return C;
9519 }
9520
9521 // Check if a node selects whole bytes from its operand 0 starting at a byte
9522 // boundary while masking the rest. Returns the select mask as used by
9523 // v_perm_b32, or ~0 if it does not match.
9524 // Note byte select encoding:
9525 // value 0-3 selects corresponding source byte;
9526 // value 0xc selects zero;
9527 // value 0xff selects 0xff.
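// A worked example (for illustration): for (and x, 0x00ff00ff) the returned
// mask is 0x0c020c00, so byte 0 selects source byte 0, byte 2 selects source
// byte 2, and bytes 1 and 3 are zeroed (0x0c). For (shl x, 8) the mask is
// 0x0201000c: the low result byte is zero and the rest shift up by one byte.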
9528 static uint32_t getPermuteMask(SelectionDAG &DAG, SDValue V) {
9529 assert(V.getValueSizeInBits() == 32);
9530
9531 if (V.getNumOperands() != 2)
9532 return ~0;
9533
9534 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
9535 if (!N1)
9536 return ~0;
9537
9538 uint32_t C = N1->getZExtValue();
9539
9540 switch (V.getOpcode()) {
9541 default:
9542 break;
9543 case ISD::AND:
9544 if (uint32_t ConstMask = getConstantPermuteMask(C)) {
9545 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
9546 }
9547 break;
9548
9549 case ISD::OR:
9550 if (uint32_t ConstMask = getConstantPermuteMask(C)) {
9551 return (0x03020100 & ~ConstMask) | ConstMask;
9552 }
9553 break;
9554
9555 case ISD::SHL:
9556 if (C % 8)
9557 return ~0;
9558
9559 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
9560
9561 case ISD::SRL:
9562 if (C % 8)
9563 return ~0;
9564
9565 return uint32_t(0x0c0c0c0c03020100ull >> C);
9566 }
9567
9568 return ~0;
9569 }
9570
9571 SDValue SITargetLowering::performAndCombine(SDNode *N,
9572 DAGCombinerInfo &DCI) const {
9573 if (DCI.isBeforeLegalize())
9574 return SDValue();
9575
9576 SelectionDAG &DAG = DCI.DAG;
9577 EVT VT = N->getValueType(0);
9578 SDValue LHS = N->getOperand(0);
9579 SDValue RHS = N->getOperand(1);
9580
9581
9582 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
9583 if (VT == MVT::i64 && CRHS) {
9584 if (SDValue Split
9585 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
9586 return Split;
9587 }
9588
9589 if (CRHS && VT == MVT::i32) {
9590 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
9591 // nb = number of trailing zeroes in mask
9592 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
9593 // given that we are selecting 8 or 16 bit fields starting at a byte boundary.
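// Worked example (a sketch): (and (srl x, 8), 0xff00) has Mask = 0xff00,
// Bits = 8, Shift = 8, NB = 8, so Offset = 16 and we emit
//   (shl (AssertZext i8 (bfe_u32 x, 16, 8)), 8)
// which places bits [23:16] of x at bits [15:8] of the result.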
9594 uint64_t Mask = CRHS->getZExtValue();
9595 unsigned Bits = countPopulation(Mask);
9596 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
9597 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
9598 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
9599 unsigned Shift = CShift->getZExtValue();
9600 unsigned NB = CRHS->getAPIntValue().countTrailingZeros();
9601 unsigned Offset = NB + Shift;
9602 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
9603 SDLoc SL(N);
9604 SDValue BFE = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
9605 LHS->getOperand(0),
9606 DAG.getConstant(Offset, SL, MVT::i32),
9607 DAG.getConstant(Bits, SL, MVT::i32));
9608 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
9609 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
9610 DAG.getValueType(NarrowVT));
9611 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
9612 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
9613 return Shl;
9614 }
9615 }
9616 }
9617
9618 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
9619 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
9620 isa<ConstantSDNode>(LHS.getOperand(2))) {
9621 uint32_t Sel = getConstantPermuteMask(Mask);
9622 if (!Sel)
9623 return SDValue();
9624
9625 // Select 0xc for all zero bytes
9626 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
9627 SDLoc DL(N);
9628 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
9629 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
9630 }
9631 }
9632
9633 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
9634 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
9635 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
9636 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
9637 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
9638
9639 SDValue X = LHS.getOperand(0);
9640 SDValue Y = RHS.getOperand(0);
9641 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X)
9642 return SDValue();
9643
9644 if (LCC == ISD::SETO) {
9645 if (X != LHS.getOperand(1))
9646 return SDValue();
9647
9648 if (RCC == ISD::SETUNE) {
9649 const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
9650 if (!C1 || !C1->isInfinity() || C1->isNegative())
9651 return SDValue();
9652
9653 const uint32_t Mask = SIInstrFlags::N_NORMAL |
9654 SIInstrFlags::N_SUBNORMAL |
9655 SIInstrFlags::N_ZERO |
9656 SIInstrFlags::P_ZERO |
9657 SIInstrFlags::P_SUBNORMAL |
9658 SIInstrFlags::P_NORMAL;
9659
9660 static_assert(((~(SIInstrFlags::S_NAN |
9661 SIInstrFlags::Q_NAN |
9662 SIInstrFlags::N_INFINITY |
9663 SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask,
9664 "mask not equal");
9665
9666 SDLoc DL(N);
9667 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
9668 X, DAG.getConstant(Mask, DL, MVT::i32));
9669 }
9670 }
9671 }
9672
9673 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
9674 std::swap(LHS, RHS);
9675
9676 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
9677 RHS.hasOneUse()) {
9678 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
9679 // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan | n_nan)
9680 // and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan | n_nan)
9681 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
9682 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
9683 (RHS.getOperand(0) == LHS.getOperand(0) &&
9684 LHS.getOperand(0) == LHS.getOperand(1))) {
9685 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
9686 unsigned NewMask = LCC == ISD::SETO ?
9687 Mask->getZExtValue() & ~OrdMask :
9688 Mask->getZExtValue() & OrdMask;
9689
9690 SDLoc DL(N);
9691 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
9692 DAG.getConstant(NewMask, DL, MVT::i32));
9693 }
9694 }
9695
9696 if (VT == MVT::i32 &&
9697 (RHS.getOpcode() == ISD::SIGN_EXTEND || LHS.getOpcode() == ISD::SIGN_EXTEND)) {
9698 // and x, (sext cc from i1) => select cc, x, 0
9699 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
9700 std::swap(LHS, RHS);
9701 if (isBoolSGPR(RHS.getOperand(0)))
9702 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0),
9703 LHS, DAG.getConstant(0, SDLoc(N), MVT::i32));
9704 }
9705
9706 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
9707 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
9708 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
9709 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
9710 uint32_t LHSMask = getPermuteMask(DAG, LHS);
9711 uint32_t RHSMask = getPermuteMask(DAG, RHS);
9712 if (LHSMask != ~0u && RHSMask != ~0u) {
9713 // Canonicalize the expression in an attempt to have fewer unique masks
9714 // and therefore fewer registers used to hold the masks.
9715 if (LHSMask > RHSMask) {
9716 std::swap(LHSMask, RHSMask);
9717 std::swap(LHS, RHS);
9718 }
9719
9720 // Select 0xc for each lane used from the source operand. In the mask, zero
9721 // bytes have 0xc, 0xff bytes have 0xff, and actual source lanes are 0-3.
9722 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
9723 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
9724
9725 // Check if we need to combine values from two sources within a byte.
9726 if (!(LHSUsedLanes & RHSUsedLanes) &&
9727 // If we select the high and low words, keep it for SDWA.
9728 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
9729 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
9730 // Each byte in each mask is either a source lane selector (0-3) or has
9731 // higher bits set: 0xff for 0xff, 0x0c for zero. If 0x0c appears in either
9732 // mask for a byte it must stay 0x0c; otherwise the mask that is not 0xff
9733 // wins. ANDing both masks gives the correct result except that bytes which
9734 // should be zero must be corrected to exactly 0x0c.
9735 uint32_t Mask = LHSMask & RHSMask;
9736 for (unsigned I = 0; I < 32; I += 8) {
9737 uint32_t ByteSel = 0xff << I;
9738 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
9739 Mask &= (0x0c << I) & 0xffffffff;
9740 }
9741
9742 // Add 4 to each active LHS lane. It will not affect any existing 0xff
9743 // or 0x0c.
9744 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
9745 SDLoc DL(N);
9746
9747 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
9748 LHS.getOperand(0), RHS.getOperand(0),
9749 DAG.getConstant(Sel, DL, MVT::i32));
9750 }
9751 }
9752 }
9753
9754 return SDValue();
9755 }
9756
9757 SDValue SITargetLowering::performOrCombine(SDNode *N,
9758 DAGCombinerInfo &DCI) const {
9759 SelectionDAG &DAG = DCI.DAG;
9760 SDValue LHS = N->getOperand(0);
9761 SDValue RHS = N->getOperand(1);
9762
9763 EVT VT = N->getValueType(0);
9764 if (VT == MVT::i1) {
9765 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
9766 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
9767 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
9768 SDValue Src = LHS.getOperand(0);
9769 if (Src != RHS.getOperand(0))
9770 return SDValue();
9771
9772 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
9773 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
9774 if (!CLHS || !CRHS)
9775 return SDValue();
9776
9777 // Only 10 bits are used.
9778 static const uint32_t MaxMask = 0x3ff;
9779
9780 uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
9781 SDLoc DL(N);
9782 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
9783 Src, DAG.getConstant(NewMask, DL, MVT::i32));
9784 }
9785
9786 return SDValue();
9787 }
9788
9789 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
9790 if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
9791 LHS.getOpcode() == AMDGPUISD::PERM &&
9792 isa<ConstantSDNode>(LHS.getOperand(2))) {
9793 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
9794 if (!Sel)
9795 return SDValue();
9796
9797 Sel |= LHS.getConstantOperandVal(2);
9798 SDLoc DL(N);
9799 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
9800 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
9801 }
9802
9803 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
9804 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
9805 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
9806 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
9807 uint32_t LHSMask = getPermuteMask(DAG, LHS);
9808 uint32_t RHSMask = getPermuteMask(DAG, RHS);
9809 if (LHSMask != ~0u && RHSMask != ~0u) {
9810 // Canonicalize the expression in an attempt to have fewer unique masks
9811 // and therefore fewer registers used to hold the masks.
9812 if (LHSMask > RHSMask) {
9813 std::swap(LHSMask, RHSMask);
9814 std::swap(LHS, RHS);
9815 }
9816
9817 // Select 0xc for each lane used from the source operand. In the mask, zero
9818 // bytes have 0xc, 0xff bytes have 0xff, and actual source lanes are 0-3.
9819 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
9820 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
9821
9822 // Check if we need to combine values from two sources within a byte.
9823 if (!(LHSUsedLanes & RHSUsedLanes) &&
9824 // If we select the high and low words, keep it for SDWA.
9825 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
9826 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
9827 // Kill zero bytes selected by other mask. Zero value is 0xc.
9828 LHSMask &= ~RHSUsedLanes;
9829 RHSMask &= ~LHSUsedLanes;
9830 // Add 4 to each active LHS lane
9831 LHSMask |= LHSUsedLanes & 0x04040404;
9832 // Combine masks
9833 uint32_t Sel = LHSMask | RHSMask;
9834 SDLoc DL(N);
9835
9836 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
9837 LHS.getOperand(0), RHS.getOperand(0),
9838 DAG.getConstant(Sel, DL, MVT::i32));
9839 }
9840 }
9841 }
9842
9843 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
9844 return SDValue();
9845
9846 // TODO: This could be a generic combine with a predicate for extracting the
9847 // high half of an integer being free.
9848
9849 // (or i64:x, (zero_extend i32:y)) ->
9850 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
9851 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
9852 RHS.getOpcode() != ISD::ZERO_EXTEND)
9853 std::swap(LHS, RHS);
9854
9855 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
9856 SDValue ExtSrc = RHS.getOperand(0);
9857 EVT SrcVT = ExtSrc.getValueType();
9858 if (SrcVT == MVT::i32) {
9859 SDLoc SL(N);
9860 SDValue LowLHS, HiBits;
9861 std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG);
9862 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
9863
9864 DCI.AddToWorklist(LowOr.getNode());
9865 DCI.AddToWorklist(HiBits.getNode());
9866
9867 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
9868 LowOr, HiBits);
9869 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
9870 }
9871 }
9872
9873 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
9874 if (CRHS) {
9875 if (SDValue Split
9876 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
9877 N->getOperand(0), CRHS))
9878 return Split;
9879 }
9880
9881 return SDValue();
9882 }
9883
9884 SDValue SITargetLowering::performXorCombine(SDNode *N,
9885 DAGCombinerInfo &DCI) const {
9886 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
9887 return RV;
9888
9889 EVT VT = N->getValueType(0);
9890 if (VT != MVT::i64)
9891 return SDValue();
9892
9893 SDValue LHS = N->getOperand(0);
9894 SDValue RHS = N->getOperand(1);
9895
9896 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
9897 if (CRHS) {
9898 if (SDValue Split
9899 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
9900 return Split;
9901 }
9902
9903 return SDValue();
9904 }
9905
9906 SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
9907 DAGCombinerInfo &DCI) const {
9908 if (!Subtarget->has16BitInsts() ||
9909 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
9910 return SDValue();
9911
9912 EVT VT = N->getValueType(0);
9913 if (VT != MVT::i32)
9914 return SDValue();
9915
9916 SDValue Src = N->getOperand(0);
9917 if (Src.getValueType() != MVT::i16)
9918 return SDValue();
9919
9920 return SDValue();
9921 }
9922
9923 SDValue SITargetLowering::performSignExtendInRegCombine(SDNode *N,
9924 DAGCombinerInfo &DCI)
9925 const {
9926 SDValue Src = N->getOperand(0);
9927 auto *VTSign = cast<VTSDNode>(N->getOperand(1));
9928
9929 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
9930 VTSign->getVT() == MVT::i8) ||
9931 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
9932 VTSign->getVT() == MVT::i16)) &&
9933 Src.hasOneUse()) {
9934 auto *M = cast<MemSDNode>(Src);
9935 SDValue Ops[] = {
9936 Src.getOperand(0), // Chain
9937 Src.getOperand(1), // rsrc
9938 Src.getOperand(2), // vindex
9939 Src.getOperand(3), // voffset
9940 Src.getOperand(4), // soffset
9941 Src.getOperand(5), // offset
9942 Src.getOperand(6),
9943 Src.getOperand(7)
9944 };
9945 // replace with BUFFER_LOAD_BYTE/SHORT
9946 SDVTList ResList = DCI.DAG.getVTList(MVT::i32,
9947 Src.getOperand(0).getValueType());
9948 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE) ?
9949 AMDGPUISD::BUFFER_LOAD_BYTE : AMDGPUISD::BUFFER_LOAD_SHORT;
9950 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(Opc, SDLoc(N),
9951 ResList,
9952 Ops, M->getMemoryVT(),
9953 M->getMemOperand());
9954 return DCI.DAG.getMergeValues({BufferLoadSignExt,
9955 BufferLoadSignExt.getValue(1)}, SDLoc(N));
9956 }
9957 return SDValue();
9958 }
9959
9960 SDValue SITargetLowering::performClassCombine(SDNode *N,
9961 DAGCombinerInfo &DCI) const {
9962 SelectionDAG &DAG = DCI.DAG;
9963 SDValue Mask = N->getOperand(1);
9964
9965 // fp_class x, 0 -> false
9966 if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) {
9967 if (CMask->isZero())
9968 return DAG.getConstant(0, SDLoc(N), MVT::i1);
9969 }
9970
9971 if (N->getOperand(0).isUndef())
9972 return DAG.getUNDEF(MVT::i1);
9973
9974 return SDValue();
9975 }
9976
9977 SDValue SITargetLowering::performRcpCombine(SDNode *N,
9978 DAGCombinerInfo &DCI) const {
9979 EVT VT = N->getValueType(0);
9980 SDValue N0 = N->getOperand(0);
9981
9982 if (N0.isUndef())
9983 return N0;
9984
9985 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
9986 N0.getOpcode() == ISD::SINT_TO_FP)) {
9987 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
9988 N->getFlags());
9989 }
9990
9991 if ((VT == MVT::f32 || VT == MVT::f16) && N0.getOpcode() == ISD::FSQRT) {
9992 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT,
9993 N0.getOperand(0), N->getFlags());
9994 }
9995
9996 return AMDGPUTargetLowering::performRcpCombine(N, DCI);
9997 }
9998
9999 bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
10000 unsigned MaxDepth) const {
10001 unsigned Opcode = Op.getOpcode();
10002 if (Opcode == ISD::FCANONICALIZE)
10003 return true;
10004
10005 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
10006 auto F = CFP->getValueAPF();
10007 if (F.isNaN() && F.isSignaling())
10008 return false;
10009 return !F.isDenormal() || denormalsEnabledForType(DAG, Op.getValueType());
10010 }
10011
10012 // If source is a result of another standard FP operation it is already in
10013 // canonical form.
10014 if (MaxDepth == 0)
10015 return false;
10016
10017 switch (Opcode) {
10018 // These will flush denorms if required.
10019 case ISD::FADD:
10020 case ISD::FSUB:
10021 case ISD::FMUL:
10022 case ISD::FCEIL:
10023 case ISD::FFLOOR:
10024 case ISD::FMA:
10025 case ISD::FMAD:
10026 case ISD::FSQRT:
10027 case ISD::FDIV:
10028 case ISD::FREM:
10029 case ISD::FP_ROUND:
10030 case ISD::FP_EXTEND:
10031 case AMDGPUISD::FMUL_LEGACY:
10032 case AMDGPUISD::FMAD_FTZ:
10033 case AMDGPUISD::RCP:
10034 case AMDGPUISD::RSQ:
10035 case AMDGPUISD::RSQ_CLAMP:
10036 case AMDGPUISD::RCP_LEGACY:
10037 case AMDGPUISD::RCP_IFLAG:
10038 case AMDGPUISD::DIV_SCALE:
10039 case AMDGPUISD::DIV_FMAS:
10040 case AMDGPUISD::DIV_FIXUP:
10041 case AMDGPUISD::FRACT:
10042 case AMDGPUISD::LDEXP:
10043 case AMDGPUISD::CVT_PKRTZ_F16_F32:
10044 case AMDGPUISD::CVT_F32_UBYTE0:
10045 case AMDGPUISD::CVT_F32_UBYTE1:
10046 case AMDGPUISD::CVT_F32_UBYTE2:
10047 case AMDGPUISD::CVT_F32_UBYTE3:
10048 return true;
10049
10050 // These can/will be lowered or combined as bit operations, so we need to
10051 // check their inputs recursively.
10052 case ISD::FNEG:
10053 case ISD::FABS:
10054 case ISD::FCOPYSIGN:
10055 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
10056
10057 case ISD::FSIN:
10058 case ISD::FCOS:
10059 case ISD::FSINCOS:
10060 return Op.getValueType().getScalarType() != MVT::f16;
10061
10062 case ISD::FMINNUM:
10063 case ISD::FMAXNUM:
10064 case ISD::FMINNUM_IEEE:
10065 case ISD::FMAXNUM_IEEE:
10066 case AMDGPUISD::CLAMP:
10067 case AMDGPUISD::FMED3:
10068 case AMDGPUISD::FMAX3:
10069 case AMDGPUISD::FMIN3: {
10070 // FIXME: Shouldn't treat the generic operations differently based on these.
10071 // However, we aren't really required to flush the result from
10072 // minnum/maxnum.
10073
10074 // snans will be quieted, so we only need to worry about denormals.
10075 if (Subtarget->supportsMinMaxDenormModes() ||
10076 denormalsEnabledForType(DAG, Op.getValueType()))
10077 return true;
10078
10079 // Flushing may be required.
10080 // On pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
10081 // targets we need to check their inputs recursively.
10082
10083 // FIXME: Does this apply with clamp? It's implemented with max.
10084 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
10085 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
10086 return false;
10087 }
10088
10089 return true;
10090 }
10091 case ISD::SELECT: {
10092 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
10093 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
10094 }
10095 case ISD::BUILD_VECTOR: {
10096 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
10097 SDValue SrcOp = Op.getOperand(i);
10098 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
10099 return false;
10100 }
10101
10102 return true;
10103 }
10104 case ISD::EXTRACT_VECTOR_ELT:
10105 case ISD::EXTRACT_SUBVECTOR: {
10106 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
10107 }
10108 case ISD::INSERT_VECTOR_ELT: {
10109 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
10110 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
10111 }
10112 case ISD::UNDEF:
10113 // Could be anything.
10114 return false;
10115
10116 case ISD::BITCAST:
10117 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
10118 case ISD::TRUNCATE: {
10119 // Hack around the mess we make when legalizing extract_vector_elt.
10120 if (Op.getValueType() == MVT::i16) {
10121 SDValue TruncSrc = Op.getOperand(0);
10122 if (TruncSrc.getValueType() == MVT::i32 &&
10123 TruncSrc.getOpcode() == ISD::BITCAST &&
10124 TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
10125 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
10126 }
10127 }
10128 return false;
10129 }
10130 case ISD::INTRINSIC_WO_CHAIN: {
10131 unsigned IntrinsicID
10132 = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
10133 // TODO: Handle more intrinsics
10134 switch (IntrinsicID) {
10135 case Intrinsic::amdgcn_cvt_pkrtz:
10136 case Intrinsic::amdgcn_cubeid:
10137 case Intrinsic::amdgcn_frexp_mant:
10138 case Intrinsic::amdgcn_fdot2:
10139 case Intrinsic::amdgcn_rcp:
10140 case Intrinsic::amdgcn_rsq:
10141 case Intrinsic::amdgcn_rsq_clamp:
10142 case Intrinsic::amdgcn_rcp_legacy:
10143 case Intrinsic::amdgcn_rsq_legacy:
10144 case Intrinsic::amdgcn_trig_preop:
10145 return true;
10146 default:
10147 break;
10148 }
10149
10150 LLVM_FALLTHROUGH;
10151 }
10152 default:
10153 return denormalsEnabledForType(DAG, Op.getValueType()) &&
10154 DAG.isKnownNeverSNaN(Op);
10155 }
10156
10157 llvm_unreachable("invalid operation");
10158 }
10159
10160 bool SITargetLowering::isCanonicalized(Register Reg, MachineFunction &MF,
10161 unsigned MaxDepth) const {
10162 MachineRegisterInfo &MRI = MF.getRegInfo();
10163 MachineInstr *MI = MRI.getVRegDef(Reg);
10164 unsigned Opcode = MI->getOpcode();
10165
10166 if (Opcode == AMDGPU::G_FCANONICALIZE)
10167 return true;
10168
10169 Optional<FPValueAndVReg> FCR;
10170 // Constant splat (can be padded with undef) or scalar constant.
10171 if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) {
10172 if (FCR->Value.isSignaling())
10173 return false;
10174 return !FCR->Value.isDenormal() ||
10175 denormalsEnabledForType(MRI.getType(FCR->VReg), MF);
10176 }
10177
10178 if (MaxDepth == 0)
10179 return false;
10180
10181 switch (Opcode) {
10182 case AMDGPU::G_FMINNUM_IEEE:
10183 case AMDGPU::G_FMAXNUM_IEEE: {
10184 if (Subtarget->supportsMinMaxDenormModes() ||
10185 denormalsEnabledForType(MRI.getType(Reg), MF))
10186 return true;
10187 for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
10188 if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
10189 return false;
10190 return true;
10191 }
10192 default:
10193 return denormalsEnabledForType(MRI.getType(Reg), MF) &&
10194 isKnownNeverSNaN(Reg, MRI);
10195 }
10196
10197 llvm_unreachable("invalid operation");
10198 }
10199
10200 // Constant fold canonicalize.
10201 SDValue SITargetLowering::getCanonicalConstantFP(
10202 SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const {
10203 // Flush denormals to 0 if not enabled.
10204 if (C.isDenormal() && !denormalsEnabledForType(DAG, VT))
10205 return DAG.getConstantFP(0.0, SL, VT);
10206
10207 if (C.isNaN()) {
10208 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
10209 if (C.isSignaling()) {
10210 // Quiet a signaling NaN.
10211 // FIXME: Is this supposed to preserve payload bits?
10212 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
10213 }
10214
10215 // Make sure it is the canonical NaN bitpattern.
10216 //
10217 // TODO: Can we use -1 as the canonical NaN value since it's an inline
10218 // immediate?
10219 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
10220 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
10221 }
10222
10223 // Already canonical.
10224 return DAG.getConstantFP(C, SL, VT);
10225 }
10226
10227 static bool vectorEltWillFoldAway(SDValue Op) {
10228 return Op.isUndef() || isa<ConstantFPSDNode>(Op);
10229 }
10230
10231 SDValue SITargetLowering::performFCanonicalizeCombine(
10232 SDNode *N,
10233 DAGCombinerInfo &DCI) const {
10234 SelectionDAG &DAG = DCI.DAG;
10235 SDValue N0 = N->getOperand(0);
10236 EVT VT = N->getValueType(0);
10237
10238 // fcanonicalize undef -> qnan
10239 if (N0.isUndef()) {
10240 APFloat QNaN = APFloat::getQNaN(SelectionDAG::EVTToAPFloatSemantics(VT));
10241 return DAG.getConstantFP(QNaN, SDLoc(N), VT);
10242 }
10243
10244 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
10245 EVT VT = N->getValueType(0);
10246 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
10247 }
10248
10249 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
10250 // (fcanonicalize k)
10251 //
10252 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
10253
10254 // TODO: This could be better with wider vectors that will be split to v2f16,
10255 // and to consider uses since there aren't that many packed operations.
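// For example, (fcanonicalize (build_vector x:f16, 2.0)) becomes
// (build_vector (fcanonicalize x), 2.0), since canonicalizing an
// already-canonical constant folds away.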
10256 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
10257 isTypeLegal(MVT::v2f16)) {
10258 SDLoc SL(N);
10259 SDValue NewElts[2];
10260 SDValue Lo = N0.getOperand(0);
10261 SDValue Hi = N0.getOperand(1);
10262 EVT EltVT = Lo.getValueType();
10263
10264 if (vectorEltWillFoldAway(Lo) || vectorEltWillFoldAway(Hi)) {
10265 for (unsigned I = 0; I != 2; ++I) {
10266 SDValue Op = N0.getOperand(I);
10267 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
10268 NewElts[I] = getCanonicalConstantFP(DAG, SL, EltVT,
10269 CFP->getValueAPF());
10270 } else if (Op.isUndef()) {
10271 // Handled below based on what the other operand is.
10272 NewElts[I] = Op;
10273 } else {
10274 NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
10275 }
10276 }
10277
10278 // If one half is undef, and one is constant, prefer a splat vector rather
10279 // than the normal qNaN. If it's a register, prefer 0.0 since that's
10280 // cheaper to use and may be free with a packed operation.
10281 if (NewElts[0].isUndef()) {
10282 // If the other lane is a constant, splat it into the undef lane.
10283 if (isa<ConstantFPSDNode>(NewElts[1]))
10284 NewElts[0] = NewElts[1];
10285 }
10286
10287 if (NewElts[1].isUndef()) {
10288 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0]) ?
10289 NewElts[0] : DAG.getConstantFP(0.0f, SL, EltVT);
10290 }
10291
10292 return DAG.getBuildVector(VT, SL, NewElts);
10293 }
10294 }
10295
10296 unsigned SrcOpc = N0.getOpcode();
10297
10298 // If it's free to do so, push canonicalizes further up the source, which may
10299 // find a canonical source.
10300 //
10301 // TODO: More opcodes. Note this is unsafe for the _ieee minnum/maxnum for
10302 // sNaNs.
10303 if (SrcOpc == ISD::FMINNUM || SrcOpc == ISD::FMAXNUM) {
10304 auto *CRHS = dyn_cast<ConstantFPSDNode>(N0.getOperand(1));
10305 if (CRHS && N0.hasOneUse()) {
10306 SDLoc SL(N);
10307 SDValue Canon0 = DAG.getNode(ISD::FCANONICALIZE, SL, VT,
10308 N0.getOperand(0));
10309 SDValue Canon1 = getCanonicalConstantFP(DAG, SL, VT, CRHS->getValueAPF());
10310 DCI.AddToWorklist(Canon0.getNode());
10311
10312 return DAG.getNode(N0.getOpcode(), SL, VT, Canon0, Canon1);
10313 }
10314 }
10315
10316 return isCanonicalized(DAG, N0) ? N0 : SDValue();
10317 }
10318
10319 static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
10320 switch (Opc) {
10321 case ISD::FMAXNUM:
10322 case ISD::FMAXNUM_IEEE:
10323 return AMDGPUISD::FMAX3;
10324 case ISD::SMAX:
10325 return AMDGPUISD::SMAX3;
10326 case ISD::UMAX:
10327 return AMDGPUISD::UMAX3;
10328 case ISD::FMINNUM:
10329 case ISD::FMINNUM_IEEE:
10330 return AMDGPUISD::FMIN3;
10331 case ISD::SMIN:
10332 return AMDGPUISD::SMIN3;
10333 case ISD::UMIN:
10334 return AMDGPUISD::UMIN3;
10335 default:
10336 llvm_unreachable("Not a min/max opcode");
10337 }
10338 }
10339
10340 SDValue SITargetLowering::performIntMed3ImmCombine(
10341 SelectionDAG &DAG, const SDLoc &SL,
10342 SDValue Op0, SDValue Op1, bool Signed) const {
10343 ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1);
10344 if (!K1)
10345 return SDValue();
10346
10347 ConstantSDNode *K0 = dyn_cast<ConstantSDNode>(Op0.getOperand(1));
10348 if (!K0)
10349 return SDValue();
10350
10351 if (Signed) {
10352 if (K0->getAPIntValue().sge(K1->getAPIntValue()))
10353 return SDValue();
10354 } else {
10355 if (K0->getAPIntValue().uge(K1->getAPIntValue()))
10356 return SDValue();
10357 }
10358
10359 EVT VT = K0->getValueType(0);
10360 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
10361 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16())) {
10362 return DAG.getNode(Med3Opc, SL, VT,
10363 Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
10364 }
10365
10366 // If there isn't a 16-bit med3 operation, convert to 32-bit.
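// For example, an i16 smin(smax(x, 4), 10) on a target without a 16-bit med3
// becomes trunc(smed3(sext(x), 4, 10)) computed at 32 bits.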
10367 if (VT == MVT::i16) {
10368 MVT NVT = MVT::i32;
10369 unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
10370
10371 SDValue Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0));
10372 SDValue Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1));
10373 SDValue Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1);
10374
10375 SDValue Med3 = DAG.getNode(Med3Opc, SL, NVT, Tmp1, Tmp2, Tmp3);
10376 return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3);
10377 }
10378
10379 return SDValue();
10380 }
10381
10382 static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
10383 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
10384 return C;
10385
10386 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
10387 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
10388 return C;
10389 }
10390
10391 return nullptr;
10392 }
10393
10394 SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
10395 const SDLoc &SL,
10396 SDValue Op0,
10397 SDValue Op1) const {
10398 ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
10399 if (!K1)
10400 return SDValue();
10401
10402 ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
10403 if (!K0)
10404 return SDValue();
10405
10406 // Ordered >= (although NaN inputs should have folded away by now).
10407 if (K0->getValueAPF() > K1->getValueAPF())
10408 return SDValue();
10409
10410 const MachineFunction &MF = DAG.getMachineFunction();
10411 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
10412
10413 // TODO: Check IEEE bit enabled?
10414 EVT VT = Op0.getValueType();
10415 if (Info->getMode().DX10Clamp) {
10416 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
10417 // hardware fmed3 behavior converting to a min.
10418 // FIXME: Should this be allowing -0.0?
10419 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
10420 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
10421 }
10422
10423 // med3 for f16 is only available on gfx9+, and not available for v2f16.
10424 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
10425 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
10426 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
10427 // then give the other result, which is different from med3 with a NaN
10428 // input.
10429 SDValue Var = Op0.getOperand(0);
10430 if (!DAG.isKnownNeverSNaN(Var))
10431 return SDValue();
10432
10433 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
10434
10435 if ((!K0->hasOneUse() ||
10436 TII->isInlineConstant(K0->getValueAPF().bitcastToAPInt())) &&
10437 (!K1->hasOneUse() ||
10438 TII->isInlineConstant(K1->getValueAPF().bitcastToAPInt()))) {
10439 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
10440 Var, SDValue(K0, 0), SDValue(K1, 0));
10441 }
10442 }
10443
10444 return SDValue();
10445 }
10446
10447 SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
10448 DAGCombinerInfo &DCI) const {
10449 SelectionDAG &DAG = DCI.DAG;
10450
10451 EVT VT = N->getValueType(0);
10452 unsigned Opc = N->getOpcode();
10453 SDValue Op0 = N->getOperand(0);
10454 SDValue Op1 = N->getOperand(1);
10455
10456 // Only do this if the inner op has one use since this will just increase
10457 // register pressure for no benefit.
10458
10459 if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY &&
10460 !VT.isVector() &&
10461 (VT == MVT::i32 || VT == MVT::f32 ||
10462 ((VT == MVT::f16 || VT == MVT::i16) && Subtarget->hasMin3Max3_16()))) {
10463 // max(max(a, b), c) -> max3(a, b, c)
10464 // min(min(a, b), c) -> min3(a, b, c)
10465 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
10466 SDLoc DL(N);
10467 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
10468 DL,
10469 N->getValueType(0),
10470 Op0.getOperand(0),
10471 Op0.getOperand(1),
10472 Op1);
10473 }
10474
10475 // Try commuted.
10476 // max(a, max(b, c)) -> max3(a, b, c)
10477 // min(a, min(b, c)) -> min3(a, b, c)
10478 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
10479 SDLoc DL(N);
10480 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
10481 DL,
10482 N->getValueType(0),
10483 Op0,
10484 Op1.getOperand(0),
10485 Op1.getOperand(1));
10486 }
10487 }
10488
10489 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
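// e.g. smin(smax(x, -5), 17) -> smed3(x, -5, 17); the unsigned form below is
// handled the same way.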
10490 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
10491 if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, true))
10492 return Med3;
10493 }
10494
10495 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
10496 if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, false))
10497 return Med3;
10498 }
10499
10500 // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
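// The common case is the clamp pattern fminnum(fmaxnum(x, 0.0), 1.0), which
// performFPMed3ImmCombine turns into AMDGPUISD::CLAMP when dx10_clamp is set,
// and otherwise (subject to the type and sNaN checks there) into
// fmed3(x, 0.0, 1.0).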
10501 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
10502 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
10503 (Opc == AMDGPUISD::FMIN_LEGACY &&
10504 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
10505 (VT == MVT::f32 || VT == MVT::f64 ||
10506 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
10507 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
10508 Op0.hasOneUse()) {
10509 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
10510 return Res;
10511 }
10512
10513 return SDValue();
10514 }
10515
10516 static bool isClampZeroToOne(SDValue A, SDValue B) {
10517 if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
10518 if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
10519 // FIXME: Should this be allowing -0.0?
10520 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
10521 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
10522 }
10523 }
10524
10525 return false;
10526 }
10527
10528 // FIXME: Should only worry about snans for version with chain.
10529 SDValue SITargetLowering::performFMed3Combine(SDNode *N,
10530 DAGCombinerInfo &DCI) const {
10531 EVT VT = N->getValueType(0);
10532 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
10533 // NaNs. With a NaN input, the order of the operands may change the result.
10534
10535 SelectionDAG &DAG = DCI.DAG;
10536 SDLoc SL(N);
10537
10538 SDValue Src0 = N->getOperand(0);
10539 SDValue Src1 = N->getOperand(1);
10540 SDValue Src2 = N->getOperand(2);
10541
10542 if (isClampZeroToOne(Src0, Src1)) {
10543 // const_a, const_b, x -> clamp is safe in all cases including signaling
10544 // nans.
10545 // FIXME: Should this be allowing -0.0?
10546 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
10547 }
10548
10549 const MachineFunction &MF = DAG.getMachineFunction();
10550 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
10551
10552 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
10553 // handling no dx10-clamp?
10554 if (Info->getMode().DX10Clamp) {
10555 // If NaNs are clamped to 0, we are free to reorder the inputs.
10556
10557 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
10558 std::swap(Src0, Src1);
10559
10560 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
10561 std::swap(Src1, Src2);
10562
10563 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
10564 std::swap(Src0, Src1);
10565
10566 if (isClampZeroToOne(Src1, Src2))
10567 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
10568 }
10569
10570 return SDValue();
10571 }
10572
10573 SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
10574 DAGCombinerInfo &DCI) const {
10575 SDValue Src0 = N->getOperand(0);
10576 SDValue Src1 = N->getOperand(1);
10577 if (Src0.isUndef() && Src1.isUndef())
10578 return DCI.DAG.getUNDEF(N->getValueType(0));
10579 return SDValue();
10580 }
10581
10582 // Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
10583 // expanded into a set of cmp/select instructions.
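// For example, a v4i32 extract with a divergent index expands to 4 compares
// plus 4 v_cndmask_b32 selects, i.e. NumInsts = 8, well under the limits used
// below.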
10584 bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
10585 unsigned NumElem,
10586 bool IsDivergentIdx,
10587 const GCNSubtarget *Subtarget) {
10588 if (UseDivergentRegisterIndexing)
10589 return false;
10590
10591 unsigned VecSize = EltSize * NumElem;
10592
10593 // Sub-dword vectors with a total size of two dwords or less have a better implementation.
10594 if (VecSize <= 64 && EltSize < 32)
10595 return false;
10596
10597 // Always expand the remaining sub-dword cases, otherwise they will be
10598 // lowered via memory.
10599 if (EltSize < 32)
10600 return true;
10601
10602 // Always do this if var-idx is divergent, otherwise it will become a loop.
10603 if (IsDivergentIdx)
10604 return true;
10605
10606 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
10607 unsigned NumInsts = NumElem /* Number of compares */ +
10608 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
10609
10610 // On some architectures (GFX9) movrel is not available and it's better
10611 // to expand.
10612 if (!Subtarget->hasMovrel())
10613 return NumInsts <= 16;
10614
10615 // If movrel is available, use it instead of expanding for vector of 8
10616 // elements.
10617 return NumInsts <= 15;
10618 }
10619
10620 bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
10621 SDValue Idx = N->getOperand(N->getNumOperands() - 1);
10622 if (isa<ConstantSDNode>(Idx))
10623 return false;
10624
10625 SDValue Vec = N->getOperand(0);
10626 EVT VecVT = Vec.getValueType();
10627 EVT EltVT = VecVT.getVectorElementType();
10628 unsigned EltSize = EltVT.getSizeInBits();
10629 unsigned NumElem = VecVT.getVectorNumElements();
10630
10631 return SITargetLowering::shouldExpandVectorDynExt(
10632 EltSize, NumElem, Idx->isDivergent(), getSubtarget());
10633 }
10634
10635 SDValue SITargetLowering::performExtractVectorEltCombine(
10636 SDNode *N, DAGCombinerInfo &DCI) const {
10637 SDValue Vec = N->getOperand(0);
10638 SelectionDAG &DAG = DCI.DAG;
10639
10640 EVT VecVT = Vec.getValueType();
10641 EVT EltVT = VecVT.getVectorElementType();
10642
10643 if ((Vec.getOpcode() == ISD::FNEG ||
10644 Vec.getOpcode() == ISD::FABS) && allUsesHaveSourceMods(N)) {
10645 SDLoc SL(N);
10646 EVT EltVT = N->getValueType(0);
10647 SDValue Idx = N->getOperand(1);
10648 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
10649 Vec.getOperand(0), Idx);
10650 return DAG.getNode(Vec.getOpcode(), SL, EltVT, Elt);
10651 }
10652
10653 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
10654 // =>
10655 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
10656 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
10657 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
10658 if (Vec.hasOneUse() && DCI.isBeforeLegalize()) {
10659 SDLoc SL(N);
10660 EVT EltVT = N->getValueType(0);
10661 SDValue Idx = N->getOperand(1);
10662 unsigned Opc = Vec.getOpcode();
10663
10664 switch(Opc) {
10665 default:
10666 break;
10667 // TODO: Support other binary operations.
10668 case ISD::FADD:
10669 case ISD::FSUB:
10670 case ISD::FMUL:
10671 case ISD::ADD:
10672 case ISD::UMIN:
10673 case ISD::UMAX:
10674 case ISD::SMIN:
10675 case ISD::SMAX:
10676 case ISD::FMAXNUM:
10677 case ISD::FMINNUM:
10678 case ISD::FMAXNUM_IEEE:
10679 case ISD::FMINNUM_IEEE: {
10680 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
10681 Vec.getOperand(0), Idx);
10682 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
10683 Vec.getOperand(1), Idx);
10684
10685 DCI.AddToWorklist(Elt0.getNode());
10686 DCI.AddToWorklist(Elt1.getNode());
10687 return DAG.getNode(Opc, SL, EltVT, Elt0, Elt1, Vec->getFlags());
10688 }
10689 }
10690 }
10691
10692 unsigned VecSize = VecVT.getSizeInBits();
10693 unsigned EltSize = EltVT.getSizeInBits();
10694
10695 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
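// e.g. for a 4-element vector this builds
// select(Idx==3, Elt3, select(Idx==2, Elt2, select(Idx==1, Elt1, Elt0))).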
10696 if (shouldExpandVectorDynExt(N)) {
10697 SDLoc SL(N);
10698 SDValue Idx = N->getOperand(1);
10699 SDValue V;
10700 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
10701 SDValue IC = DAG.getVectorIdxConstant(I, SL);
10702 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
10703 if (I == 0)
10704 V = Elt;
10705 else
10706 V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
10707 }
10708 return V;
10709 }
10710
10711 if (!DCI.isBeforeLegalize())
10712 return SDValue();
10713
10714 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
10715 // elements. This exposes more load reduction opportunities by replacing
10716 // multiple small extract_vector_elements with a single 32-bit extract.
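// For example, extracting element 5 of a loaded v8i8 becomes: bitcast the
// vector to two 32-bit elements, extract element 1, shift right by 8, then
// truncate to i8.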
10717 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
10718 if (isa<MemSDNode>(Vec) &&
10719 EltSize <= 16 &&
10720 EltVT.isByteSized() &&
10721 VecSize > 32 &&
10722 VecSize % 32 == 0 &&
10723 Idx) {
10724 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
10725
10726 unsigned BitIndex = Idx->getZExtValue() * EltSize;
10727 unsigned EltIdx = BitIndex / 32;
10728 unsigned LeftoverBitIdx = BitIndex % 32;
10729 SDLoc SL(N);
10730
10731 SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
10732 DCI.AddToWorklist(Cast.getNode());
10733
10734 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
10735 DAG.getConstant(EltIdx, SL, MVT::i32));
10736 DCI.AddToWorklist(Elt.getNode());
10737 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
10738 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
10739 DCI.AddToWorklist(Srl.getNode());
10740
10741 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, EltVT.changeTypeToInteger(), Srl);
10742 DCI.AddToWorklist(Trunc.getNode());
10743 return DAG.getNode(ISD::BITCAST, SL, EltVT, Trunc);
10744 }
10745
10746 return SDValue();
10747 }
10748
10749 SDValue
10750 SITargetLowering::performInsertVectorEltCombine(SDNode *N,
10751 DAGCombinerInfo &DCI) const {
10752 SDValue Vec = N->getOperand(0);
10753 SDValue Idx = N->getOperand(2);
10754 EVT VecVT = Vec.getValueType();
10755 EVT EltVT = VecVT.getVectorElementType();
10756
10757 // INSERT_VECTOR_ELT (<n x e>, var-idx)
10758 // => BUILD_VECTOR n x select (e, const-idx)
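// Each lane I of the result becomes select(Idx == I, Ins, Vec[I]), so only
// the addressed lane receives the inserted value.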
10759 if (!shouldExpandVectorDynExt(N))
10760 return SDValue();
10761
10762 SelectionDAG &DAG = DCI.DAG;
10763 SDLoc SL(N);
10764 SDValue Ins = N->getOperand(1);
10765 EVT IdxVT = Idx.getValueType();
10766
10767 SmallVector<SDValue, 16> Ops;
10768 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
10769 SDValue IC = DAG.getConstant(I, SL, IdxVT);
10770 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
10771 SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
10772 Ops.push_back(V);
10773 }
10774
10775 return DAG.getBuildVector(VecVT, SL, Ops);
10776 }
10777
10778 unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
10779 const SDNode *N0,
10780 const SDNode *N1) const {
10781 EVT VT = N0->getValueType(0);
10782
10783 // Only do this if we are not trying to support denormals. v_mad_f32 does not
10784 // support denormals ever.
10785 if (((VT == MVT::f32 && !hasFP32Denormals(DAG.getMachineFunction())) ||
10786 (VT == MVT::f16 && !hasFP64FP16Denormals(DAG.getMachineFunction()) &&
10787 getSubtarget()->hasMadF16())) &&
10788 isOperationLegal(ISD::FMAD, VT))
10789 return ISD::FMAD;
10790
10791 const TargetOptions &Options = DAG.getTarget().Options;
10792 if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
10793 (N0->getFlags().hasAllowContract() &&
10794 N1->getFlags().hasAllowContract())) &&
10795 isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) {
10796 return ISD::FMA;
10797 }
10798
10799 return 0;
10800 }
10801
10802 // For a reassociable opcode, perform:
10803 // op x, (op y, z) -> op (op x, z), y, if x and z are uniform
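// e.g. add s0, (add v0, s1) -> add (add s0, s1), v0, keeping the uniform part
// of the computation eligible for scalar instructions.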
10804 SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
10805 SelectionDAG &DAG) const {
10806 EVT VT = N->getValueType(0);
10807 if (VT != MVT::i32 && VT != MVT::i64)
10808 return SDValue();
10809
10810 if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
10811 return SDValue();
10812
10813 unsigned Opc = N->getOpcode();
10814 SDValue Op0 = N->getOperand(0);
10815 SDValue Op1 = N->getOperand(1);
10816
10817 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
10818 return SDValue();
10819
10820 if (Op0->isDivergent())
10821 std::swap(Op0, Op1);
10822
10823 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
10824 return SDValue();
10825
10826 SDValue Op2 = Op1.getOperand(1);
10827 Op1 = Op1.getOperand(0);
10828 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
10829 return SDValue();
10830
10831 if (Op1->isDivergent())
10832 std::swap(Op1, Op2);
10833
10834 SDLoc SL(N);
10835 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
10836 return DAG.getNode(Opc, SL, VT, Add1, Op2);
10837 }
10838
10839 static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL,
10840 EVT VT,
10841 SDValue N0, SDValue N1, SDValue N2,
10842 bool Signed) {
10843 unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
10844 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
10845 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
10846 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
10847 }
10848
10849 // Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
10850 // multiplies, if any.
10851 //
10852 // Full 64-bit multiplies that feed into an addition are lowered here instead
10853 // of using the generic expansion. The generic expansion ends up with
10854 // a tree of ADD nodes that prevents us from using the "add" part of the
10855 // MAD instruction. The expansion produced here results in a chain of ADDs
10856 // instead of a tree.
10857 SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
10858 DAGCombinerInfo &DCI) const {
10859 assert(N->getOpcode() == ISD::ADD);
10860
10861 SelectionDAG &DAG = DCI.DAG;
10862 EVT VT = N->getValueType(0);
10863 SDLoc SL(N);
10864 SDValue LHS = N->getOperand(0);
10865 SDValue RHS = N->getOperand(1);
10866
10867 if (VT.isVector())
10868 return SDValue();
10869
10870 // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
10871 // result in scalar registers for uniform values.
10872 if (!N->isDivergent() && Subtarget->hasSMulHi())
10873 return SDValue();
10874
10875 unsigned NumBits = VT.getScalarSizeInBits();
10876 if (NumBits <= 32 || NumBits > 64)
10877 return SDValue();
10878
10879 if (LHS.getOpcode() != ISD::MUL) {
10880 assert(RHS.getOpcode() == ISD::MUL);
10881 std::swap(LHS, RHS);
10882 }
10883
10884 // Avoid the fold if it would unduly increase the number of multiplies due to
10885 // multiple uses, except on hardware with full-rate multiply-add (which is
10886 // part of full-rate 64-bit ops).
10887 if (!Subtarget->hasFullRate64Ops()) {
10888 unsigned NumUsers = 0;
10889 for (SDNode *Use : LHS->uses()) {
10890 // There is a use that does not feed into addition, so the multiply can't
10891 // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
10892 if (Use->getOpcode() != ISD::ADD)
10893 return SDValue();
10894
10895 // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
10896 // MUL + 3xADD + 3xADDC over 3xMAD.
10897 ++NumUsers;
10898 if (NumUsers >= 3)
10899 return SDValue();
10900 }
10901 }
10902
10903 SDValue MulLHS = LHS.getOperand(0);
10904 SDValue MulRHS = LHS.getOperand(1);
10905 SDValue AddRHS = RHS;
10906
10907 // Always check whether operands are small unsigned values, since that
10908 // knowledge is useful in more cases. Check for small signed values only if
10909 // doing so can unlock a shorter code sequence.
10910 bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
10911 bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;
10912
10913 bool MulSignedLo = false;
10914 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
10915 MulSignedLo = numBitsSigned(MulLHS, DAG) <= 32 &&
10916 numBitsSigned(MulRHS, DAG) <= 32;
10917 }
10918
10919 // The operands and final result all have the same number of bits. If
10920 // operands need to be extended, they can be extended with garbage. The
10921 // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
10922 // truncated away in the end.
10923 if (VT != MVT::i64) {
10924 MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
10925 MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
10926 AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
10927 }
10928
10929 // The basic code generated is conceptually straightforward. Pseudo code:
10930 //
10931 // accum = mad_64_32 lhs.lo, rhs.lo, accum
10932 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
10933 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
10934 //
10935 // The second and third lines are optional, depending on whether the factors
10936 // are {sign,zero}-extended or not.
10937 //
10938 // The actual DAG is noisier than the pseudo code, but only due to
10939 // instructions that disassemble values into low and high parts, and
10940 // assemble the final result.
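// For example, when both factors are known to fit in 32 unsigned bits, only
// the first line is needed and the whole pattern lowers to a single
// mad_u64_u32.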
10941 SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
10942 SDValue One = DAG.getConstant(1, SL, MVT::i32);
10943
10944 auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
10945 auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
10946 SDValue Accum =
10947 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
10948
10949 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
10950 auto AccumLo = DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, Accum, Zero);
10951 auto AccumHi = DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, Accum, One);
10952
10953 if (!MulLHSUnsigned32) {
10954 auto MulLHSHi =
10955 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
10956 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
10957 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
10958 }
10959
10960 if (!MulRHSUnsigned32) {
10961 auto MulRHSHi =
10962 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
10963 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
10964 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
10965 }
10966
10967 Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
10968 Accum = DAG.getBitcast(MVT::i64, Accum);
10969 }
10970
10971 if (VT != MVT::i64)
10972 Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
10973 return Accum;
10974 }
10975
10976 SDValue SITargetLowering::performAddCombine(SDNode *N,
10977 DAGCombinerInfo &DCI) const {
10978 SelectionDAG &DAG = DCI.DAG;
10979 EVT VT = N->getValueType(0);
10980 SDLoc SL(N);
10981 SDValue LHS = N->getOperand(0);
10982 SDValue RHS = N->getOperand(1);
10983
10984 if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
10985 if (Subtarget->hasMad64_32()) {
10986 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
10987 return Folded;
10988 }
10989
10990 return SDValue();
10991 }
10992
10993 if (SDValue V = reassociateScalarOps(N, DAG)) {
10994 return V;
10995 }
10996
10997 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
10998 return SDValue();
10999
11000 // add x, zext (setcc) => addcarry x, 0, setcc
11001 // add x, sext (setcc) => subcarry x, 0, setcc
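// This works because zext(cc) is +cc and sext(cc) is -cc for an i1 cc, so the
// setcc can be fed straight into the carry input: addcarry x, 0, cc computes
// x + 0 + cc, and subcarry x, 0, cc computes x - 0 - cc.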
11002 unsigned Opc = LHS.getOpcode();
11003 if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
11004 Opc == ISD::ANY_EXTEND || Opc == ISD::ADDCARRY)
11005 std::swap(RHS, LHS);
11006
11007 Opc = RHS.getOpcode();
11008 switch (Opc) {
11009 default: break;
11010 case ISD::ZERO_EXTEND:
11011 case ISD::SIGN_EXTEND:
11012 case ISD::ANY_EXTEND: {
11013 auto Cond = RHS.getOperand(0);
11014 // If this won't be a real VOPC output, we would still need to insert an
11015 // extra instruction anyway.
11016 if (!isBoolSGPR(Cond))
11017 break;
11018 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
11019 SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
11020 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::SUBCARRY : ISD::ADDCARRY;
11021 return DAG.getNode(Opc, SL, VTList, Args);
11022 }
11023 case ISD::ADDCARRY: {
11024 // add x, (addcarry y, 0, cc) => addcarry x, y, cc
11025 auto C = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
11026 if (!C || C->getZExtValue() != 0) break;
11027 SDValue Args[] = { LHS, RHS.getOperand(0), RHS.getOperand(2) };
11028 return DAG.getNode(ISD::ADDCARRY, SDLoc(N), RHS->getVTList(), Args);
11029 }
11030 }
11031 return SDValue();
11032 }
11033
11034 SDValue SITargetLowering::performSubCombine(SDNode *N,
11035 DAGCombinerInfo &DCI) const {
11036 SelectionDAG &DAG = DCI.DAG;
11037 EVT VT = N->getValueType(0);
11038
11039 if (VT != MVT::i32)
11040 return SDValue();
11041
11042 SDLoc SL(N);
11043 SDValue LHS = N->getOperand(0);
11044 SDValue RHS = N->getOperand(1);
11045
11046 // sub x, zext (setcc) => subcarry x, 0, setcc
11047 // sub x, sext (setcc) => addcarry x, 0, setcc
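// As in the add combine above, sext(cc) is -cc and zext(cc) is +cc, so the
// borrow/carry input of subcarry/addcarry absorbs the setcc directly.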
11048 unsigned Opc = RHS.getOpcode();
11049 switch (Opc) {
11050 default: break;
11051 case ISD::ZERO_EXTEND:
11052 case ISD::SIGN_EXTEND:
11053 case ISD::ANY_EXTEND: {
11054 auto Cond = RHS.getOperand(0);
11055 // If this won't be a real VOPC output, we would still need to insert an
11056 // extra instruction anyway.
11057 if (!isBoolSGPR(Cond))
11058 break;
11059 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
11060 SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
11061 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::ADDCARRY : ISD::SUBCARRY;
11062 return DAG.getNode(Opc, SL, VTList, Args);
11063 }
11064 }
11065
11066 if (LHS.getOpcode() == ISD::SUBCARRY) {
11067 // sub (subcarry x, 0, cc), y => subcarry x, y, cc
11068 auto C = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
11069 if (!C || !C->isZero())
11070 return SDValue();
11071 SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) };
11072 return DAG.getNode(ISD::SUBCARRY, SDLoc(N), LHS->getVTList(), Args);
11073 }
11074 return SDValue();
11075 }
11076
11077 SDValue SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
11078 DAGCombinerInfo &DCI) const {
11079
11080 if (N->getValueType(0) != MVT::i32)
11081 return SDValue();
11082
11083 auto C = dyn_cast<ConstantSDNode>(N->getOperand(1));
11084 if (!C || C->getZExtValue() != 0)
11085 return SDValue();
11086
11087 SelectionDAG &DAG = DCI.DAG;
11088 SDValue LHS = N->getOperand(0);
11089
11090 // addcarry (add x, y), 0, cc => addcarry x, y, cc
11091 // subcarry (sub x, y), 0, cc => subcarry x, y, cc
11092 unsigned LHSOpc = LHS.getOpcode();
11093 unsigned Opc = N->getOpcode();
11094 if ((LHSOpc == ISD::ADD && Opc == ISD::ADDCARRY) ||
11095 (LHSOpc == ISD::SUB && Opc == ISD::SUBCARRY)) {
11096 SDValue Args[] = { LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2) };
11097 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
11098 }
11099 return SDValue();
11100 }
11101
11102 SDValue SITargetLowering::performFAddCombine(SDNode *N,
11103 DAGCombinerInfo &DCI) const {
11104 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
11105 return SDValue();
11106
11107 SelectionDAG &DAG = DCI.DAG;
11108 EVT VT = N->getValueType(0);
11109
11110 SDLoc SL(N);
11111 SDValue LHS = N->getOperand(0);
11112 SDValue RHS = N->getOperand(1);
11113
11114 // These should really be instruction patterns, but writing patterns with
11115 // source modifiers is a pain.
11116
11117 // fadd (fadd (a, a), b) -> mad 2.0, a, b
11118 if (LHS.getOpcode() == ISD::FADD) {
11119 SDValue A = LHS.getOperand(0);
11120 if (A == LHS.getOperand(1)) {
11121 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
11122 if (FusedOp != 0) {
11123 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
11124 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
11125 }
11126 }
11127 }
11128
11129 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
11130 if (RHS.getOpcode() == ISD::FADD) {
11131 SDValue A = RHS.getOperand(0);
11132 if (A == RHS.getOperand(1)) {
11133 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
11134 if (FusedOp != 0) {
11135 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
11136 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
11137 }
11138 }
11139 }
11140
11141 return SDValue();
11142 }
11143
11144 SDValue SITargetLowering::performFSubCombine(SDNode *N,
11145 DAGCombinerInfo &DCI) const {
11146 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
11147 return SDValue();
11148
11149 SelectionDAG &DAG = DCI.DAG;
11150 SDLoc SL(N);
11151 EVT VT = N->getValueType(0);
11152 assert(!VT.isVector());
11153
11154 // Try to get the fneg to fold into the source modifier. This undoes generic
11155 // DAG combines and folds them into the mad.
11156 //
11157 // Only do this if we are not trying to support denormals. v_mad_f32 does
11158 // not support denormals ever.
11159 SDValue LHS = N->getOperand(0);
11160 SDValue RHS = N->getOperand(1);
11161 if (LHS.getOpcode() == ISD::FADD) {
11162 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
11163 SDValue A = LHS.getOperand(0);
11164 if (A == LHS.getOperand(1)) {
11165 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
11166 if (FusedOp != 0){
11167 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
11168 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
11169
11170 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
11171 }
11172 }
11173 }
11174
11175 if (RHS.getOpcode() == ISD::FADD) {
11176 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
11177
11178 SDValue A = RHS.getOperand(0);
11179 if (A == RHS.getOperand(1)) {
11180 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
11181 if (FusedOp != 0){
11182 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
11183 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
11184 }
11185 }
11186 }
11187
11188 return SDValue();
11189 }
11190
11191 SDValue SITargetLowering::performFMACombine(SDNode *N,
11192 DAGCombinerInfo &DCI) const {
11193 SelectionDAG &DAG = DCI.DAG;
11194 EVT VT = N->getValueType(0);
11195 SDLoc SL(N);
11196
11197 if (!Subtarget->hasDot7Insts() || VT != MVT::f32)
11198 return SDValue();
11199
11200 // FMA((F32)S0.x, (F32)S1.x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
11201 //   FDOT2((V2F16)S0, (V2F16)S1, (F32)z)
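// In other words, the combined expression is S0.x*S1.x + S0.y*S1.y + z, which
// is the two-element dot product that fdot2 computes directly.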
11202 SDValue Op1 = N->getOperand(0);
11203 SDValue Op2 = N->getOperand(1);
11204 SDValue FMA = N->getOperand(2);
11205
11206 if (FMA.getOpcode() != ISD::FMA ||
11207 Op1.getOpcode() != ISD::FP_EXTEND ||
11208 Op2.getOpcode() != ISD::FP_EXTEND)
11209 return SDValue();
11210
11211 // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
11212 // regardless of the denorm mode setting. Therefore,
11213 // unsafe-fp-math/fp-contract is sufficient to allow generating fdot2.
11214 const TargetOptions &Options = DAG.getTarget().Options;
11215 if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
11216 (N->getFlags().hasAllowContract() &&
11217 FMA->getFlags().hasAllowContract())) {
11218 Op1 = Op1.getOperand(0);
11219 Op2 = Op2.getOperand(0);
11220 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
11221 Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
11222 return SDValue();
11223
11224 SDValue Vec1 = Op1.getOperand(0);
11225 SDValue Idx1 = Op1.getOperand(1);
11226 SDValue Vec2 = Op2.getOperand(0);
11227
11228 SDValue FMAOp1 = FMA.getOperand(0);
11229 SDValue FMAOp2 = FMA.getOperand(1);
11230 SDValue FMAAcc = FMA.getOperand(2);
11231
11232 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
11233 FMAOp2.getOpcode() != ISD::FP_EXTEND)
11234 return SDValue();
11235
11236 FMAOp1 = FMAOp1.getOperand(0);
11237 FMAOp2 = FMAOp2.getOperand(0);
11238 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
11239 FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
11240 return SDValue();
11241
11242 SDValue Vec3 = FMAOp1.getOperand(0);
11243 SDValue Vec4 = FMAOp2.getOperand(0);
11244 SDValue Idx2 = FMAOp1.getOperand(1);
11245
11246 if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
11247 // Idx1 and Idx2 cannot be the same.
11248 Idx1 == Idx2)
11249 return SDValue();
11250
11251 if (Vec1 == Vec2 || Vec3 == Vec4)
11252 return SDValue();
11253
11254 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
11255 return SDValue();
11256
11257 if ((Vec1 == Vec3 && Vec2 == Vec4) ||
11258 (Vec1 == Vec4 && Vec2 == Vec3)) {
11259 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
11260 DAG.getTargetConstant(0, SL, MVT::i1));
11261 }
11262 }
11263 return SDValue();
11264 }
11265
11266 SDValue SITargetLowering::performSetCCCombine(SDNode *N,
11267 DAGCombinerInfo &DCI) const {
11268 SelectionDAG &DAG = DCI.DAG;
11269 SDLoc SL(N);
11270
11271 SDValue LHS = N->getOperand(0);
11272 SDValue RHS = N->getOperand(1);
11273 EVT VT = LHS.getValueType();
11274 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
11275
11276 auto CRHS = dyn_cast<ConstantSDNode>(RHS);
11277 if (!CRHS) {
11278 CRHS = dyn_cast<ConstantSDNode>(LHS);
11279 if (CRHS) {
11280 std::swap(LHS, RHS);
11281 CC = getSetCCSwappedOperands(CC);
11282 }
11283 }
11284
11285 if (CRHS) {
11286 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
11287 isBoolSGPR(LHS.getOperand(0))) {
11288 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
11289 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
11290 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
11291 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
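// For example, sext of an i1 cc is -1 when cc is true and 0 otherwise, so
// (setcc (sext cc), -1, ne) is true exactly when cc is false, i.e. xor cc, -1.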
11292 if ((CRHS->isAllOnes() &&
11293 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
11294 (CRHS->isZero() &&
11295 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
11296 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
11297 DAG.getConstant(-1, SL, MVT::i1));
11298 if ((CRHS->isAllOnes() &&
11299 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
11300 (CRHS->isZero() &&
11301 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
11302 return LHS.getOperand(0);
11303 }
11304
11305 const APInt &CRHSVal = CRHS->getAPIntValue();
11306 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
11307 LHS.getOpcode() == ISD::SELECT &&
11308 isa<ConstantSDNode>(LHS.getOperand(1)) &&
11309 isa<ConstantSDNode>(LHS.getOperand(2)) &&
11310 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
11311 isBoolSGPR(LHS.getOperand(0))) {
11312 // Given CT != FT:
11313 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
11314 // setcc (select cc, CT, CF), CF, ne => cc
11315 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
11316 // setcc (select cc, CT, CF), CT, eq => cc
11317 const APInt &CT = LHS.getConstantOperandAPInt(1);
11318 const APInt &CF = LHS.getConstantOperandAPInt(2);
11319
11320 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
11321 (CT == CRHSVal && CC == ISD::SETNE))
11322 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
11323 DAG.getConstant(-1, SL, MVT::i1));
11324 if ((CF == CRHSVal && CC == ISD::SETNE) ||
11325 (CT == CRHSVal && CC == ISD::SETEQ))
11326 return LHS.getOperand(0);
11327 }
11328 }
11329
11330 if (VT != MVT::f32 && VT != MVT::f64 && (Subtarget->has16BitInsts() &&
11331 VT != MVT::f16))
11332 return SDValue();
11333
11334 // Match isinf/isfinite pattern
11335 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
11336 // (fcmp one (fabs x), inf) -> (fp_class x,
11337 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
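// e.g. (fcmp oeq (fabs x), +inf) asks whether |x| is infinite, which the
// FP_CLASS node answers in one test using the infinity mask built below.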
11338 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) && LHS.getOpcode() == ISD::FABS) {
11339 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
11340 if (!CRHS)
11341 return SDValue();
11342
11343 const APFloat &APF = CRHS->getValueAPF();
11344 if (APF.isInfinity() && !APF.isNegative()) {
11345 const unsigned IsInfMask = SIInstrFlags::P_INFINITY |
11346 SIInstrFlags::N_INFINITY;
11347 const unsigned IsFiniteMask = SIInstrFlags::N_ZERO |
11348 SIInstrFlags::P_ZERO |
11349 SIInstrFlags::N_NORMAL |
11350 SIInstrFlags::P_NORMAL |
11351 SIInstrFlags::N_SUBNORMAL |
11352 SIInstrFlags::P_SUBNORMAL;
11353 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
11354 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
11355 DAG.getConstant(Mask, SL, MVT::i32));
11356 }
11357 }
11358
11359 return SDValue();
11360 }
11361
11362 SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
11363 DAGCombinerInfo &DCI) const {
11364 SelectionDAG &DAG = DCI.DAG;
11365 SDLoc SL(N);
11366 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
11367
11368 SDValue Src = N->getOperand(0);
11369 SDValue Shift = N->getOperand(0);
11370
11371 // TODO: Extend type shouldn't matter (assuming legal types).
11372 if (Shift.getOpcode() == ISD::ZERO_EXTEND)
11373 Shift = Shift.getOperand(0);
11374
11375 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
11376 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
11377 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
11378 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
11379 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
11380 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
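// e.g. for cvt_f32_ubyte1 (srl x, 8): Offset = 1, so ShiftOffset becomes
// 8 + 8 = 16 and the node is rewritten as cvt_f32_ubyte2 x.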
11381 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
11382 SDValue Shifted = DAG.getZExtOrTrunc(Shift.getOperand(0),
11383 SDLoc(Shift.getOperand(0)), MVT::i32);
11384
11385 unsigned ShiftOffset = 8 * Offset;
11386 if (Shift.getOpcode() == ISD::SHL)
11387 ShiftOffset -= C->getZExtValue();
11388 else
11389 ShiftOffset += C->getZExtValue();
11390
11391 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
11392 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
11393 MVT::f32, Shifted);
11394 }
11395 }
11396 }
11397
11398 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
11399 APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
11400 if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
11401 // We simplified Src. If this node is not dead, visit it again so it is
11402 // folded properly.
11403 if (N->getOpcode() != ISD::DELETED_NODE)
11404 DCI.AddToWorklist(N);
11405 return SDValue(N, 0);
11406 }
11407
11408 // Handle (or x, (srl y, 8)) pattern when known bits are zero.
11409 if (SDValue DemandedSrc =
11410 TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
11411 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
11412
11413 return SDValue();
11414 }
11415
11416 SDValue SITargetLowering::performClampCombine(SDNode *N,
11417 DAGCombinerInfo &DCI) const {
11418 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
11419 if (!CSrc)
11420 return SDValue();
11421
11422 const MachineFunction &MF = DCI.DAG.getMachineFunction();
11423 const APFloat &F = CSrc->getValueAPF();
11424 APFloat Zero = APFloat::getZero(F.getSemantics());
11425 if (F < Zero ||
11426 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
11427 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
11428 }
11429
11430 APFloat One(F.getSemantics(), "1.0");
11431 if (F > One)
11432 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
11433
11434 return SDValue(CSrc, 0);
11435 }
11436
11437
11438 SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
11439 DAGCombinerInfo &DCI) const {
11440 if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
11441 return SDValue();
11442 switch (N->getOpcode()) {
11443 case ISD::ADD:
11444 return performAddCombine(N, DCI);
11445 case ISD::SUB:
11446 return performSubCombine(N, DCI);
11447 case ISD::ADDCARRY:
11448 case ISD::SUBCARRY:
11449 return performAddCarrySubCarryCombine(N, DCI);
11450 case ISD::FADD:
11451 return performFAddCombine(N, DCI);
11452 case ISD::FSUB:
11453 return performFSubCombine(N, DCI);
11454 case ISD::SETCC:
11455 return performSetCCCombine(N, DCI);
11456 case ISD::FMAXNUM:
11457 case ISD::FMINNUM:
11458 case ISD::FMAXNUM_IEEE:
11459 case ISD::FMINNUM_IEEE:
11460 case ISD::SMAX:
11461 case ISD::SMIN:
11462 case ISD::UMAX:
11463 case ISD::UMIN:
11464 case AMDGPUISD::FMIN_LEGACY:
11465 case AMDGPUISD::FMAX_LEGACY:
11466 return performMinMaxCombine(N, DCI);
11467 case ISD::FMA:
11468 return performFMACombine(N, DCI);
11469 case ISD::AND:
11470 return performAndCombine(N, DCI);
11471 case ISD::OR:
11472 return performOrCombine(N, DCI);
11473 case ISD::XOR:
11474 return performXorCombine(N, DCI);
11475 case ISD::ZERO_EXTEND:
11476 return performZeroExtendCombine(N, DCI);
11477 case ISD::SIGN_EXTEND_INREG:
11478 return performSignExtendInRegCombine(N , DCI);
11479 case AMDGPUISD::FP_CLASS:
11480 return performClassCombine(N, DCI);
11481 case ISD::FCANONICALIZE:
11482 return performFCanonicalizeCombine(N, DCI);
11483 case AMDGPUISD::RCP:
11484 return performRcpCombine(N, DCI);
11485 case AMDGPUISD::FRACT:
11486 case AMDGPUISD::RSQ:
11487 case AMDGPUISD::RCP_LEGACY:
11488 case AMDGPUISD::RCP_IFLAG:
11489 case AMDGPUISD::RSQ_CLAMP:
11490 case AMDGPUISD::LDEXP: {
11491 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
11492 SDValue Src = N->getOperand(0);
11493 if (Src.isUndef())
11494 return Src;
11495 break;
11496 }
11497 case ISD::SINT_TO_FP:
11498 case ISD::UINT_TO_FP:
11499 return performUCharToFloatCombine(N, DCI);
11500 case AMDGPUISD::CVT_F32_UBYTE0:
11501 case AMDGPUISD::CVT_F32_UBYTE1:
11502 case AMDGPUISD::CVT_F32_UBYTE2:
11503 case AMDGPUISD::CVT_F32_UBYTE3:
11504 return performCvtF32UByteNCombine(N, DCI);
11505 case AMDGPUISD::FMED3:
11506 return performFMed3Combine(N, DCI);
11507 case AMDGPUISD::CVT_PKRTZ_F16_F32:
11508 return performCvtPkRTZCombine(N, DCI);
11509 case AMDGPUISD::CLAMP:
11510 return performClampCombine(N, DCI);
11511 case ISD::SCALAR_TO_VECTOR: {
11512 SelectionDAG &DAG = DCI.DAG;
11513 EVT VT = N->getValueType(0);
11514
11515 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
11516 if (VT == MVT::v2i16 || VT == MVT::v2f16) {
11517 SDLoc SL(N);
11518 SDValue Src = N->getOperand(0);
11519 EVT EltVT = Src.getValueType();
11520 if (EltVT == MVT::f16)
11521 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
11522
11523 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
11524 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
11525 }
11526
11527 break;
11528 }
11529 case ISD::EXTRACT_VECTOR_ELT:
11530 return performExtractVectorEltCombine(N, DCI);
11531 case ISD::INSERT_VECTOR_ELT:
11532 return performInsertVectorEltCombine(N, DCI);
11533 case ISD::LOAD: {
11534 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
11535 return Widened;
11536 LLVM_FALLTHROUGH;
11537 }
11538 default: {
11539 if (!DCI.isBeforeLegalize()) {
11540 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
11541 return performMemSDNodeCombine(MemNode, DCI);
11542 }
11543
11544 break;
11545 }
11546 }
11547
11548 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
11549 }
11550
11551 /// Helper function for adjustWritemask
11552 static unsigned SubIdx2Lane(unsigned Idx) {
11553 switch (Idx) {
11554 default: return ~0u;
11555 case AMDGPU::sub0: return 0;
11556 case AMDGPU::sub1: return 1;
11557 case AMDGPU::sub2: return 2;
11558 case AMDGPU::sub3: return 3;
11559 case AMDGPU::sub4: return 4; // Possible with TFE/LWE
11560 }
11561 }
11562
11563 /// Adjust the writemask of MIMG instructions
11564 SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
11565 SelectionDAG &DAG) const {
11566 unsigned Opcode = Node->getMachineOpcode();
11567
11568 // Subtract 1 because the vdata output is not a MachineSDNode operand.
11569 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
11570 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
11571 return Node; // not implemented for D16
11572
11573 SDNode *Users[5] = { nullptr };
11574 unsigned Lane = 0;
11575 unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
11576 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
11577 unsigned NewDmask = 0;
11578 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
11579 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
11580 // TFE/LWE add an extra status result that needs its own result lane.
11581 bool UsesTFC =
11582 (int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
11583 Node->getConstantOperandVal(LWEIdx);
11584 unsigned TFCLane = 0;
11585 bool HasChain = Node->getNumValues() > 1;
11586
11587 if (OldDmask == 0) {
11588 // These are folded out, but on the chance it happens don't assert.
11589 return Node;
11590 }
11591
11592 unsigned OldBitsSet = countPopulation(OldDmask);
11593 // Work out which is the TFE/LWE lane if that is enabled.
11594 if (UsesTFC) {
11595 TFCLane = OldBitsSet;
11596 }
11597
11598 // Try to figure out the used register components
11599 for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
11600 I != E; ++I) {
11601
11602 // Don't look at users of the chain.
11603 if (I.getUse().getResNo() != 0)
11604 continue;
11605
11606 // Abort if we can't understand the usage
11607 if (!I->isMachineOpcode() ||
11608 I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
11609 return Node;
11610
11611 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
11612 // Note that subregs are packed, i.e. Lane==0 is the first bit set
11613 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
11614 // set, etc.
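// For example, with OldDmask = 0b1010 (Y and W requested), Lane 0 maps to the
// Y component and Lane 1 to the W component.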
11615 Lane = SubIdx2Lane(I->getConstantOperandVal(1));
11616 if (Lane == ~0u)
11617 return Node;
11618
11619 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
11620 if (UsesTFC && Lane == TFCLane) {
11621 Users[Lane] = *I;
11622 } else {
11623 // Set which texture component corresponds to the lane.
11624 unsigned Comp;
11625 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
11626 Comp = countTrailingZeros(Dmask);
11627 Dmask &= ~(1 << Comp);
11628 }
11629
11630 // Abort if we have more than one user per component.
11631 if (Users[Lane])
11632 return Node;
11633
11634 Users[Lane] = *I;
11635 NewDmask |= 1 << Comp;
11636 }
11637 }
11638
11639 // Don't allow 0 dmask, as hardware assumes one channel enabled.
11640 bool NoChannels = !NewDmask;
11641 if (NoChannels) {
11642 if (!UsesTFC) {
11643 // No uses of the result and not using TFC. Then do nothing.
11644 return Node;
11645 }
11646 // If the original dmask has one channel, then there is nothing to do.
11647 if (OldBitsSet == 1)
11648 return Node;
11649 // Use an arbitrary dmask - required for the instruction to work
11650 NewDmask = 1;
11651 }
11652 // Abort if there's no change
11653 if (NewDmask == OldDmask)
11654 return Node;
11655
11656 unsigned BitsSet = countPopulation(NewDmask);
11657
11658 // Check for TFE or LWE - increase the number of channels by one to account
11659 // for the extra return value
11660 // This will need adjustment for D16 if this is also included in
11661 // adjustWriteMask (this function) but at present D16 are excluded.
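// e.g. two used channels plus TFE gives NewChannels = 3, which the result
// type computation below rounds up to a four-element vector.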
11662 unsigned NewChannels = BitsSet + UsesTFC;
11663
11664 int NewOpcode =
11665 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
11666 assert(NewOpcode != -1 &&
11667 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
11668 "failed to find equivalent MIMG op");
11669
11670 // Adjust the writemask in the node
11671 SmallVector<SDValue, 12> Ops;
11672 Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
11673 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
11674 Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
11675
11676 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
11677
11678 MVT ResultVT = NewChannels == 1 ?
11679 SVT : MVT::getVectorVT(SVT, NewChannels == 3 ? 4 :
11680 NewChannels == 5 ? 8 : NewChannels);
11681 SDVTList NewVTList = HasChain ?
11682 DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
11683
11684
11685 MachineSDNode *NewNode = DAG.getMachineNode(NewOpcode, SDLoc(Node),
11686 NewVTList, Ops);
11687
11688 if (HasChain) {
11689 // Update chain.
11690 DAG.setNodeMemRefs(NewNode, Node->memoperands());
11691 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
11692 }
11693
11694 if (NewChannels == 1) {
11695 assert(Node->hasNUsesOfValue(1, 0));
11696 SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY,
11697 SDLoc(Node), Users[Lane]->getValueType(0),
11698 SDValue(NewNode, 0));
11699 DAG.ReplaceAllUsesWith(Users[Lane], Copy);
11700 return nullptr;
11701 }
11702
11703 // Update the users of the node with the new indices
11704 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
11705 SDNode *User = Users[i];
11706 if (!User) {
11707 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
11708 // Users[0] is still nullptr because channel 0 doesn't really have a use.
11709 if (i || !NoChannels)
11710 continue;
11711 } else {
11712 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
11713 DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
11714 }
11715
11716 switch (Idx) {
11717 default: break;
11718 case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
11719 case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
11720 case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
11721 case AMDGPU::sub3: Idx = AMDGPU::sub4; break;
11722 }
11723 }
11724
11725 DAG.RemoveDeadNode(Node);
11726 return nullptr;
11727 }
11728
11729 static bool isFrameIndexOp(SDValue Op) {
11730 if (Op.getOpcode() == ISD::AssertZext)
11731 Op = Op.getOperand(0);
11732
11733 return isa<FrameIndexSDNode>(Op);
11734 }
11735
11736 /// Legalize target independent instructions (e.g. INSERT_SUBREG)
11737 /// with frame index operands.
11738 /// LLVM assumes that inputs to these instructions are registers.
11739 SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
11740 SelectionDAG &DAG) const {
11741 if (Node->getOpcode() == ISD::CopyToReg) {
11742 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
11743 SDValue SrcVal = Node->getOperand(2);
11744
11745 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
11746 // to try understanding copies to physical registers.
11747 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
11748 SDLoc SL(Node);
11749 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
11750 SDValue VReg = DAG.getRegister(
11751 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
11752
11753 SDNode *Glued = Node->getGluedNode();
11754 SDValue ToVReg
11755 = DAG.getCopyToReg(Node->getOperand(0), SL, VReg, SrcVal,
11756 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
11757 SDValue ToResultReg
11758 = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
11759 VReg, ToVReg.getValue(1));
11760 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
11761 DAG.RemoveDeadNode(Node);
11762 return ToResultReg.getNode();
11763 }
11764 }
11765
11766 SmallVector<SDValue, 8> Ops;
11767 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
11768 if (!isFrameIndexOp(Node->getOperand(i))) {
11769 Ops.push_back(Node->getOperand(i));
11770 continue;
11771 }
11772
11773 SDLoc DL(Node);
11774 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
11775 Node->getOperand(i).getValueType(),
11776 Node->getOperand(i)), 0));
11777 }
11778
11779 return DAG.UpdateNodeOperands(Node, Ops);
11780 }
11781
11782 /// Fold the instructions after selecting them.
11783 /// Returns null if users were already updated.
11784 SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
11785 SelectionDAG &DAG) const {
11786 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11787 unsigned Opcode = Node->getMachineOpcode();
11788
11789 if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() &&
11790 !TII->isGather4(Opcode) &&
11791 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) != -1) {
11792 return adjustWritemask(Node, DAG);
11793 }
11794
11795 if (Opcode == AMDGPU::INSERT_SUBREG ||
11796 Opcode == AMDGPU::REG_SEQUENCE) {
11797 legalizeTargetIndependentNode(Node, DAG);
11798 return Node;
11799 }
11800
11801 switch (Opcode) {
11802 case AMDGPU::V_DIV_SCALE_F32_e64:
11803 case AMDGPU::V_DIV_SCALE_F64_e64: {
11804 // Satisfy the operand register constraint when one of the inputs is
11805 // undefined. Ordinarily each undef value will have its own implicit_def of
11806 // a vreg, so force these to use a single register.
11807 SDValue Src0 = Node->getOperand(1);
11808 SDValue Src1 = Node->getOperand(3);
11809 SDValue Src2 = Node->getOperand(5);
11810
11811 if ((Src0.isMachineOpcode() &&
11812 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
11813 (Src0 == Src1 || Src0 == Src2))
11814 break;
11815
11816 MVT VT = Src0.getValueType().getSimpleVT();
11817 const TargetRegisterClass *RC =
11818 getRegClassFor(VT, Src0.getNode()->isDivergent());
11819
11820 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
11821 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
11822
11823 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node),
11824 UndefReg, Src0, SDValue());
11825
11826 // src0 must be the same register as src1 or src2, even if the value is
11827 // undefined, so make sure we don't violate this constraint.
11828 if (Src0.isMachineOpcode() &&
11829 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
11830 if (Src1.isMachineOpcode() &&
11831 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
11832 Src0 = Src1;
11833 else if (Src2.isMachineOpcode() &&
11834 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
11835 Src0 = Src2;
11836 else {
11837 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
11838 Src0 = UndefReg;
11839 Src1 = UndefReg;
11840 }
11841 } else
11842 break;
11843
11844 SmallVector<SDValue, 9> Ops(Node->op_begin(), Node->op_end());
11845 Ops[1] = Src0;
11846 Ops[3] = Src1;
11847 Ops[5] = Src2;
11848 Ops.push_back(ImpDef.getValue(1));
11849 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
11850 }
11851 default:
11852 break;
11853 }
11854
11855 return Node;
11856 }
11857
11858 // Any MIMG instructions that use tfe or lwe require an initialization of the
11859 // result register that will be written in the case of a memory access failure.
11860 // The required code is also added to tie this init code to the result of the
11861 // img instruction.
11862 void SITargetLowering::AddIMGInit(MachineInstr &MI) const {
11863 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11864 const SIRegisterInfo &TRI = TII->getRegisterInfo();
11865 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
11866 MachineBasicBlock &MBB = *MI.getParent();
11867
11868 MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
11869 MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
11870 MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
11871
11872 if (!TFE && !LWE) // intersect_ray
11873 return;
11874
11875 unsigned TFEVal = TFE ? TFE->getImm() : 0;
11876 unsigned LWEVal = LWE ? LWE->getImm() : 0;
11877 unsigned D16Val = D16 ? D16->getImm() : 0;
11878
11879 if (!TFEVal && !LWEVal)
11880 return;
11881
11882 // At least one of TFE or LWE is non-zero.
11883 // We have to insert a suitable initialization of the result value and
11884 // tie this to the dest of the image instruction.
11885
11886 const DebugLoc &DL = MI.getDebugLoc();
11887
11888 int DstIdx =
11889 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
11890
11891 // Calculate which result dwords we have to initialize to 0.
11892 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
11893
11894 // Check that the dmask operand is found.
11895 assert(MO_Dmask && "Expected dmask operand in instruction");
11896
11897 unsigned dmask = MO_Dmask->getImm();
11898 // Determine the number of active lanes, taking into account the
11899 // Gather4 special case.
11900 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : countPopulation(dmask);
11901
11902 bool Packed = !Subtarget->hasUnpackedD16VMem();
11903
11904 unsigned InitIdx =
11905 D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
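  // Worked example (illustrative values, not taken from this file): for a
  // non-gather4 load with dmask = 0xB, ActiveLanes == countPopulation(0xB) == 3,
  // so InitIdx == 4 without packed D16, or ((3 + 1) >> 1) + 1 == 3 with it; the
  // extra slot is the TFE/LWE error dword appended after the data.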
11906
11907 // Abandon the attempt if the dst size isn't large enough
11908 // - this is in fact an error, but it is caught elsewhere and
11909 // reported correctly.
11910 uint32_t DstSize = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
11911 if (DstSize < InitIdx)
11912 return;
11913
11914 // Create a register for the initialization value.
11915 Register PrevDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
11916 unsigned NewDst = 0; // Final initialized value will be in here
11917
11918 // If the PRTStrictNull feature is enabled (the default), initialize
11919 // all the result registers to 0; otherwise just the error indication
11920 // register (VGPRn+1).
11921 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
11922 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
11923
11924 BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
11925 for (; SizeLeft; SizeLeft--, CurrIdx++) {
11926 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
11927 // Initialize dword
11928 Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
11929 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
11930 .addImm(0);
11931 // Insert into the super-reg
11932 BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
11933 .addReg(PrevDst)
11934 .addReg(SubReg)
11935 .addImm(SIRegisterInfo::getSubRegFromChannel(CurrIdx));
11936
11937 PrevDst = NewDst;
11938 }
11939
11940 // Add as an implicit operand
11941 MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));
11942
11943 // Tie the just added implicit operand to the dst
11944 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
11945 }
11946
11947 /// Assign the register class depending on the number of
11948 /// bits set in the writemask
11949 void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
11950 SDNode *Node) const {
11951 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11952
11953 MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
11954
11955 if (TII->isVOP3(MI.getOpcode())) {
11956 // Make sure constant bus requirements are respected.
11957 TII->legalizeOperandsVOP3(MRI, MI);
11958
11959 // Prefer VGPRs over AGPRs in mAI instructions where possible.
11960 // This saves a chain-copy of registers and better balances register
11961 // use between vgpr and agpr, as agpr tuples tend to be big.
11962 if (MI.getDesc().OpInfo) {
11963 unsigned Opc = MI.getOpcode();
11964 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
11965 for (auto I : { AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
11966 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) }) {
11967 if (I == -1)
11968 break;
11969 MachineOperand &Op = MI.getOperand(I);
11970 if (!Op.isReg() || !Op.getReg().isVirtual())
11971 continue;
11972 auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
11973 if (!TRI->hasAGPRs(RC))
11974 continue;
11975 auto *Src = MRI.getUniqueVRegDef(Op.getReg());
11976 if (!Src || !Src->isCopy() ||
11977 !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
11978 continue;
11979 auto *NewRC = TRI->getEquivalentVGPRClass(RC);
11980 // All uses of agpr64 and agpr32 can also accept vgpr except for
11981 // v_accvgpr_read, but we do not produce agpr reads during selection,
11982 // so no use checks are needed.
11983 MRI.setRegClass(Op.getReg(), NewRC);
11984 }
11985
11986 // Resolve the rest of AV operands to AGPRs.
11987 if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
11988 if (Src2->isReg() && Src2->getReg().isVirtual()) {
11989 auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
11990 if (TRI->isVectorSuperClass(RC)) {
11991 auto *NewRC = TRI->getEquivalentAGPRClass(RC);
11992 MRI.setRegClass(Src2->getReg(), NewRC);
11993 if (Src2->isTied())
11994 MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
11995 }
11996 }
11997 }
11998 }
11999
12000 return;
12001 }
12002
12003 if (TII->isMIMG(MI)) {
12004 if (!MI.mayStore())
12005 AddIMGInit(MI);
12006 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
12007 }
12008 }
12009
12010 static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
12011 uint64_t Val) {
12012 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
12013 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
12014 }
12015
12016 MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
12017 const SDLoc &DL,
12018 SDValue Ptr) const {
12019 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
12020
12021 // Build the constant half (a 2-component subregister) before building the
12022 // full 128-bit register. If we are building multiple resource descriptors,
12023 // this will allow CSEing of the 2-component register.
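// The final descriptor is { Ptr (sub0_sub1), SubRegHi (sub2_sub3) }, i.e. the
// 64-bit pointer followed by a zero dword and the default data format dword.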
12024 const SDValue Ops0[] = {
12025 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
12026 buildSMovImm32(DAG, DL, 0),
12027 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
12028 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
12029 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
12030 };
12031
12032 SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
12033 MVT::v2i32, Ops0), 0);
12034
12035 // Combine the constants and the pointer.
12036 const SDValue Ops1[] = {
12037 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
12038 Ptr,
12039 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
12040 SubRegHi,
12041 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
12042 };
12043
12044 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
12045 }
12046
12047 /// Return a resource descriptor with the 'Add TID' bit enabled
12048 /// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
12049 /// of the resource descriptor) to create an offset, which is added to
12050 /// the resource pointer.
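///
/// The descriptor built here is laid out as four dwords:
///   { PtrLo, PtrHi | RsrcDword1, RsrcDword2And3 & 0xffffffff, RsrcDword2And3 >> 32 }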
12051 MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
12052 SDValue Ptr, uint32_t RsrcDword1,
12053 uint64_t RsrcDword2And3) const {
12054 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
12055 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
12056 if (RsrcDword1) {
12057 PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
12058 DAG.getConstant(RsrcDword1, DL, MVT::i32)),
12059 0);
12060 }
12061
12062 SDValue DataLo = buildSMovImm32(DAG, DL,
12063 RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
12064 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
12065
12066 const SDValue Ops[] = {
12067 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
12068 PtrLo,
12069 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
12070 PtrHi,
12071 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
12072 DataLo,
12073 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
12074 DataHi,
12075 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)
12076 };
12077
12078 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
12079 }
12080
12081 //===----------------------------------------------------------------------===//
12082 // SI Inline Assembly Support
12083 //===----------------------------------------------------------------------===//
12084
12085 std::pair<unsigned, const TargetRegisterClass *>
12086 SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
12087 StringRef Constraint,
12088 MVT VT) const {
12089 const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
12090
12091 const TargetRegisterClass *RC = nullptr;
12092 if (Constraint.size() == 1) {
12093 const unsigned BitWidth = VT.getSizeInBits();
12094 switch (Constraint[0]) {
12095 default:
12096 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
12097 case 's':
12098 case 'r':
12099 switch (BitWidth) {
12100 case 16:
12101 RC = &AMDGPU::SReg_32RegClass;
12102 break;
12103 case 64:
12104 RC = &AMDGPU::SGPR_64RegClass;
12105 break;
12106 default:
12107 RC = SIRegisterInfo::getSGPRClassForBitWidth(BitWidth);
12108 if (!RC)
12109 return std::make_pair(0U, nullptr);
12110 break;
12111 }
12112 break;
12113 case 'v':
12114 switch (BitWidth) {
12115 case 16:
12116 RC = &AMDGPU::VGPR_32RegClass;
12117 break;
12118 default:
12119 RC = TRI->getVGPRClassForBitWidth(BitWidth);
12120 if (!RC)
12121 return std::make_pair(0U, nullptr);
12122 break;
12123 }
12124 break;
12125 case 'a':
12126 if (!Subtarget->hasMAIInsts())
12127 break;
12128 switch (BitWidth) {
12129 case 16:
12130 RC = &AMDGPU::AGPR_32RegClass;
12131 break;
12132 default:
12133 RC = TRI->getAGPRClassForBitWidth(BitWidth);
12134 if (!RC)
12135 return std::make_pair(0U, nullptr);
12136 break;
12137 }
12138 break;
12139 }
12140 // We actually support i128, i16 and f16 as inline parameters
12141 // even if they are not reported as legal
12142 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
12143 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
12144 return std::make_pair(0U, RC);
12145 }
12146
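  // Handle explicit register and register-range constraints. For illustration,
  // "{v[8:11]}" consumes Idx = 8 and End = 11, giving a 128-bit width, and
  // resolves to the 128-bit VGPR tuple starting at v8.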
12147 if (Constraint.startswith("{") && Constraint.endswith("}")) {
12148 StringRef RegName(Constraint.data() + 1, Constraint.size() - 2);
12149 if (RegName.consume_front("v")) {
12150 RC = &AMDGPU::VGPR_32RegClass;
12151 } else if (RegName.consume_front("s")) {
12152 RC = &AMDGPU::SGPR_32RegClass;
12153 } else if (RegName.consume_front("a")) {
12154 RC = &AMDGPU::AGPR_32RegClass;
12155 }
12156
12157 if (RC) {
12158 uint32_t Idx;
12159 if (RegName.consume_front("[")) {
12160 uint32_t End;
12161 bool Failed = RegName.consumeInteger(10, Idx);
12162 Failed |= !RegName.consume_front(":");
12163 Failed |= RegName.consumeInteger(10, End);
12164 Failed |= !RegName.consume_back("]");
12165 if (!Failed) {
12166 uint32_t Width = (End - Idx + 1) * 32;
12167 MCRegister Reg = RC->getRegister(Idx);
12168 if (SIRegisterInfo::isVGPRClass(RC))
12169 RC = TRI->getVGPRClassForBitWidth(Width);
12170 else if (SIRegisterInfo::isSGPRClass(RC))
12171 RC = TRI->getSGPRClassForBitWidth(Width);
12172 else if (SIRegisterInfo::isAGPRClass(RC))
12173 RC = TRI->getAGPRClassForBitWidth(Width);
12174 if (RC) {
12175 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
12176 return std::make_pair(Reg, RC);
12177 }
12178 }
12179 } else {
12180 bool Failed = RegName.getAsInteger(10, Idx);
12181 if (!Failed && Idx < RC->getNumRegs())
12182 return std::make_pair(RC->getRegister(Idx), RC);
12183 }
12184 }
12185 }
12186
12187 auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
12188 if (Ret.first)
12189 Ret.second = TRI->getPhysRegClass(Ret.first);
12190
12191 return Ret;
12192 }
12193
12194 static bool isImmConstraint(StringRef Constraint) {
12195 if (Constraint.size() == 1) {
12196 switch (Constraint[0]) {
12197 default: break;
12198 case 'I':
12199 case 'J':
12200 case 'A':
12201 case 'B':
12202 case 'C':
12203 return true;
12204 }
12205 } else if (Constraint == "DA" ||
12206 Constraint == "DB") {
12207 return true;
12208 }
12209 return false;
12210 }
12211
12212 SITargetLowering::ConstraintType
12213 SITargetLowering::getConstraintType(StringRef Constraint) const {
12214 if (Constraint.size() == 1) {
12215 switch (Constraint[0]) {
12216 default: break;
12217 case 's':
12218 case 'v':
12219 case 'a':
12220 return C_RegisterClass;
12221 }
12222 }
12223 if (isImmConstraint(Constraint)) {
12224 return C_Other;
12225 }
12226 return TargetLowering::getConstraintType(Constraint);
12227 }
12228
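// Truncate a constant to the operand's width unless it is already a valid
// inline integer literal. For illustration, with Size == 32 a value of
// 0x100000001 becomes 0x1, while -1 (an inline literal) is left untouched.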
12229 static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
12230 if (!AMDGPU::isInlinableIntLiteral(Val)) {
12231 Val = Val & maskTrailingOnes<uint64_t>(Size);
12232 }
12233 return Val;
12234 }
12235
12236 void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op,
12237 std::string &Constraint,
12238 std::vector<SDValue> &Ops,
12239 SelectionDAG &DAG) const {
12240 if (isImmConstraint(Constraint)) {
12241 uint64_t Val;
12242 if (getAsmOperandConstVal(Op, Val) &&
12243 checkAsmConstraintVal(Op, Constraint, Val)) {
12244 Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
12245 Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
12246 }
12247 } else {
12248 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
12249 }
12250 }
12251
12252 bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const {
12253 unsigned Size = Op.getScalarValueSizeInBits();
12254 if (Size > 64)
12255 return false;
12256
12257 if (Size == 16 && !Subtarget->has16BitInsts())
12258 return false;
12259
12260 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
12261 Val = C->getSExtValue();
12262 return true;
12263 }
12264 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) {
12265 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
12266 return true;
12267 }
12268 if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Op)) {
12269 if (Size != 16 || Op.getNumOperands() != 2)
12270 return false;
12271 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
12272 return false;
12273 if (ConstantSDNode *C = V->getConstantSplatNode()) {
12274 Val = C->getSExtValue();
12275 return true;
12276 }
12277 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
12278 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
12279 return true;
12280 }
12281 }
12282
12283 return false;
12284 }
12285
12286 bool SITargetLowering::checkAsmConstraintVal(SDValue Op,
12287 const std::string &Constraint,
12288 uint64_t Val) const {
12289 if (Constraint.size() == 1) {
12290 switch (Constraint[0]) {
12291 case 'I':
12292 return AMDGPU::isInlinableIntLiteral(Val);
12293 case 'J':
12294 return isInt<16>(Val);
12295 case 'A':
12296 return checkAsmConstraintValA(Op, Val);
12297 case 'B':
12298 return isInt<32>(Val);
12299 case 'C':
12300 return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) ||
12301 AMDGPU::isInlinableIntLiteral(Val);
12302 default:
12303 break;
12304 }
12305 } else if (Constraint.size() == 2) {
12306 if (Constraint == "DA") {
12307 int64_t HiBits = static_cast<int32_t>(Val >> 32);
12308 int64_t LoBits = static_cast<int32_t>(Val);
12309 return checkAsmConstraintValA(Op, HiBits, 32) &&
12310 checkAsmConstraintValA(Op, LoBits, 32);
12311 }
12312 if (Constraint == "DB") {
12313 return true;
12314 }
12315 }
12316 llvm_unreachable("Invalid asm constraint");
12317 }
12318
12319 bool SITargetLowering::checkAsmConstraintValA(SDValue Op,
12320 uint64_t Val,
12321 unsigned MaxSize) const {
12322 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
12323 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
12324 if ((Size == 16 && AMDGPU::isInlinableLiteral16(Val, HasInv2Pi)) ||
12325 (Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
12326 (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi))) {
12327 return true;
12328 }
12329 return false;
12330 }
12331
12332 static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
12333 switch (UnalignedClassID) {
12334 case AMDGPU::VReg_64RegClassID:
12335 return AMDGPU::VReg_64_Align2RegClassID;
12336 case AMDGPU::VReg_96RegClassID:
12337 return AMDGPU::VReg_96_Align2RegClassID;
12338 case AMDGPU::VReg_128RegClassID:
12339 return AMDGPU::VReg_128_Align2RegClassID;
12340 case AMDGPU::VReg_160RegClassID:
12341 return AMDGPU::VReg_160_Align2RegClassID;
12342 case AMDGPU::VReg_192RegClassID:
12343 return AMDGPU::VReg_192_Align2RegClassID;
12344 case AMDGPU::VReg_224RegClassID:
12345 return AMDGPU::VReg_224_Align2RegClassID;
12346 case AMDGPU::VReg_256RegClassID:
12347 return AMDGPU::VReg_256_Align2RegClassID;
12348 case AMDGPU::VReg_512RegClassID:
12349 return AMDGPU::VReg_512_Align2RegClassID;
12350 case AMDGPU::VReg_1024RegClassID:
12351 return AMDGPU::VReg_1024_Align2RegClassID;
12352 case AMDGPU::AReg_64RegClassID:
12353 return AMDGPU::AReg_64_Align2RegClassID;
12354 case AMDGPU::AReg_96RegClassID:
12355 return AMDGPU::AReg_96_Align2RegClassID;
12356 case AMDGPU::AReg_128RegClassID:
12357 return AMDGPU::AReg_128_Align2RegClassID;
12358 case AMDGPU::AReg_160RegClassID:
12359 return AMDGPU::AReg_160_Align2RegClassID;
12360 case AMDGPU::AReg_192RegClassID:
12361 return AMDGPU::AReg_192_Align2RegClassID;
12362 case AMDGPU::AReg_256RegClassID:
12363 return AMDGPU::AReg_256_Align2RegClassID;
12364 case AMDGPU::AReg_512RegClassID:
12365 return AMDGPU::AReg_512_Align2RegClassID;
12366 case AMDGPU::AReg_1024RegClassID:
12367 return AMDGPU::AReg_1024_Align2RegClassID;
12368 default:
12369 return -1;
12370 }
12371 }
12372
12373 // Figure out which registers should be reserved for stack access. Only after
12374 // the function is legalized do we know all of the non-spill stack objects or if
12375 // calls are present.
12376 void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
12377 MachineRegisterInfo &MRI = MF.getRegInfo();
12378 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
12379 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
12380 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
12381 const SIInstrInfo *TII = ST.getInstrInfo();
12382
12383 if (Info->isEntryFunction()) {
12384 // Callable functions have fixed registers used for stack access.
12385 reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
12386 }
12387
12388 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
12389 Info->getStackPtrOffsetReg()));
12390 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
12391 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
12392
12393 // We need to worry about replacing the default register with itself in case
12394 // of MIR testcases missing the MFI.
12395 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
12396 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
12397
12398 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
12399 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
12400
12401 Info->limitOccupancy(MF);
12402
12403 if (ST.isWave32() && !MF.empty()) {
12404 for (auto &MBB : MF) {
12405 for (auto &MI : MBB) {
12406 TII->fixImplicitOperands(MI);
12407 }
12408 }
12409 }
12410
12411 // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
12412 // classes if required. Ideally the register class constraints would differ
12413 // per-subtarget, but there's no easy way to achieve that right now. This is
12414 // not a problem for VGPRs because the correctly aligned VGPR class is implied
12415 // from using them as the register class for legal types.
12416 if (ST.needsAlignedVGPRs()) {
12417 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
12418 const Register Reg = Register::index2VirtReg(I);
12419 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
12420 if (!RC)
12421 continue;
12422 int NewClassID = getAlignedAGPRClassID(RC->getID());
12423 if (NewClassID != -1)
12424 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
12425 }
12426 }
12427
12428 TargetLoweringBase::finalizeLowering(MF);
12429 }
12430
12431 void SITargetLowering::computeKnownBitsForFrameIndex(
12432 const int FI, KnownBits &Known, const MachineFunction &MF) const {
12433 TargetLowering::computeKnownBitsForFrameIndex(FI, Known, MF);
12434
12435 // Set the high bits to zero based on the maximum allowed scratch size per
12436 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
12437 // calculation won't overflow, so assume the sign bit is never set.
12438 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
12439 }
12440
12441 static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB,
12442 KnownBits &Known, unsigned Dim) {
12443 unsigned MaxValue =
12444 ST.getMaxWorkitemID(KB.getMachineFunction().getFunction(), Dim);
12445 Known.Zero.setHighBits(countLeadingZeros(MaxValue));
12446 }
12447
12448 void SITargetLowering::computeKnownBitsForTargetInstr(
12449 GISelKnownBits &KB, Register R, KnownBits &Known, const APInt &DemandedElts,
12450 const MachineRegisterInfo &MRI, unsigned Depth) const {
12451 const MachineInstr *MI = MRI.getVRegDef(R);
12452 switch (MI->getOpcode()) {
12453 case AMDGPU::G_INTRINSIC: {
12454 switch (MI->getIntrinsicID()) {
12455 case Intrinsic::amdgcn_workitem_id_x:
12456 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 0);
12457 break;
12458 case Intrinsic::amdgcn_workitem_id_y:
12459 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 1);
12460 break;
12461 case Intrinsic::amdgcn_workitem_id_z:
12462 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 2);
12463 break;
12464 case Intrinsic::amdgcn_mbcnt_lo:
12465 case Intrinsic::amdgcn_mbcnt_hi: {
12466 // These return at most the wavefront size - 1.
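      // E.g. for a wave64 subtarget and a 32-bit result, the top 26 bits
      // (32 - log2(64)) are known to be zero.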
12467 unsigned Size = MRI.getType(R).getSizeInBits();
12468 Known.Zero.setHighBits(Size - getSubtarget()->getWavefrontSizeLog2());
12469 break;
12470 }
12471 case Intrinsic::amdgcn_groupstaticsize: {
12472 // We can report everything over the maximum size as 0. We can't report
12473 // based on the actual size because we don't know if it's accurate or not
12474 // at any given point.
12475 Known.Zero.setHighBits(countLeadingZeros(getSubtarget()->getLocalMemorySize()));
12476 break;
12477 }
12478 }
12479 break;
12480 }
12481 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
12482 Known.Zero.setHighBits(24);
12483 break;
12484 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
12485 Known.Zero.setHighBits(16);
12486 break;
12487 }
12488 }
12489
12490 Align SITargetLowering::computeKnownAlignForTargetInstr(
12491 GISelKnownBits &KB, Register R, const MachineRegisterInfo &MRI,
12492 unsigned Depth) const {
12493 const MachineInstr *MI = MRI.getVRegDef(R);
12494 switch (MI->getOpcode()) {
12495 case AMDGPU::G_INTRINSIC:
12496 case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
12497 // FIXME: Can this move to generic code? What about the case where the call
12498 // site specifies a lower alignment?
12499 Intrinsic::ID IID = MI->getIntrinsicID();
12500 LLVMContext &Ctx = KB.getMachineFunction().getFunction().getContext();
12501 AttributeList Attrs = Intrinsic::getAttributes(Ctx, IID);
12502 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
12503 return *RetAlign;
12504 return Align(1);
12505 }
12506 default:
12507 return Align(1);
12508 }
12509 }
12510
12511 Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
12512 const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
12513 const Align CacheLineAlign = Align(64);
12514
12515 // Pre-GFX10 targets did not benefit from loop alignment.
12516 if (!ML || DisableLoopAlignment ||
12517 (getSubtarget()->getGeneration() < AMDGPUSubtarget::GFX10) ||
12518 getSubtarget()->hasInstFwdPrefetchBug())
12519 return PrefAlign;
12520
12521 // On GFX10 the I$ consists of 4 x 64-byte cache lines.
12522 // By default the prefetcher keeps one cache line behind and reads two ahead.
12523 // We can modify it with S_INST_PREFETCH so that larger loops keep two lines
12524 // behind and read one ahead.
12525 // Therefore we can benefit from aligning loop headers if the loop fits in
12526 // 192 bytes. If the loop fits in 64 bytes it always spans no more than two
12527 // cache lines and does not need alignment.
12528 // Else, if the loop is at most 128 bytes, we do not need to modify the
12529 // prefetch settings; if it is at most 192 bytes, we need two lines behind.
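//
// For illustration: a 96-byte loop just gets cache-line alignment under the
// default prefetch mode, while a 160-byte loop (with a preheader and a single
// exit block, and no parent loop already carrying S_INST_PREFETCH) additionally
// gets prefetch instructions inserted in its preheader and at the head of its
// exit block.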
12530
12531 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
12532 const MachineBasicBlock *Header = ML->getHeader();
12533 if (Header->getAlignment() != PrefAlign)
12534 return Header->getAlignment(); // Already processed.
12535
12536 unsigned LoopSize = 0;
12537 for (const MachineBasicBlock *MBB : ML->blocks()) {
12538 // If an inner loop block is aligned, assume on average half of the
12539 // alignment size is added as nops.
12540 if (MBB != Header)
12541 LoopSize += MBB->getAlignment().value() / 2;
12542
12543 for (const MachineInstr &MI : *MBB) {
12544 LoopSize += TII->getInstSizeInBytes(MI);
12545 if (LoopSize > 192)
12546 return PrefAlign;
12547 }
12548 }
12549
12550 if (LoopSize <= 64)
12551 return PrefAlign;
12552
12553 if (LoopSize <= 128)
12554 return CacheLineAlign;
12555
12556 // If any of the parent loops is surrounded by prefetch instructions, do not
12557 // insert new ones for the inner loop, as that would reset the parent's settings.
12558 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
12559 if (MachineBasicBlock *Exit = P->getExitBlock()) {
12560 auto I = Exit->getFirstNonDebugInstr();
12561 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
12562 return CacheLineAlign;
12563 }
12564 }
12565
12566 MachineBasicBlock *Pre = ML->getLoopPreheader();
12567 MachineBasicBlock *Exit = ML->getExitBlock();
12568
12569 if (Pre && Exit) {
12570 auto PreTerm = Pre->getFirstTerminator();
12571 if (PreTerm == Pre->begin() ||
12572 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
12573 BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
12574 .addImm(1); // prefetch 2 lines behind PC
12575
12576 auto ExitHead = Exit->getFirstNonDebugInstr();
12577 if (ExitHead == Exit->end() ||
12578 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
12579 BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
12580 .addImm(2); // prefetch 1 line behind PC
12581 }
12582
12583 return CacheLineAlign;
12584 }
12585
12586 LLVM_ATTRIBUTE_UNUSED
12587 static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
12588 assert(N->getOpcode() == ISD::CopyFromReg);
12589 do {
12590 // Follow the chain until we find an INLINEASM node.
12591 N = N->getOperand(0).getNode();
12592 if (N->getOpcode() == ISD::INLINEASM ||
12593 N->getOpcode() == ISD::INLINEASM_BR)
12594 return true;
12595 } while (N->getOpcode() == ISD::CopyFromReg);
12596 return false;
12597 }
12598
12599 bool SITargetLowering::isSDNodeSourceOfDivergence(
12600 const SDNode *N, FunctionLoweringInfo *FLI,
12601 LegacyDivergenceAnalysis *KDA) const {
12602 switch (N->getOpcode()) {
12603 case ISD::CopyFromReg: {
12604 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
12605 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
12606 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
12607 Register Reg = R->getReg();
12608
12609 // FIXME: Why does this need to consider isLiveIn?
12610 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
12611 return !TRI->isSGPRReg(MRI, Reg);
12612
12613 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
12614 return KDA->isDivergent(V);
12615
12616 assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
12617 return !TRI->isSGPRReg(MRI, Reg);
12618 }
12619 case ISD::LOAD: {
12620 const LoadSDNode *L = cast<LoadSDNode>(N);
12621 unsigned AS = L->getAddressSpace();
12622 // A flat load may access private memory.
12623 return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
12624 }
12625 case ISD::CALLSEQ_END:
12626 return true;
12627 case ISD::INTRINSIC_WO_CHAIN:
12628 return AMDGPU::isIntrinsicSourceOfDivergence(
12629 cast<ConstantSDNode>(N->getOperand(0))->getZExtValue());
12630 case ISD::INTRINSIC_W_CHAIN:
12631 return AMDGPU::isIntrinsicSourceOfDivergence(
12632 cast<ConstantSDNode>(N->getOperand(1))->getZExtValue());
12633 case AMDGPUISD::ATOMIC_CMP_SWAP:
12634 case AMDGPUISD::ATOMIC_INC:
12635 case AMDGPUISD::ATOMIC_DEC:
12636 case AMDGPUISD::ATOMIC_LOAD_FMIN:
12637 case AMDGPUISD::ATOMIC_LOAD_FMAX:
12638 case AMDGPUISD::BUFFER_ATOMIC_SWAP:
12639 case AMDGPUISD::BUFFER_ATOMIC_ADD:
12640 case AMDGPUISD::BUFFER_ATOMIC_SUB:
12641 case AMDGPUISD::BUFFER_ATOMIC_SMIN:
12642 case AMDGPUISD::BUFFER_ATOMIC_UMIN:
12643 case AMDGPUISD::BUFFER_ATOMIC_SMAX:
12644 case AMDGPUISD::BUFFER_ATOMIC_UMAX:
12645 case AMDGPUISD::BUFFER_ATOMIC_AND:
12646 case AMDGPUISD::BUFFER_ATOMIC_OR:
12647 case AMDGPUISD::BUFFER_ATOMIC_XOR:
12648 case AMDGPUISD::BUFFER_ATOMIC_INC:
12649 case AMDGPUISD::BUFFER_ATOMIC_DEC:
12650 case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
12651 case AMDGPUISD::BUFFER_ATOMIC_CSUB:
12652 case AMDGPUISD::BUFFER_ATOMIC_FADD:
12653 case AMDGPUISD::BUFFER_ATOMIC_FMIN:
12654 case AMDGPUISD::BUFFER_ATOMIC_FMAX:
12655 // Target-specific read-modify-write atomics are sources of divergence.
12656 return true;
12657 default:
12658 if (auto *A = dyn_cast<AtomicSDNode>(N)) {
12659 // Generic read-modify-write atomics are sources of divergence.
12660 return A->readMem() && A->writeMem();
12661 }
12662 return false;
12663 }
12664 }
12665
12666 bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG,
12667 EVT VT) const {
12668 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
12669 case MVT::f32:
12670 return hasFP32Denormals(DAG.getMachineFunction());
12671 case MVT::f64:
12672 case MVT::f16:
12673 return hasFP64FP16Denormals(DAG.getMachineFunction());
12674 default:
12675 return false;
12676 }
12677 }
12678
12679 bool SITargetLowering::denormalsEnabledForType(LLT Ty,
12680 MachineFunction &MF) const {
12681 switch (Ty.getScalarSizeInBits()) {
12682 case 32:
12683 return hasFP32Denormals(MF);
12684 case 64:
12685 case 16:
12686 return hasFP64FP16Denormals(MF);
12687 default:
12688 return false;
12689 }
12690 }
12691
12692 bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
12693 const SelectionDAG &DAG,
12694 bool SNaN,
12695 unsigned Depth) const {
12696 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
12697 const MachineFunction &MF = DAG.getMachineFunction();
12698 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
12699
12700 if (Info->getMode().DX10Clamp)
12701 return true; // Clamped to 0.
12702 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
12703 }
12704
12705 return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DAG,
12706 SNaN, Depth);
12707 }
12708
12709 // Global FP atomic instructions have a hardcoded FP mode and do not support
12710 // FP32 denormals, and only support v2f16 denormals.
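// For illustration: for a float atomic the check below requires the function's
// f32 denormal mode to be preserve-sign (matching the flush behaviour of the
// global atomics), while for other types it requires the IEEE mode.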
12711 static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW) {
12712 const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
12713 auto DenormMode = RMW->getParent()->getParent()->getDenormalMode(Flt);
12714 if (&Flt == &APFloat::IEEEsingle())
12715 return DenormMode == DenormalMode::getPreserveSign();
12716 return DenormMode == DenormalMode::getIEEE();
12717 }
12718
12719 TargetLowering::AtomicExpansionKind
12720 SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
12721 unsigned AS = RMW->getPointerAddressSpace();
12722 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
12723 return AtomicExpansionKind::NotAtomic;
12724
12725 auto ReportUnsafeHWInst = [&](TargetLowering::AtomicExpansionKind Kind) {
12726 OptimizationRemarkEmitter ORE(RMW->getFunction());
12727 LLVMContext &Ctx = RMW->getFunction()->getContext();
12728 SmallVector<StringRef> SSNs;
12729 Ctx.getSyncScopeNames(SSNs);
12730 auto MemScope = SSNs[RMW->getSyncScopeID()].empty()
12731 ? "system"
12732 : SSNs[RMW->getSyncScopeID()];
12733 ORE.emit([&]() {
12734 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
12735 << "Hardware instruction generated for atomic "
12736 << RMW->getOperationName(RMW->getOperation())
12737 << " operation at memory scope " << MemScope
12738 << " due to an unsafe request.";
12739 });
12740 return Kind;
12741 };
12742
12743 switch (RMW->getOperation()) {
12744 case AtomicRMWInst::FAdd: {
12745 Type *Ty = RMW->getType();
12746
12747 // We don't have a way to support 16-bit atomics now, so just leave them
12748 // as-is.
12749 if (Ty->isHalfTy())
12750 return AtomicExpansionKind::None;
12751
12752 if (!Ty->isFloatTy() && (!Subtarget->hasGFX90AInsts() || !Ty->isDoubleTy()))
12753 return AtomicExpansionKind::CmpXChg;
12754
12755 if ((AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) &&
12756 Subtarget->hasAtomicFaddNoRtnInsts()) {
12757 if (Subtarget->hasGFX940Insts())
12758 return AtomicExpansionKind::None;
12759
12760 // The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe
12761 // floating point atomic instructions. May generate more efficient code,
12762 // but may not respect rounding and denormal modes, and may give incorrect
12763 // results for certain memory destinations.
12764 if (RMW->getFunction()
12765 ->getFnAttribute("amdgpu-unsafe-fp-atomics")
12766 .getValueAsString() != "true")
12767 return AtomicExpansionKind::CmpXChg;
12768
12769 if (Subtarget->hasGFX90AInsts()) {
12770 if (Ty->isFloatTy() && AS == AMDGPUAS::FLAT_ADDRESS)
12771 return AtomicExpansionKind::CmpXChg;
12772
12773 auto SSID = RMW->getSyncScopeID();
12774 if (SSID == SyncScope::System ||
12775 SSID == RMW->getContext().getOrInsertSyncScopeID("one-as"))
12776 return AtomicExpansionKind::CmpXChg;
12777
12778 return ReportUnsafeHWInst(AtomicExpansionKind::None);
12779 }
12780
12781 if (AS == AMDGPUAS::FLAT_ADDRESS)
12782 return AtomicExpansionKind::CmpXChg;
12783
12784 return RMW->use_empty() ? ReportUnsafeHWInst(AtomicExpansionKind::None)
12785 : AtomicExpansionKind::CmpXChg;
12786 }
12787
12788 // DS FP atomics do respect the denormal mode, but the rounding mode is
12789 // fixed to round-to-nearest-even.
12790 // The only exception is DS_ADD_F64 which never flushes regardless of mode.
12791 if (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomicAdd()) {
12792 if (!Ty->isDoubleTy())
12793 return AtomicExpansionKind::None;
12794
12795 if (fpModeMatchesGlobalFPAtomicMode(RMW))
12796 return AtomicExpansionKind::None;
12797
12798 return RMW->getFunction()
12799 ->getFnAttribute("amdgpu-unsafe-fp-atomics")
12800 .getValueAsString() == "true"
12801 ? ReportUnsafeHWInst(AtomicExpansionKind::None)
12802 : AtomicExpansionKind::CmpXChg;
12803 }
12804
12805 return AtomicExpansionKind::CmpXChg;
12806 }
12807 default:
12808 break;
12809 }
12810
12811 return AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(RMW);
12812 }
12813
12814 TargetLowering::AtomicExpansionKind
12815 SITargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
12816 return LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
12817 ? AtomicExpansionKind::NotAtomic
12818 : AtomicExpansionKind::None;
12819 }
12820
12821 TargetLowering::AtomicExpansionKind
12822 SITargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
12823 return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
12824 ? AtomicExpansionKind::NotAtomic
12825 : AtomicExpansionKind::None;
12826 }
12827
12828 TargetLowering::AtomicExpansionKind
12829 SITargetLowering::shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CmpX) const {
12830 return CmpX->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
12831 ? AtomicExpansionKind::NotAtomic
12832 : AtomicExpansionKind::None;
12833 }
12834
12835 const TargetRegisterClass *
12836 SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
12837 const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, false);
12838 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
12839 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
12840 return Subtarget->getWavefrontSize() == 64 ? &AMDGPU::SReg_64RegClass
12841 : &AMDGPU::SReg_32RegClass;
12842 if (!TRI->isSGPRClass(RC) && !isDivergent)
12843 return TRI->getEquivalentSGPRClass(RC);
12844 else if (TRI->isSGPRClass(RC) && isDivergent)
12845 return TRI->getEquivalentVGPRClass(RC);
12846
12847 return RC;
12848 }
12849
12850 // FIXME: This is a workaround for DivergenceAnalysis not understanding always
12851 // uniform values (as produced by the mask results of control flow intrinsics)
12852 // used outside of divergent blocks. The phi users need to also be treated as
12853 // always uniform.
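// A typical example is the wave-sized mask produced by llvm.amdgcn.if in one
// block and consumed by llvm.amdgcn.end_cf in another; any phi carrying that
// mask must also be given a uniform (SGPR) register.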
12854 static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
12855 unsigned WaveSize) {
12856 // FIXME: We assume we never cast the mask results of a control flow
12857 // intrinsic.
12858 // As a compile time hack, exit early if the type cannot match the wave mask.
12859 IntegerType *IT = dyn_cast<IntegerType>(V->getType());
12860 if (!IT || IT->getBitWidth() != WaveSize)
12861 return false;
12862
12863 if (!isa<Instruction>(V))
12864 return false;
12865 if (!Visited.insert(V).second)
12866 return false;
12867 bool Result = false;
12868 for (auto U : V->users()) {
12869 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
12870 if (V == U->getOperand(1)) {
12871 switch (Intrinsic->getIntrinsicID()) {
12872 default:
12873 Result = false;
12874 break;
12875 case Intrinsic::amdgcn_if_break:
12876 case Intrinsic::amdgcn_if:
12877 case Intrinsic::amdgcn_else:
12878 Result = true;
12879 break;
12880 }
12881 }
12882 if (V == U->getOperand(0)) {
12883 switch (Intrinsic->getIntrinsicID()) {
12884 default:
12885 Result = false;
12886 break;
12887 case Intrinsic::amdgcn_end_cf:
12888 case Intrinsic::amdgcn_loop:
12889 Result = true;
12890 break;
12891 }
12892 }
12893 } else {
12894 Result = hasCFUser(U, Visited, WaveSize);
12895 }
12896 if (Result)
12897 break;
12898 }
12899 return Result;
12900 }
12901
12902 bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
12903 const Value *V) const {
12904 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
12905 if (CI->isInlineAsm()) {
12906 // FIXME: This cannot give a correct answer. This should only trigger in
12907 // the case where inline asm returns mixed SGPR and VGPR results, used
12908 // outside the defining block. We don't have a specific result to
12909 // consider, so this assumes if any value is SGPR, the overall register
12910 // also needs to be SGPR.
12911 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
12912 TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints(
12913 MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
12914 for (auto &TC : TargetConstraints) {
12915 if (TC.Type == InlineAsm::isOutput) {
12916 ComputeConstraintToUse(TC, SDValue());
12917 const TargetRegisterClass *RC = getRegForInlineAsmConstraint(
12918 SIRI, TC.ConstraintCode, TC.ConstraintVT).second;
12919 if (RC && SIRI->isSGPRClass(RC))
12920 return true;
12921 }
12922 }
12923 }
12924 }
12925 SmallPtrSet<const Value *, 16> Visited;
12926 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
12927 }
12928
12929 std::pair<InstructionCost, MVT>
12930 SITargetLowering::getTypeLegalizationCost(const DataLayout &DL,
12931 Type *Ty) const {
12932 std::pair<InstructionCost, MVT> Cost =
12933 TargetLoweringBase::getTypeLegalizationCost(DL, Ty);
12934 auto Size = DL.getTypeSizeInBits(Ty);
12935 // The maximum load or store can handle 8 dwords for the scalar ALU and 4
12936 // for the vector ALU. Assume anything above 8 dwords is expensive even
12937 // if legal.
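  // For illustration, a 1024-bit type adds (1024 + 255) / 256 == 4 to the base
  // legalization cost computed above.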
12938 if (Size <= 256)
12939 return Cost;
12940
12941 Cost.first += (Size + 255) / 256;
12942 return Cost;
12943 }
12944
12945 bool SITargetLowering::hasMemSDNodeUser(SDNode *N) const {
12946 SDNode::use_iterator I = N->use_begin(), E = N->use_end();
12947 for (; I != E; ++I) {
12948 if (MemSDNode *M = dyn_cast<MemSDNode>(*I)) {
12949 if (getBasePtrIndex(M) == I.getOperandNo())
12950 return true;
12951 }
12952 }
12953 return false;
12954 }
12955
12956 bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
12957 SDValue N1) const {
12958 if (!N0.hasOneUse())
12959 return false;
12960 // Preserve the opportunity to keep N0 uniform.
12961 if (N0->isDivergent() || !N1->isDivergent())
12962 return true;
12963 // Check if we have a good chance to form the memory access pattern with the
12964 // base and offset
12965 return (DAG.isBaseWithConstantOffset(N0) &&
12966 hasMemSDNodeUser(*N0->use_begin()));
12967 }
12968
12969 MachineMemOperand::Flags
12970 SITargetLowering::getTargetMMOFlags(const Instruction &I) const {
12971 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
12972 if (I.getMetadata("amdgpu.noclobber"))
12973 return MONoClobber;
12974 return MachineMemOperand::MONone;
12975 }
12976