1; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck -enable-var-scope --check-prefixes=SI,GCN,MESA-GCN,FUNC %s
2; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=VI,GCN,MESA-VI,MESA-GCN,FUNC %s
3; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=VI,GCN,HSA-VI,FUNC %s
4; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=EG --check-prefix=FUNC %s
5; RUN: llc < %s -march=r600 -mcpu=cayman -verify-machineinstrs | FileCheck -enable-var-scope --check-prefix=EG --check-prefix=FUNC %s
6
7; FUNC-LABEL: {{^}}i8_arg:
8; HSA-VI: kernarg_segment_byte_size = 12
9; HSA-VI: kernarg_segment_alignment = 4
10; EG: AND_INT {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
11; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
12; MESA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
13; MESA-GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff
14
15; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
16; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff
17
18
define amdgpu_kernel void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind {
entry:
  ; Zero-extend the (unannotated) i8 kernel argument and store it as i32.
  %widened = zext i8 %in to i32
  store i32 %widened, i32 addrspace(1)* %out, align 4
  ret void
}
25
26; FUNC-LABEL: {{^}}i8_zext_arg:
27; HSA-VI: kernarg_segment_byte_size = 12
28; HSA-VI: kernarg_segment_alignment = 4
29; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
30; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
31; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
32
33; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
34; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff
define amdgpu_kernel void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind {
entry:
  ; The zeroext attribute promises the high bits are already zero;
  ; the zext here should therefore fold into the argument load.
  %widened = zext i8 %in to i32
  store i32 %widened, i32 addrspace(1)* %out, align 4
  ret void
}
41
42; FUNC-LABEL: {{^}}i8_sext_arg:
43; HSA-VI: kernarg_segment_byte_size = 12
44; HSA-VI: kernarg_segment_alignment = 4
45; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
46; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
47
48; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
49
50; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
51; HSA-VI: s_sext_i32_i8 s{{[0-9]+}}, [[VAL]]
52; HSA-VI: flat_store_dword
define amdgpu_kernel void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind {
entry:
  ; Sign-extend the signext-annotated i8 kernel argument to i32.
  %widened = sext i8 %in to i32
  store i32 %widened, i32 addrspace(1)* %out, align 4
  ret void
}
59
60; FUNC-LABEL: {{^}}i16_arg:
61; HSA-VI: kernarg_segment_byte_size = 12
62; HSA-VI: kernarg_segment_alignment = 4
63
64; EG: AND_INT {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
65; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
66
67; MESA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
; MESA-GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff
69
70; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
71; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}}
72; HSA-VI: flat_store_dword
define amdgpu_kernel void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind {
entry:
  ; Zero-extend the (unannotated) i16 kernel argument and store it as i32.
  %widened = zext i16 %in to i32
  store i32 %widened, i32 addrspace(1)* %out, align 4
  ret void
}
79
80; FUNC-LABEL: {{^}}i16_zext_arg:
81; HSA-VI: kernarg_segment_byte_size = 12
82; HSA-VI: kernarg_segment_alignment = 4
83
84; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
85; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
86; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
87
88; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
89; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}}
90; HSA-VI: flat_store_dword
define amdgpu_kernel void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind {
entry:
  ; zeroext promises the upper 16 bits are already clear.
  %widened = zext i16 %in to i32
  store i32 %widened, i32 addrspace(1)* %out, align 4
  ret void
}
97
98; FUNC-LABEL: {{^}}i16_sext_arg:
99; HSA-VI: kernarg_segment_byte_size = 12
100; HSA-VI: kernarg_segment_alignment = 4
101
102; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
103; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
104; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
105
106
107; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
108; HSA-VI: s_sext_i32_i16 s{{[0-9]+}}, [[VAL]]
109; HSA-VI: flat_store_dword
define amdgpu_kernel void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind {
entry:
  ; Sign-extend the signext-annotated i16 kernel argument to i32.
  %widened = sext i16 %in to i32
  store i32 %widened, i32 addrspace(1)* %out, align 4
  ret void
}
116
117; FUNC-LABEL: {{^}}i32_arg:
118; HSA-VI: kernarg_segment_byte_size = 12
119; HSA-VI: kernarg_segment_alignment = 4
120
121; EG: T{{[0-9]\.[XYZW]}}, KC0[2].Z
122; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
123; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
124; HSA-VI: s_load_dword s{{[0-9]}}, s[4:5], 0x8
define amdgpu_kernel void @i32_arg(i32 addrspace(1)* nocapture %out, i32 %in) nounwind {
entry:
  ; Pass-through: store the i32 kernel argument to the global output buffer.
  store i32 %in, i32 addrspace(1)* %out, align 4
  ret void
}
130
131; FUNC-LABEL: {{^}}f32_arg:
132; HSA-VI: kernarg_segment_byte_size = 12
133; HSA-VI: kernarg_segment_alignment = 4
134; EG: T{{[0-9]\.[XYZW]}}, KC0[2].Z
135; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
136; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
137; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8
define amdgpu_kernel void @f32_arg(float addrspace(1)* nocapture %out, float %in) nounwind {
entry:
  ; Pass-through: store the float kernel argument to the global output buffer.
  store float %in, float addrspace(1)* %out, align 4
  ret void
}
143
144; FUNC-LABEL: {{^}}v2i8_arg:
145; HSA-VI: kernarg_segment_byte_size = 12
146; HSA-VI: kernarg_segment_alignment = 4
147
148; EG: VTX_READ_8
149; EG: VTX_READ_8
150
151; GCN: s_load_dword s
152; GCN-NOT: {{buffer|flat|global}}_load_
define amdgpu_kernel void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) {
entry:
  ; Store the packed <2 x i8> argument; GCN should load it as a scalar dword.
  store <2 x i8> %in, <2 x i8> addrspace(1)* %out
  ret void
}
158
159; FUNC-LABEL: {{^}}v2i16_arg:
160; HSA-VI: kernarg_segment_byte_size = 12
161; HSA-VI: kernarg_segment_alignment = 4
162
163; EG: VTX_READ_16
164; EG: VTX_READ_16
165
166; SI: s_load_dword s{{[0-9]+}}, s[0:1], 0xb
167; MESA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
168; HSA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
define amdgpu_kernel void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) {
entry:
  ; Store the packed <2 x i16> argument; fits in a single dword on GCN.
  store <2 x i16> %in, <2 x i16> addrspace(1)* %out
  ret void
}
174
175; FUNC-LABEL: {{^}}v2i32_arg:
176; HSA-VI: kernarg_segment_byte_size = 16
177; HSA-VI: kernarg_segment_alignment = 4
178
179; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X
180; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W
181; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
182; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c
183; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8
define amdgpu_kernel void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2 x i32> %in) nounwind {
entry:
  ; Store the <2 x i32> kernel argument (loaded with s_load_dwordx2).
  store <2 x i32> %in, <2 x i32> addrspace(1)* %out, align 4
  ret void
}
189
190; FUNC-LABEL: {{^}}v2f32_arg:
191; HSA-VI: kernarg_segment_byte_size = 16
192; HSA-VI: kernarg_segment_alignment = 4
193
194; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X
195; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W
196; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
197; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c
198; HSA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[4:5], 0x8
define amdgpu_kernel void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) nounwind {
entry:
  ; Store the <2 x float> kernel argument (loaded with s_load_dwordx2).
  store <2 x float> %in, <2 x float> addrspace(1)* %out, align 4
  ret void
}
204
205; FUNC-LABEL: {{^}}v3i8_arg:
206; HSA-VI: kernarg_segment_byte_size = 12
207; HSA-VI: kernarg_segment_alignment = 4
208
209; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 40
210; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 41
211; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 42
212
213; GCN: s_load_dword s
214; GCN-NOT: {{buffer|flat|global}}_load_
define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind {
entry:
  ; Store the <3 x i8> argument; GCN should use scalar loads, no vector loads.
  store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4
  ret void
}
220
221; FUNC-LABEL: {{^}}v3i16_arg:
222; HSA-VI: kernarg_segment_byte_size = 16
223; HSA-VI: kernarg_segment_alignment = 4
224
225; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 44
226; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 46
227; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 48
228
229; SI: s_load_dword s
230; SI: s_load_dword s
231
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
define amdgpu_kernel void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind {
entry:
  ; Store the <3 x i16> argument (occupies two dwords of kernarg space).
  store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4
  ret void
}
239; FUNC-LABEL: {{^}}v3i32_arg:
240; HSA-VI: kernarg_segment_byte_size = 32
241; HSA-VI: kernarg_segment_alignment = 4
242; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
243; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
244; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
245; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
246; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34
247; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
define amdgpu_kernel void @v3i32_arg(<3 x i32> addrspace(1)* nocapture %out, <3 x i32> %in) nounwind {
entry:
  ; <3 x i32> is aligned/loaded like <4 x i32> (s_load_dwordx4 at offset 16).
  store <3 x i32> %in, <3 x i32> addrspace(1)* %out, align 4
  ret void
}
253
254; FUNC-LABEL: {{^}}v3f32_arg:
255; HSA-VI: kernarg_segment_byte_size = 32
256; HSA-VI: kernarg_segment_alignment = 4
257; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
258; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
259; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
260; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
261; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34
262; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
define amdgpu_kernel void @v3f32_arg(<3 x float> addrspace(1)* nocapture %out, <3 x float> %in) nounwind {
entry:
  ; <3 x float> is aligned/loaded like <4 x float> (s_load_dwordx4).
  store <3 x float> %in, <3 x float> addrspace(1)* %out, align 4
  ret void
}
268
269; FUNC-LABEL: {{^}}v4i8_arg:
270; HSA-VI: kernarg_segment_byte_size = 12
271; HSA-VI: kernarg_segment_alignment = 4
272; EG: VTX_READ_8
273; EG: VTX_READ_8
274; EG: VTX_READ_8
275; EG: VTX_READ_8
276
277; GCN: s_load_dword s
278; GCN-NOT: {{buffer|flat|global}}_load_
define amdgpu_kernel void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) {
entry:
  ; Packed <4 x i8> fits in one dword; GCN should use a single scalar load.
  store <4 x i8> %in, <4 x i8> addrspace(1)* %out
  ret void
}
284
285; FUNC-LABEL: {{^}}v4i16_arg:
286; HSA-VI: kernarg_segment_byte_size = 16
287; HSA-VI: kernarg_segment_alignment = 4
288; EG: VTX_READ_16
289; EG: VTX_READ_16
290; EG: VTX_READ_16
291; EG: VTX_READ_16
292
293; SI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xb
294; SI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xc
295; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x9
296
297; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x2c
298; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8
define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) {
entry:
  ; Packed <4 x i16> occupies two dwords of kernarg space.
  store <4 x i16> %in, <4 x i16> addrspace(1)* %out
  ret void
}
304
305; FUNC-LABEL: {{^}}v4i32_arg:
306; HSA-VI: kernarg_segment_byte_size = 32
307; HSA-VI: kernarg_segment_alignment = 4
308; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
309; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
310; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
311; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X
312
313; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
314; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34
315; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
define amdgpu_kernel void @v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> %in) nounwind {
entry:
  ; Store the <4 x i32> kernel argument (loaded with s_load_dwordx4).
  store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4
  ret void
}
321
322; FUNC-LABEL: {{^}}v4f32_arg:
323; HSA-VI: kernarg_segment_byte_size = 32
324; HSA-VI: kernarg_segment_alignment = 4
325; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
326; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
327; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
328; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X
329; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
330; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34
331; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
define amdgpu_kernel void @v4f32_arg(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) nounwind {
entry:
  ; Store the <4 x float> kernel argument (loaded with s_load_dwordx4).
  store <4 x float> %in, <4 x float> addrspace(1)* %out, align 4
  ret void
}
337
338; FIXME: Lots of unpack and re-pack junk on VI
339; FUNC-LABEL: {{^}}v8i8_arg:
340; HSA-VI: kernarg_segment_byte_size = 16
341; HSA-VI: kernarg_segment_alignment = 4
342; EG: VTX_READ_8
343; EG: VTX_READ_8
344; EG: VTX_READ_8
345; EG: VTX_READ_8
346; EG: VTX_READ_8
347; EG: VTX_READ_8
348; EG: VTX_READ_8
349; EG: VTX_READ_8
350
351
352; SI: s_load_dword s
353; SI: s_load_dword s
354; SI: s_load_dwordx2 s
355; SI-NOT: {{buffer|flat|global}}_load
356
357; VI: s_load_dword s
358; VI: s_load_dword s
359
360; VI: v_lshlrev_b16
361; VI: v_or_b32_e32
362; VI: v_or_b32_sdwa
363; VI: v_or_b32_sdwa
364; VI: v_lshlrev_b16
365; VI: s_lshr_b32
366; VI: v_or_b32_sdwa
367; VI: v_or_b32_sdwa
define amdgpu_kernel void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) {
entry:
  ; Packed <8 x i8> (two dwords); VI currently emits unpack/repack code
  ; (see FIXME above the check lines).
  store <8 x i8> %in, <8 x i8> addrspace(1)* %out
  ret void
}
373
374; FUNC-LABEL: {{^}}v8i16_arg:
375; HSA-VI: kernarg_segment_byte_size = 32
376; HSA-VI: kernarg_segment_alignment = 4
377; EG: VTX_READ_16
378; EG: VTX_READ_16
379; EG: VTX_READ_16
380; EG: VTX_READ_16
381; EG: VTX_READ_16
382; EG: VTX_READ_16
383; EG: VTX_READ_16
384; EG: VTX_READ_16
385
386; SI: s_load_dword s
387; SI: s_load_dword s
388; SI: s_load_dword s
389; SI: s_load_dword s
390; SI: s_load_dwordx2
391; SI-NOT: {{buffer|flat|global}}_load
392
393
394; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x34
395; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x3c
396
397; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10
398; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x18
define amdgpu_kernel void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) {
entry:
  ; Packed <8 x i16> (four dwords of kernarg space).
  store <8 x i16> %in, <8 x i16> addrspace(1)* %out
  ret void
}
404
405; FUNC-LABEL: {{^}}v8i32_arg:
406; HSA-VI: kernarg_segment_byte_size = 64
407; HSA-VI: kernarg_segment_alignment = 5
408; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
409; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
410; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
411; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X
412; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y
413; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
414; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
415; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X
416; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11
417; MESA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44
418; HSA-VI: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x20
define amdgpu_kernel void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind {
entry:
  ; Store the <8 x i32> kernel argument (loaded with s_load_dwordx8).
  store <8 x i32> %in, <8 x i32> addrspace(1)* %out, align 4
  ret void
}
424
425; FUNC-LABEL: {{^}}v8f32_arg:
426; HSA-VI: kernarg_segment_byte_size = 64
427; HSA-VI: kernarg_segment_alignment = 5
428; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
429; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
430; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
431; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X
432; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y
433; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
434; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
435; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X
436; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11
define amdgpu_kernel void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind {
entry:
  ; Store the <8 x float> kernel argument (loaded with s_load_dwordx8).
  store <8 x float> %in, <8 x float> addrspace(1)* %out, align 4
  ret void
}
442
443; FIXME: Pack/repack on VI
444
445; FUNC-LABEL: {{^}}v16i8_arg:
446; HSA-VI: kernarg_segment_byte_size = 32
447; HSA-VI: kernarg_segment_alignment = 4
448; EG: VTX_READ_8
449; EG: VTX_READ_8
450; EG: VTX_READ_8
451; EG: VTX_READ_8
452; EG: VTX_READ_8
453; EG: VTX_READ_8
454; EG: VTX_READ_8
455; EG: VTX_READ_8
456; EG: VTX_READ_8
457; EG: VTX_READ_8
458; EG: VTX_READ_8
459; EG: VTX_READ_8
460; EG: VTX_READ_8
461; EG: VTX_READ_8
462; EG: VTX_READ_8
463; EG: VTX_READ_8
464
465; SI: s_load_dword s
466; SI: s_load_dword s
467; SI: s_load_dword s
468; SI: s_load_dword s
469; SI: s_load_dwordx2
470; SI-NOT: {{buffer|flat|global}}_load
471
472
473; VI: s_load_dword s
474; VI: s_load_dword s
475; VI: s_load_dword s
476; VI: s_load_dword s
477
478; VI: s_lshr_b32
479; VI: v_lshlrev_b16
480; VI: s_lshr_b32
481; VI: s_lshr_b32
482; VI: v_or_b32_sdwa
483; VI: v_or_b32_sdwa
484; VI: v_lshlrev_b16
485; VI: v_lshlrev_b16
486; VI: v_or_b32_sdwa
487; VI: v_or_b32_sdwa
488; VI: v_lshlrev_b16
489; VI: v_lshlrev_b16
490; VI: v_or_b32_sdwa
491; VI: v_or_b32_sdwa
define amdgpu_kernel void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) {
entry:
  ; Packed <16 x i8> (four dwords); VI currently emits pack/repack code
  ; (see FIXME above the check lines).
  store <16 x i8> %in, <16 x i8> addrspace(1)* %out
  ret void
}
497
498; FUNC-LABEL: {{^}}v16i16_arg:
499; HSA-VI: kernarg_segment_byte_size = 64
500; HSA-VI: kernarg_segment_alignment = 5
501; EG: VTX_READ_16
502; EG: VTX_READ_16
503; EG: VTX_READ_16
504; EG: VTX_READ_16
505; EG: VTX_READ_16
506
507; EG: VTX_READ_16
508; EG: VTX_READ_16
509; EG: VTX_READ_16
510; EG: VTX_READ_16
511; EG: VTX_READ_16
512; EG: VTX_READ_16
513; EG: VTX_READ_16
514; EG: VTX_READ_16
515; EG: VTX_READ_16
516; EG: VTX_READ_16
517; EG: VTX_READ_16
518
519; SI: s_load_dword s
520; SI: s_load_dword s
521; SI: s_load_dword s
522; SI: s_load_dword s
523; SI: s_load_dword s
524; SI: s_load_dword s
525; SI: s_load_dword s
526; SI: s_load_dword s
527
528; SI-NOT: {{buffer|flat|global}}_load
529
530
531; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44
532; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x4c
533; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x54
534; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x5c
535
536; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
537; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x28
538; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x30
539; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x38
define amdgpu_kernel void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) {
entry:
  ; Packed <16 x i16> (eight dwords of kernarg space).
  store <16 x i16> %in, <16 x i16> addrspace(1)* %out
  ret void
}
545
546; FUNC-LABEL: {{^}}v16i32_arg:
547; HSA-VI: kernarg_segment_byte_size = 128
548; HSA-VI: kernarg_segment_alignment = 6
549; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
550; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
551; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
552; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X
553; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y
554; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z
555; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W
556; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
557; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
558; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
559; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W
560; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X
561; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y
562; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z
563; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W
564; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
565; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19
566; MESA-VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64
567; HSA-VI: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
define amdgpu_kernel void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind {
entry:
  ; Store the <16 x i32> kernel argument (loaded with s_load_dwordx16).
  store <16 x i32> %in, <16 x i32> addrspace(1)* %out, align 4
  ret void
}
573
574; FUNC-LABEL: {{^}}v16f32_arg:
575; HSA-VI: kernarg_segment_byte_size = 128
576; HSA-VI: kernarg_segment_alignment = 6
577; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
578; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
579; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
580; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X
581; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y
582; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z
583; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W
584; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
585; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
586; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
587; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W
588; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X
589; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y
590; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z
591; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W
592; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
593; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19
594; MESA-VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64
595; HSA-VI: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
define amdgpu_kernel void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind {
entry:
  ; Store the <16 x float> kernel argument (loaded with s_load_dwordx16).
  store <16 x float> %in, <16 x float> addrspace(1)* %out, align 4
  ret void
}
601
602; FUNC-LABEL: {{^}}kernel_arg_i64:
603; MESA-GCN: s_load_dwordx2
604; MESA-GCN: s_load_dwordx2
605; MESA-GCN: buffer_store_dwordx2
606; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8
define amdgpu_kernel void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind {
  ; Store the i64 kernel argument (loaded with s_load_dwordx2).
  store i64 %a, i64 addrspace(1)* %out, align 8
  ret void
}
611
612; FUNC-LABEL: {{^}}f64_kernel_arg:
613; SI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x9
614; SI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0xb
615; MESA-VI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x24
616; MESA-VI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x2c
617; MESA-GCN: buffer_store_dwordx2
618; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8
define amdgpu_kernel void @f64_kernel_arg(double addrspace(1)* %out, double  %in) {
entry:
  ; Store the double kernel argument (loaded with s_load_dwordx2).
  store double %in, double addrspace(1)* %out
  ret void
}
624
625; XFUNC-LABEL: {{^}}kernel_arg_v1i64:
626; XGCN: s_load_dwordx2
627; XGCN: s_load_dwordx2
628; XGCN: buffer_store_dwordx2
629; define amdgpu_kernel void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind {
630;   store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8
631;   ret void
632; }
633
634; FUNC-LABEL: {{^}}i1_arg:
635; HSA-VI: kernarg_segment_byte_size = 12
636; HSA-VI: kernarg_segment_alignment = 4
637
638; GCN: s_load_dword s
639; GCN: s_and_b32
640; GCN: {{buffer|flat}}_store_byte
define amdgpu_kernel void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind {
  ; Store the i1 argument as a single byte; the argument dword is masked first.
  store i1 %x, i1 addrspace(1)* %out, align 1
  ret void
}
645
646; FUNC-LABEL: {{^}}i1_arg_zext_i32:
647; HSA-VI: kernarg_segment_byte_size = 12
648; HSA-VI: kernarg_segment_alignment = 4
649
650; GCN: s_load_dword
; GCN: {{buffer|flat}}_store_dword
define amdgpu_kernel void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
  ; Widen the i1 kernel argument to i32 with zero extension (result is 0 or 1).
  %widened = zext i1 %x to i32
  store i32 %widened, i32 addrspace(1)* %out, align 4
  ret void
}
657
658; FUNC-LABEL: {{^}}i1_arg_zext_i64:
659; HSA-VI: kernarg_segment_byte_size = 12
660; HSA-VI: kernarg_segment_alignment = 4
661
662; GCN: s_load_dword s
663; GCN: {{buffer|flat}}_store_dwordx2
define amdgpu_kernel void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
  ; Widen the i1 kernel argument to i64 with zero extension (result is 0 or 1).
  %widened = zext i1 %x to i64
  store i64 %widened, i64 addrspace(1)* %out, align 8
  ret void
}
669
670; FUNC-LABEL: {{^}}i1_arg_sext_i32:
671; HSA-VI: kernarg_segment_byte_size = 12
672; HSA-VI: kernarg_segment_alignment = 4
673
674; GCN: s_load_dword
675; GCN: {{buffer|flat}}_store_dword
define amdgpu_kernel void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
  ; Sign-extend the i1 kernel argument to i32 (result is 0 or -1).
  %ext = sext i1 %x to i32
  ; Fix: the store's pointer type was written "i32addrspace(1)*" (missing
  ; space), which is not valid LLVM IR and fails to parse.
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}
681
682; FUNC-LABEL: {{^}}i1_arg_sext_i64:
683; HSA-VI: kernarg_segment_byte_size = 12
684; HSA-VI: kernarg_segment_alignment = 4
685
686; GCN: s_load_dword
687; GCN: s_bfe_i64
688; GCN: {{buffer|flat}}_store_dwordx2
define amdgpu_kernel void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
  ; Sign-extend the i1 kernel argument to i64 (0 or -1, via s_bfe_i64).
  %widened = sext i1 %x to i64
  store i64 %widened, i64 addrspace(1)* %out, align 8
  ret void
}
694