1; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=SI,GCN,MESA-GCN,FUNC %s
2; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=VI,GCN,MESA-VI,MESA-GCN,FUNC %s
3; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=VI,GCN,HSA-VI,FUNC %s
4; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope -check-prefix=EG --check-prefix=FUNC %s
5; RUN: llc < %s -march=r600 -mcpu=cayman -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefix=EG --check-prefix=FUNC %s
6
7; FUNC-LABEL: {{^}}i8_arg:
8; HSA-VI: kernarg_segment_byte_size = 12
9; HSA-VI: kernarg_segment_alignment = 4
10; EG: AND_INT {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
11; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
12; MESA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
13; MESA-GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff
14
15; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
16; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff
define amdgpu_kernel void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind {
  %ext = zext i8 %in to i32                        ; zero-extend the i8 kernarg
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}
22
23; FUNC-LABEL: {{^}}i8_zext_arg:
24; HSA-VI: kernarg_segment_byte_size = 12
25; HSA-VI: kernarg_segment_alignment = 4
26; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
27; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
28; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
29
30; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
31; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff
define amdgpu_kernel void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind {
  %ext = zext i8 %in to i32                        ; arg carries zeroext attribute
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}
37
38; FUNC-LABEL: {{^}}i8_sext_arg:
39; HSA-VI: kernarg_segment_byte_size = 12
40; HSA-VI: kernarg_segment_alignment = 4
41; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
42; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
43
44; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
45
46; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
47; HSA-VI: s_sext_i32_i8 s{{[0-9]+}}, [[VAL]]
48; HSA-VI: flat_store_dword
define amdgpu_kernel void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind {
  %ext = sext i8 %in to i32                        ; arg carries signext attribute
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}
54
55; FUNC-LABEL: {{^}}i16_arg:
56; HSA-VI: kernarg_segment_byte_size = 12
57; HSA-VI: kernarg_segment_alignment = 4
58
59; EG: AND_INT {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
60; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
61
62; MESA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
; MESA-GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff
64
65; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
66; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}}
67; HSA-VI: flat_store_dword
define amdgpu_kernel void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind {
  %ext = zext i16 %in to i32                       ; zero-extend the i16 kernarg
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}
73
74; FUNC-LABEL: {{^}}i16_zext_arg:
75; HSA-VI: kernarg_segment_byte_size = 12
76; HSA-VI: kernarg_segment_alignment = 4
77
78; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
79; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
80; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
81
82; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
83; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}}
84; HSA-VI: flat_store_dword
define amdgpu_kernel void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind {
  %ext = zext i16 %in to i32                       ; arg carries zeroext attribute
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}
90
91; FUNC-LABEL: {{^}}i16_sext_arg:
92; HSA-VI: kernarg_segment_byte_size = 12
93; HSA-VI: kernarg_segment_alignment = 4
94
95; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
96; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
97; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
98
99
100; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
101; HSA-VI: s_sext_i32_i16 s{{[0-9]+}}, [[VAL]]
102; HSA-VI: flat_store_dword
define amdgpu_kernel void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind {
  %ext = sext i16 %in to i32                       ; arg carries signext attribute
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}
108
109; FUNC-LABEL: {{^}}i32_arg:
110; HSA-VI: kernarg_segment_byte_size = 12
111; HSA-VI: kernarg_segment_alignment = 4
112
113; EG: T{{[0-9]\.[XYZW]}}, KC0[2].Z
114; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
115; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
116; HSA-VI: s_load_dword s{{[0-9]}}, s[4:5], 0x8
define amdgpu_kernel void @i32_arg(i32 addrspace(1)* nocapture %out, i32 %in) nounwind {
entry:
  ; a plain dword kernarg needs no extension — stored directly
  store i32 %in, i32 addrspace(1)* %out, align 4
  ret void
}
122
123; FUNC-LABEL: {{^}}f32_arg:
124; HSA-VI: kernarg_segment_byte_size = 12
125; HSA-VI: kernarg_segment_alignment = 4
126; EG: T{{[0-9]\.[XYZW]}}, KC0[2].Z
127; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
128; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
129; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8
define amdgpu_kernel void @f32_arg(float addrspace(1)* nocapture %out, float %in) nounwind {
entry:
  ; float kernarg is loaded with the same scalar dword load as i32
  store float %in, float addrspace(1)* %out, align 4
  ret void
}
135
136; FUNC-LABEL: {{^}}v2i8_arg:
137; HSA-VI: kernarg_segment_byte_size = 12
138; HSA-VI: kernarg_segment_alignment = 4
139
140; EG: VTX_READ_8
141; EG: VTX_READ_8
142
143; GCN: s_load_dword s
144; GCN-NOT: {{buffer|flat|global}}_load_
define amdgpu_kernel void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) {
entry:
  ; <2 x i8> fits in one dword; GCN should use a scalar load, no vector loads
  store <2 x i8> %in, <2 x i8> addrspace(1)* %out
  ret void
}
150
151; FUNC-LABEL: {{^}}v2i16_arg:
152; HSA-VI: kernarg_segment_byte_size = 12
153; HSA-VI: kernarg_segment_alignment = 4
154
155; EG: VTX_READ_16
156; EG: VTX_READ_16
157
158; SI: s_load_dword s{{[0-9]+}}, s[0:1], 0xb
159; MESA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
160; HSA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
define amdgpu_kernel void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) {
entry:
  ; <2 x i16> packs into a single dword kernarg slot
  store <2 x i16> %in, <2 x i16> addrspace(1)* %out
  ret void
}
166
167; FUNC-LABEL: {{^}}v2i32_arg:
168; HSA-VI: kernarg_segment_byte_size = 16
169; HSA-VI: kernarg_segment_alignment = 4
170
171; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X
172; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W
173; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
174; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c
175; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8
define amdgpu_kernel void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2 x i32> %in) nounwind {
entry:
  ; two dwords -> s_load_dwordx2 from the kernarg segment
  store <2 x i32> %in, <2 x i32> addrspace(1)* %out, align 4
  ret void
}
181
182; FUNC-LABEL: {{^}}v2f32_arg:
183; HSA-VI: kernarg_segment_byte_size = 16
184; HSA-VI: kernarg_segment_alignment = 4
185
186; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X
187; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W
188; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
189; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c
190; HSA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[4:5], 0x8
define amdgpu_kernel void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) nounwind {
entry:
  ; same lowering as <2 x i32>: one s_load_dwordx2
  store <2 x float> %in, <2 x float> addrspace(1)* %out, align 4
  ret void
}
196
197; FUNC-LABEL: {{^}}v3i8_arg:
198; HSA-VI: kernarg_segment_byte_size = 12
199; HSA-VI: kernarg_segment_alignment = 4
200
201; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 40
202; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 41
203; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 42
204
205; SI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
206
; MESA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; HSA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind {
entry:
  ; 3 bytes still occupy one dword kernarg slot on GCN
  store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4
  ret void
}
214
215; FUNC-LABEL: {{^}}v3i16_arg:
216; HSA-VI: kernarg_segment_byte_size = 16
217; HSA-VI: kernarg_segment_alignment = 4
218
219; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 44
220; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 46
221; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 48
222
223; SI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
224
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
define amdgpu_kernel void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind {
entry:
  ; 6 bytes round up to two dwords -> s_load_dwordx2
  store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4
  ret void
}
232
233; FUNC-LABEL: {{^}}v3i32_arg:
234; HSA-VI: kernarg_segment_byte_size = 32
235; HSA-VI: kernarg_segment_alignment = 4
236; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
237; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
238; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
239; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
240; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34
241; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
define amdgpu_kernel void @v3i32_arg(<3 x i32> addrspace(1)* nocapture %out, <3 x i32> %in) nounwind {
entry:
  ; <3 x i32> is padded to 16 bytes, so a dwordx4 load is used
  store <3 x i32> %in, <3 x i32> addrspace(1)* %out, align 4
  ret void
}
247
248; FUNC-LABEL: {{^}}v3f32_arg:
249; HSA-VI: kernarg_segment_byte_size = 32
250; HSA-VI: kernarg_segment_alignment = 4
251; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
252; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
253; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
254; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
255; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34
256; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
define amdgpu_kernel void @v3f32_arg(<3 x float> addrspace(1)* nocapture %out, <3 x float> %in) nounwind {
entry:
  ; same padded-to-16-bytes lowering as <3 x i32>
  store <3 x float> %in, <3 x float> addrspace(1)* %out, align 4
  ret void
}
262
263; FUNC-LABEL: {{^}}v4i8_arg:
264; HSA-VI: kernarg_segment_byte_size = 12
265; HSA-VI: kernarg_segment_alignment = 4
266; EG: VTX_READ_8
267; EG: VTX_READ_8
268; EG: VTX_READ_8
269; EG: VTX_READ_8
270
271; GCN-DAG: s_load_dwordx2 s
272; GCN-DAG: s_load_dword s
define amdgpu_kernel void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) {
entry:
  ; 4 bytes = one dword value, loaded alongside the dwordx2 out pointer
  store <4 x i8> %in, <4 x i8> addrspace(1)* %out
  ret void
}
278
279; FUNC-LABEL: {{^}}v4i16_arg:
280; HSA-VI: kernarg_segment_byte_size = 16
281; HSA-VI: kernarg_segment_alignment = 4
282; EG: VTX_READ_16
283; EG: VTX_READ_16
284; EG: VTX_READ_16
285; EG: VTX_READ_16
286
287; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0xb
288; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x9
289
; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x24
; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x2c

297; HSA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0
298; HSA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) {
entry:
  ; 8 bytes of packed shorts -> second s_load_dwordx2
  store <4 x i16> %in, <4 x i16> addrspace(1)* %out
  ret void
}
304
305; FUNC-LABEL: {{^}}v4i32_arg:
306; HSA-VI: kernarg_segment_byte_size = 32
307; HSA-VI: kernarg_segment_alignment = 4
308; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
309; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
310; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
311; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X
312
313; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
314; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34
315; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
define amdgpu_kernel void @v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> %in) nounwind {
entry:
  ; 16-byte value -> s_load_dwordx4
  store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4
  ret void
}
321
322; FUNC-LABEL: {{^}}v4f32_arg:
323; HSA-VI: kernarg_segment_byte_size = 32
324; HSA-VI: kernarg_segment_alignment = 4
325; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
326; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
327; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
328; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X
329; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
330; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34
331; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
define amdgpu_kernel void @v4f32_arg(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) nounwind {
entry:
  ; same dwordx4 lowering as <4 x i32>
  store <4 x float> %in, <4 x float> addrspace(1)* %out, align 4
  ret void
}
337
338; FIXME: Lots of unpack and re-pack junk on VI
339; FUNC-LABEL: {{^}}v8i8_arg:
340; HSA-VI: kernarg_segment_byte_size = 16
341; HSA-VI: kernarg_segment_alignment = 4
342; EG: VTX_READ_8
343; EG: VTX_READ_8
344; EG: VTX_READ_8
345; EG: VTX_READ_8
346; EG: VTX_READ_8
347; EG: VTX_READ_8
348; EG: VTX_READ_8
349; EG: VTX_READ_8
350
351; SI-NOT: {{buffer|flat|global}}_load
352; SI: s_load_dwordx2 s
353; SI-NEXT: s_load_dwordx2 s
354; SI-NOT: {{buffer|flat|global}}_load
355
356; VI: s_load_dwordx2 s
357; VI-NEXT: s_load_dwordx2 s
358; VI-NOT: lshl
359; VI-NOT: _or
360; VI-NOT: _sdwa
define amdgpu_kernel void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) {
entry:
  ; 8 bytes -> one dwordx2; checks above assert no unpack/repack junk appears
  store <8 x i8> %in, <8 x i8> addrspace(1)* %out
  ret void
}
366
367; FUNC-LABEL: {{^}}v8i16_arg:
368; HSA-VI: kernarg_segment_byte_size = 32
369; HSA-VI: kernarg_segment_alignment = 4
370; EG: VTX_READ_16
371; EG: VTX_READ_16
372; EG: VTX_READ_16
373; EG: VTX_READ_16
374; EG: VTX_READ_16
375; EG: VTX_READ_16
376; EG: VTX_READ_16
377; EG: VTX_READ_16
378
379; SI: s_load_dwordx4
380; SI-NEXT: s_load_dwordx2
381; SI-NOT: {{buffer|flat|global}}_load
382
383
384; MESA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x34
385
386; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10
define amdgpu_kernel void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) {
entry:
  ; 16 bytes of packed shorts -> s_load_dwordx4
  store <8 x i16> %in, <8 x i16> addrspace(1)* %out
  ret void
}
392
393; FUNC-LABEL: {{^}}v8i32_arg:
394; HSA-VI: kernarg_segment_byte_size = 64
395; HSA-VI: kernarg_segment_alignment = 5
396; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
397; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
398; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
399; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X
400; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y
401; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
402; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
403; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X
404
405; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11
406; MESA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44
407; HSA-VI: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x20
define amdgpu_kernel void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind {
entry:
  ; 32-byte value -> s_load_dwordx8; segment alignment bumps to 2^5
  store <8 x i32> %in, <8 x i32> addrspace(1)* %out, align 4
  ret void
}
413
414; FUNC-LABEL: {{^}}v8f32_arg:
415; HSA-VI: kernarg_segment_byte_size = 64
416; HSA-VI: kernarg_segment_alignment = 5
417; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
418; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
419; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
420; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X
421; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y
422; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
423; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
424; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X
425; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11
define amdgpu_kernel void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind {
entry:
  ; same dwordx8 lowering as <8 x i32>
  store <8 x float> %in, <8 x float> addrspace(1)* %out, align 4
  ret void
}
431
432; FIXME: Pack/repack on VI
433
434; FUNC-LABEL: {{^}}v16i8_arg:
435; HSA-VI: kernarg_segment_byte_size = 32
436; HSA-VI: kernarg_segment_alignment = 4
437; EG: VTX_READ_8
438; EG: VTX_READ_8
439; EG: VTX_READ_8
440; EG: VTX_READ_8
441; EG: VTX_READ_8
442; EG: VTX_READ_8
443; EG: VTX_READ_8
444; EG: VTX_READ_8
445; EG: VTX_READ_8
446; EG: VTX_READ_8
447; EG: VTX_READ_8
448; EG: VTX_READ_8
449; EG: VTX_READ_8
450; EG: VTX_READ_8
451; EG: VTX_READ_8
452; EG: VTX_READ_8
453
454; SI: s_load_dwordx4 s
455; SI-NEXT: s_load_dwordx2 s
456; SI-NOT: {{buffer|flat|global}}_load
457
458
459; VI: s_load_dwordx4 s
460; VI-NOT: shr
461; VI-NOT: shl
462; VI-NOT: _sdwa
463; VI-NOT: _or_
define amdgpu_kernel void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) {
entry:
  ; 16 bytes -> dwordx4; checks above assert no per-byte shift/or expansion
  store <16 x i8> %in, <16 x i8> addrspace(1)* %out
  ret void
}
469
470; FUNC-LABEL: {{^}}v16i16_arg:
471; HSA-VI: kernarg_segment_byte_size = 64
472; HSA-VI: kernarg_segment_alignment = 5
473; EG: VTX_READ_16
474; EG: VTX_READ_16
475; EG: VTX_READ_16
476; EG: VTX_READ_16
477; EG: VTX_READ_16
478
479; EG: VTX_READ_16
480; EG: VTX_READ_16
481; EG: VTX_READ_16
482; EG: VTX_READ_16
483; EG: VTX_READ_16
484; EG: VTX_READ_16
485; EG: VTX_READ_16
486; EG: VTX_READ_16
487; EG: VTX_READ_16
488; EG: VTX_READ_16
489; EG: VTX_READ_16
490
491; SI: s_load_dwordx8 s
492; SI-NEXT: s_load_dwordx2 s
493; SI-NOT: {{buffer|flat|global}}_load
494
495
496; MESA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44
497
498; HSA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
define amdgpu_kernel void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) {
entry:
  ; 32 bytes of packed shorts -> s_load_dwordx8
  store <16 x i16> %in, <16 x i16> addrspace(1)* %out
  ret void
}
504
505; FUNC-LABEL: {{^}}v16i32_arg:
506; HSA-VI: kernarg_segment_byte_size = 128
507; HSA-VI: kernarg_segment_alignment = 6
508; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
509; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
510; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
511; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X
512; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y
513; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z
514; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W
515; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
516; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
517; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
518; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W
519; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X
520; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y
521; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z
522; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W
523; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
524; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19
525; MESA-VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64
526; HSA-VI: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
define amdgpu_kernel void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind {
entry:
  ; 64-byte value -> s_load_dwordx16; segment alignment bumps to 2^6
  store <16 x i32> %in, <16 x i32> addrspace(1)* %out, align 4
  ret void
}
532
533; FUNC-LABEL: {{^}}v16f32_arg:
534; HSA-VI: kernarg_segment_byte_size = 128
535; HSA-VI: kernarg_segment_alignment = 6
536; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
537; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
538; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
539; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X
540; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y
541; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z
542; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W
543; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
544; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
545; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
546; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W
547; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X
548; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y
549; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z
550; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W
551; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
552; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19
553; MESA-VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64
554; HSA-VI: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
define amdgpu_kernel void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind {
entry:
  ; same dwordx16 lowering as <16 x i32>
  store <16 x float> %in, <16 x float> addrspace(1)* %out, align 4
  ret void
}
560
561; FUNC-LABEL: {{^}}kernel_arg_i64:
562; MESA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[0:1], 0x24
563; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0
564
565; MESA-GCN: buffer_store_dwordx2
define amdgpu_kernel void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind {
  ; pointer + i64 pack into one dwordx4 kernarg load
  store i64 %a, i64 addrspace(1)* %out, align 8
  ret void
}
570
571; FUNC-LABEL: {{^}}f64_kernel_arg:
572; SI-DAG: s_load_dwordx4 s[{{[0-9]:[0-9]}}], s[0:1], 0x9
573; MESA-VI-DAG: s_load_dwordx4 s[{{[0-9]:[0-9]}}], s[0:1], 0x24
574; MESA-GCN: buffer_store_dwordx2
575
576; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0
define amdgpu_kernel void @f64_kernel_arg(double addrspace(1)* %out, double  %in) {
entry:
  ; double kernarg is lowered identically to i64
  store double %in, double addrspace(1)* %out
  ret void
}
582
583; XFUNC-LABEL: {{^}}kernel_arg_v1i64:
584; XGCN: s_load_dwordx2
585; XGCN: s_load_dwordx2
586; XGCN: buffer_store_dwordx2
587; define amdgpu_kernel void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind {
588;   store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8
589;   ret void
590; }
591
592; FUNC-LABEL: {{^}}i65_arg:
593; HSA-VI: kernarg_segment_byte_size = 24
594; HSA-VI: kernarg_segment_alignment = 4
595; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0
596; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
define amdgpu_kernel void @i65_arg(i65 addrspace(1)* nocapture %out, i65 %in) nounwind {
entry:
  ; non-power-of-two integer: rounds up to two dwordx2 loads (16-byte slot)
  store i65 %in, i65 addrspace(1)* %out, align 4
  ret void
}
602
603; FUNC-LABEL: {{^}}i1_arg:
604; HSA-VI: kernarg_segment_byte_size = 12
605; HSA-VI: kernarg_segment_alignment = 4
606
607; GCN: s_load_dword s
608; GCN: s_and_b32
609; GCN: {{buffer|flat}}_store_byte
define amdgpu_kernel void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind {
  ; i1 kernarg: loaded as a dword, masked, then stored as a byte
  store i1 %x, i1 addrspace(1)* %out, align 1
  ret void
}
614
615; FUNC-LABEL: {{^}}i1_arg_zext_i32:
616; HSA-VI: kernarg_segment_byte_size = 12
617; HSA-VI: kernarg_segment_alignment = 4
618
619; GCN: s_load_dword
; GCN: {{buffer|flat}}_store_dword
define amdgpu_kernel void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
  %ext = zext i1 %x to i32                         ; zero-extend the bool kernarg
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}
626
627; FUNC-LABEL: {{^}}i1_arg_zext_i64:
628; HSA-VI: kernarg_segment_byte_size = 12
629; HSA-VI: kernarg_segment_alignment = 4
630
631; GCN: s_load_dword s
632; GCN: {{buffer|flat}}_store_dwordx2
define amdgpu_kernel void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
  %ext = zext i1 %x to i64                         ; zero-extend bool to 64 bits
  store i64 %ext, i64 addrspace(1)* %out, align 8
  ret void
}
638
639; FUNC-LABEL: {{^}}i1_arg_sext_i32:
640; HSA-VI: kernarg_segment_byte_size = 12
641; HSA-VI: kernarg_segment_alignment = 4
642
643; GCN: s_load_dword
644; GCN: {{buffer|flat}}_store_dword
define amdgpu_kernel void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
  %ext = sext i1 %x to i32                         ; sign-extend bool: 0 or -1
  ; fixed: 'i32addrspace(1)*' was missing the space between the pointee type
  ; and the addrspace qualifier, which is invalid IR type syntax
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}
650
651; FUNC-LABEL: {{^}}i1_arg_sext_i64:
652; HSA-VI: kernarg_segment_byte_size = 12
653; HSA-VI: kernarg_segment_alignment = 4
654
655; GCN: s_load_dword
656; GCN: s_bfe_i64
657; GCN: {{buffer|flat}}_store_dwordx2
define amdgpu_kernel void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
  %ext = sext i1 %x to i64                         ; lowered via s_bfe_i64 per checks
  store i64 %ext, i64 addrspace(1)* %out, align 8
  ret void
}
663
664; FUNC-LABEL: {{^}}empty_struct_arg:
665; HSA-VI: kernarg_segment_byte_size = 0
define amdgpu_kernel void @empty_struct_arg({} %in) nounwind {
  ; empty struct argument occupies zero kernarg bytes
  ret void
}
669
670; The correct load offsets for these:
671; load 4 from 0,
672; load 8 from 8
673; load 4 from 24
674; load 8 from 32
675
676; With the SelectionDAG argument lowering, the alignments for the
677; struct members is not properly considered, making these wrong.
678
679; FIXME: Total argument size is computed wrong
680; FUNC-LABEL: {{^}}struct_argument_alignment:
681; HSA-VI: kernarg_segment_byte_size = 40
682; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
683; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
684; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x18
685; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, i64} %arg1) {
  ; unpack both structs; volatile stores to null keep the kernarg loads alive
  %val0 = extractvalue {i32, i64} %arg0, 0
  %val1 = extractvalue {i32, i64} %arg0, 1
  %val2 = extractvalue {i32, i64} %arg1, 0
  %val3 = extractvalue {i32, i64} %arg1, 1
  store volatile i32 %val0, i32 addrspace(1)* null
  store volatile i64 %val1, i64 addrspace(1)* null
  store volatile i32 %val2, i32 addrspace(1)* null
  store volatile i64 %val3, i64 addrspace(1)* null
  ret void
}
697
698; No padding between i8 and next struct, but round up at end to 4 byte
699; multiple.
700; FUNC-LABEL: {{^}}packed_struct_argument_alignment:
701; HSA-VI: kernarg_segment_byte_size = 28
702; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
703; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4
704; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0xc
705; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10
define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) {
  ; packed structs: no inter-member padding, so offsets are 0/4/0xc/0x10
  %val0 = extractvalue <{i32, i64}> %arg0, 0
  %val1 = extractvalue <{i32, i64}> %arg0, 1
  %val2 = extractvalue <{i32, i64}> %arg1, 0
  %val3 = extractvalue <{i32, i64}> %arg1, 1
  store volatile i32 %val0, i32 addrspace(1)* null
  store volatile i64 %val1, i64 addrspace(1)* null
  store volatile i32 %val2, i32 addrspace(1)* null
  store volatile i64 %val3, i64 addrspace(1)* null
  ret void
}
717
718; GCN-LABEL: {{^}}struct_argument_alignment_after:
719; HSA-VI: kernarg_segment_byte_size = 64
720; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
721; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
722; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x18
723; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
724; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x30
define amdgpu_kernel void @struct_argument_alignment_after({i32, i64} %arg0, i8, {i32, i64} %arg2, i8, <4 x i32> %arg4) {
  ; a 16-byte-aligned vector following the structs forces tail padding (0x30)
  %val0 = extractvalue {i32, i64} %arg0, 0
  %val1 = extractvalue {i32, i64} %arg0, 1
  %val2 = extractvalue {i32, i64} %arg2, 0
  %val3 = extractvalue {i32, i64} %arg2, 1
  store volatile i32 %val0, i32 addrspace(1)* null
  store volatile i64 %val1, i64 addrspace(1)* null
  store volatile i32 %val2, i32 addrspace(1)* null
  store volatile i64 %val3, i64 addrspace(1)* null
  store volatile <4 x i32> %arg4, <4 x i32> addrspace(1)* null
  ret void
}
737
738; GCN-LABEL: {{^}}array_3xi32:
739; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
740; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x4
741; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8
742; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0xc
define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) {
  ; array members get one dword slot each after the i16 (offsets 0/4/8/0xc)
  store volatile i16 %arg0, i16 addrspace(1)* undef
  store volatile [3 x i32] %arg1, [3 x i32] addrspace(1)* undef
  ret void
}
748
749; FIXME: Why not all scalar loads?
750; GCN-LABEL: {{^}}array_3xi16:
751; HSA-VI: s_add_u32 s{{[0-9]+}}, s4, 2
752; HSA-VI: s_addc_u32 s{{[0-9]+}}, s5, 0
753; HSA-VI: flat_load_ushort
754; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
755; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x4
define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) {
  ; FIXME above: one element is currently fetched with flat_load_ushort
  ; instead of a scalar load
  store volatile i8 %arg0, i8 addrspace(1)* undef
  store volatile [3 x i16] %arg1, [3 x i16] addrspace(1)* undef
  ret void
}
761