; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck -enable-var-scope --check-prefixes=SI,GCN,MESA-GCN,FUNC %s
; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=VI,GCN,MESA-VI,MESA-GCN,FUNC %s
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=VI,GCN,HSA-VI,FUNC %s
; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=EG --check-prefix=FUNC %s
; RUN: llc < %s -march=r600 -mcpu=cayman -verify-machineinstrs | FileCheck -enable-var-scope --check-prefix=EG --check-prefix=FUNC %s

; Prefix key, taken from the invocations above: SI (amdgcn, default CPU) and
; VI (tonga / fiji) select the subtarget; MESA-GCN and MESA-VI are used by the
; non-amdhsa amdgcn runs, HSA-VI by the amdhsa run; GCN is shared by all
; amdgcn runs, EG by both r600 runs, and FUNC by every run.

; i8 kernel argument with no extension attribute; the body zero-extends it to
; i32 and stores the result through %out.
; FUNC-LABEL: {{^}}i8_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4
; EG: AND_INT {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
; MESA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
; MESA-GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff

; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff
define amdgpu_kernel void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind {
entry:
  %0 = zext i8 %in to i32
  store i32 %0, i32 addrspace(1)* %out, align 4
  ret void
}

; i8 kernel argument carrying the zeroext attribute; zero-extended to i32 and
; stored through %out.
; FUNC-LABEL: {{^}}i8_zext_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4
; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c

; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff
define amdgpu_kernel void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind {
entry:
  %0 = zext i8 %in to i32
  store i32 %0, i32 addrspace(1)* %out, align 4
  ret void
}

; i8 kernel argument carrying the signext attribute; sign-extended to i32 and
; stored through %out.
; FUNC-LABEL: {{^}}i8_sext_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4
; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb

; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c

; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
; HSA-VI: s_sext_i32_i8 s{{[0-9]+}}, [[VAL]]
; HSA-VI: flat_store_dword
define amdgpu_kernel void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind {
entry:
  %0 = sext i8 %in to i32
  store i32 %0, i32 addrspace(1)* %out, align 4
  ret void
}

; i16 kernel argument with no extension attribute; the body zero-extends it to
; i32 and stores the result through %out.
; FUNC-LABEL: {{^}}i16_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; EG: AND_INT {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb

; The mask is anchored with {{$}} so that a narrower 0xff pattern cannot pass
; merely as a prefix of the 16-bit mask.
; MESA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
; MESA-GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}}

; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}}
; HSA-VI: flat_store_dword
define amdgpu_kernel void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind {
entry:
  %0 = zext i16 %in to i32
  store i32 %0, i32 addrspace(1)* %out, align 4
  ret void
}

; i16 kernel argument carrying the zeroext attribute; zero-extended to i32 and
; stored through %out.
; FUNC-LABEL: {{^}}i16_zext_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c

; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}}
; HSA-VI: flat_store_dword
define amdgpu_kernel void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind {
entry:
  %0 = zext i16 %in to i32
  store i32 %0, i32 addrspace(1)* %out, align 4
  ret void
}

; i16 kernel argument carrying the signext attribute; sign-extended to i32 and
; stored through %out.
; FUNC-LABEL: {{^}}i16_sext_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c

; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
; HSA-VI: s_sext_i32_i16 s{{[0-9]+}}, [[VAL]]
; HSA-VI: flat_store_dword
define amdgpu_kernel void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind {
entry:
  %0 = sext i16 %in to i32
  store i32 %0, i32 addrspace(1)* %out, align 4
  ret void
}

; i32 kernel argument stored unchanged through %out.
; FUNC-LABEL: {{^}}i32_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; EG: T{{[0-9]\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
; HSA-VI: s_load_dword s{{[0-9]}}, s[4:5], 0x8
define amdgpu_kernel void @i32_arg(i32 addrspace(1)* nocapture %out, i32 %in) nounwind {
entry:
  store i32 %in, i32 addrspace(1)* %out, align 4
  ret void
}

; float kernel argument stored unchanged through %out.
; FUNC-LABEL: {{^}}f32_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4
; EG: T{{[0-9]\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8
define amdgpu_kernel void @f32_arg(float addrspace(1)* nocapture %out, float %in) nounwind {
entry:
  store float %in, float addrspace(1)* %out, align 4
  ret void
}

; <2 x i8> kernel argument stored unchanged through %out; on the amdgcn runs
; no buffer/flat/global load may follow the scalar load.
; FUNC-LABEL: {{^}}v2i8_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; EG: VTX_READ_8
; EG: VTX_READ_8

; GCN: s_load_dword s
; GCN-NOT: {{buffer|flat|global}}_load_
define amdgpu_kernel void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) {
entry:
  store <2 x i8> %in, <2 x i8> addrspace(1)* %out
  ret void
}

; <2 x i16> kernel argument stored unchanged through %out.
; FUNC-LABEL: {{^}}v2i16_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; EG: VTX_READ_16
; EG: VTX_READ_16

; SI: s_load_dword s{{[0-9]+}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; HSA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
define amdgpu_kernel void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) {
entry:
  store <2 x i16> %in, <2 x i16> addrspace(1)* %out
  ret void
}

; <2 x i32> kernel argument stored unchanged through %out.
; FUNC-LABEL: {{^}}v2i32_arg:
; HSA-VI: kernarg_segment_byte_size = 16
; HSA-VI: kernarg_segment_alignment = 4

; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W
; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c
; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8
define amdgpu_kernel void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2 x i32> %in) nounwind {
entry:
  store <2 x i32> %in, <2 x i32> addrspace(1)* %out, align 4
  ret void
}

; <2 x float> kernel argument stored unchanged through %out.
; FUNC-LABEL: {{^}}v2f32_arg:
; HSA-VI: kernarg_segment_byte_size = 16
; HSA-VI: kernarg_segment_alignment = 4

; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W
; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c
; HSA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[4:5], 0x8
define amdgpu_kernel void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) nounwind {
entry:
  store <2 x float> %in, <2 x float> addrspace(1)* %out, align 4
  ret void
}

; <3 x i8> kernel argument stored unchanged through %out.
; FUNC-LABEL: {{^}}v3i8_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 40
; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 41
; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 42

; SI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb

; These two checks use the prefix spellings that the RUN lines actually
; enable, so they are exercised by the tonga and fiji runs.
; MESA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; HSA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind {
entry:
  store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4
  ret void
}

; <3 x i16> kernel argument stored unchanged through %out.
; FUNC-LABEL: {{^}}v3i16_arg:
; HSA-VI: kernarg_segment_byte_size = 16
; HSA-VI: kernarg_segment_alignment = 4

; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 44
; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 46
; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 48

; SI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb

; These two checks use the prefix spellings that the RUN lines actually
; enable, so they are exercised by the tonga and fiji runs.
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
define amdgpu_kernel void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind {
entry:
  store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4
  ret void
}

; <3 x i32> kernel argument stored unchanged through %out.
; FUNC-LABEL: {{^}}v3i32_arg:
; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34
; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
define amdgpu_kernel void @v3i32_arg(<3 x i32> addrspace(1)* nocapture %out, <3 x i32> %in) nounwind {
entry:
  store <3 x i32> %in, <3 x i32> addrspace(1)* %out, align 4
  ret void
}

; <3 x float> kernel argument stored unchanged through %out.
; FUNC-LABEL: {{^}}v3f32_arg:
; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34
; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
define amdgpu_kernel void @v3f32_arg(<3 x float> addrspace(1)* nocapture %out, <3 x float> %in) nounwind {
entry:
  store <3 x float> %in, <3 x float> addrspace(1)* %out, align 4
  ret void
}

; <4 x i8> kernel argument stored unchanged through %out.
; FUNC-LABEL: {{^}}v4i8_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8

; GCN-DAG: s_load_dwordx2 s
; GCN-DAG: s_load_dword s
define amdgpu_kernel void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) {
entry:
  store <4 x i8> %in, <4 x i8> addrspace(1)* %out
  ret void
}

; <4 x i16> kernel argument stored unchanged through %out.
; FUNC-LABEL: {{^}}v4i16_arg:
; HSA-VI: kernarg_segment_byte_size = 16
; HSA-VI: kernarg_segment_alignment = 4
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16

; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0xb
; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x9

; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x24
; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x2c

; HSA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0
; HSA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) {
entry:
  store <4 x i16> %in, <4 x i16> addrspace(1)* %out
  ret void
}

; <4 x i32> kernel argument stored unchanged through %out.
; FUNC-LABEL: {{^}}v4i32_arg:
; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X

; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34
; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
define amdgpu_kernel void @v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> %in) nounwind {
entry:
  store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4
  ret void
}

; <4 x float> kernel argument stored unchanged through %out.
; FUNC-LABEL: {{^}}v4f32_arg:
; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X
; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34
; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
define amdgpu_kernel void @v4f32_arg(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) nounwind {
entry:
  store <4 x float> %in, <4 x float> addrspace(1)* %out, align 4
  ret void
}

; <8 x i8> kernel argument stored unchanged through %out.
; FIXME: Lots of unpack and re-pack junk on VI
; FUNC-LABEL: {{^}}v8i8_arg:
; HSA-VI: kernarg_segment_byte_size = 16
; HSA-VI: kernarg_segment_alignment = 4
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8

; SI-NOT: {{buffer|flat|global}}_load
; SI: s_load_dwordx2 s
; SI-NEXT: s_load_dwordx2 s
; SI-NOT: {{buffer|flat|global}}_load

; VI: s_load_dwordx2 s
; VI-NEXT: s_load_dwordx2 s
; VI-NOT: lshl
; VI-NOT: _or
; VI-NOT: _sdwa
define amdgpu_kernel void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) {
entry:
  store <8 x i8> %in, <8 x i8> addrspace(1)* %out
  ret void
}

; <8 x i16> kernel argument stored unchanged through %out.
; FUNC-LABEL: {{^}}v8i16_arg:
; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16

; SI: s_load_dwordx4
; SI-NEXT: s_load_dwordx2
; SI-NOT: {{buffer|flat|global}}_load

; MESA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x34

; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10
define amdgpu_kernel void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) {
entry:
  store <8 x i16> %in, <8 x i16> addrspace(1)* %out
  ret void
}

; <8 x i32> kernel argument stored unchanged through %out.
; FUNC-LABEL: {{^}}v8i32_arg:
; HSA-VI: kernarg_segment_byte_size = 64
; HSA-VI: kernarg_segment_alignment = 5
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X

; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11
; MESA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44
; HSA-VI: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x20
define amdgpu_kernel void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind {
entry:
  store <8 x i32> %in, <8 x i32> addrspace(1)* %out, align 4
  ret void
}

; <8 x float> kernel argument stored unchanged through %out. The VI load
; checks mirror the ones used for the <8 x i32> kernel, which has the same
; argument layout.
; FUNC-LABEL: {{^}}v8f32_arg:
; HSA-VI: kernarg_segment_byte_size = 64
; HSA-VI: kernarg_segment_alignment = 5
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X
; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11
; MESA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44
; HSA-VI: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x20
define amdgpu_kernel void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind {
entry:
  store <8 x float> %in, <8 x float> addrspace(1)* %out, align 4
  ret void
}

; <16 x i8> kernel argument stored unchanged through %out.
; FIXME: Pack/repack on VI

; FUNC-LABEL: {{^}}v16i8_arg:
; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8

; SI: s_load_dwordx4 s
; SI-NEXT: s_load_dwordx2 s
; SI-NOT: {{buffer|flat|global}}_load

; VI: s_load_dwordx4 s
; VI-NOT: shr
; VI-NOT: shl
; VI-NOT: _sdwa
; VI-NOT: _or_
define amdgpu_kernel void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) {
entry:
  store <16 x i8> %in, <16 x i8> addrspace(1)* %out
  ret void
}

; <16 x i16> kernel argument stored unchanged through %out.
; FUNC-LABEL: {{^}}v16i16_arg:
; HSA-VI: kernarg_segment_byte_size = 64
; HSA-VI: kernarg_segment_alignment = 5
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16

; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16

; SI: s_load_dwordx8 s
; SI-NEXT: s_load_dwordx2 s
; SI-NOT: {{buffer|flat|global}}_load

; MESA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44

; HSA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
define amdgpu_kernel void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) {
entry:
  store <16 x i16> %in, <16 x i16> addrspace(1)* %out
  ret void
}

; <16 x i32> kernel argument stored unchanged through %out.
; FUNC-LABEL: {{^}}v16i32_arg:
; HSA-VI: kernarg_segment_byte_size = 128
; HSA-VI: kernarg_segment_alignment = 6
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19
; MESA-VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64
; HSA-VI: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
define amdgpu_kernel void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind {
entry:
  store <16 x i32> %in, <16 x i32> addrspace(1)* %out, align 4
  ret void
}

; <16 x float> kernel argument stored unchanged through %out.
; FUNC-LABEL: {{^}}v16f32_arg:
; HSA-VI: kernarg_segment_byte_size = 128
; HSA-VI: kernarg_segment_alignment = 6
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19
; MESA-VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64
; HSA-VI: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
define amdgpu_kernel void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind {
entry:
  store <16 x float> %in, <16 x float> addrspace(1)* %out, align 4
  ret void
}

; i64 kernel argument stored unchanged through %out.
; FUNC-LABEL: {{^}}kernel_arg_i64:
; MESA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[0:1], 0x24
; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0

; MESA-GCN: buffer_store_dwordx2
define amdgpu_kernel void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind {
  store i64 %a, i64 addrspace(1)* %out, align 8
  ret void
}

; double kernel argument stored unchanged through %out.
; FUNC-LABEL: {{^}}f64_kernel_arg:
; SI-DAG: s_load_dwordx4 s[{{[0-9]:[0-9]}}], s[0:1], 0x9
; MESA-VI-DAG: s_load_dwordx4 s[{{[0-9]:[0-9]}}], s[0:1], 0x24
; MESA-GCN: buffer_store_dwordx2

; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0
define amdgpu_kernel void @f64_kernel_arg(double addrspace(1)* %out, double  %in) {
entry:
  store double %in, double addrspace(1)* %out
  ret void
}

; Disabled test: its check prefixes are X-mangled and the IR is commented out.
; XFUNC-LABEL: {{^}}kernel_arg_v1i64:
; XGCN: s_load_dwordx2
; XGCN: s_load_dwordx2
; XGCN: buffer_store_dwordx2
; define amdgpu_kernel void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind {
;   store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8
;   ret void
; }

; i1 kernel argument stored unchanged through %out.
; FUNC-LABEL: {{^}}i1_arg:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; GCN: s_load_dword s
; GCN: s_and_b32
; GCN: {{buffer|flat}}_store_byte
define amdgpu_kernel void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind {
  store i1 %x, i1 addrspace(1)* %out, align 1
  ret void
}

; i1 kernel argument zero-extended to i32 and stored through %out.
; FUNC-LABEL: {{^}}i1_arg_zext_i32:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; The store pattern accepts either the buffer or the flat form, as in the
; other i1 kernels of this file.
; GCN: s_load_dword
; GCN: {{buffer|flat}}_store_dword
define amdgpu_kernel void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
  %ext = zext i1 %x to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; i1 kernel argument zero-extended to i64 and stored through %out.
; FUNC-LABEL: {{^}}i1_arg_zext_i64:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; GCN: s_load_dword s
; GCN: {{buffer|flat}}_store_dwordx2
define amdgpu_kernel void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
  %ext = zext i1 %x to i64
  store i64 %ext, i64 addrspace(1)* %out, align 8
  ret void
}

; i1 kernel argument sign-extended to i32 and stored through %out.
; FUNC-LABEL: {{^}}i1_arg_sext_i32:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; GCN: s_load_dword
; GCN: {{buffer|flat}}_store_dword
define amdgpu_kernel void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
  %ext = sext i1 %x to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; i1 kernel argument sign-extended to i64 and stored through %out.
; FUNC-LABEL: {{^}}i1_arg_sext_i64:
; HSA-VI: kernarg_segment_byte_size = 12
; HSA-VI: kernarg_segment_alignment = 4

; GCN: s_load_dword
; GCN: s_bfe_i64
; GCN: {{buffer|flat}}_store_dwordx2
define amdgpu_kernel void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
  %ext = sext i1 %x to i64
  store i64 %ext, i64 addrspace(1)* %out, align 8
  ret void
}
