1; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck -enable-var-scope --check-prefixes=SI,GCN,MESA-GCN,FUNC %s
2; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=VI,GCN,MESA-VI,MESA-GCN,FUNC %s
3; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=VI,GCN,HSA-VI,FUNC %s
4; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=EG --check-prefix=FUNC %s
5; RUN: llc < %s -march=r600 -mcpu=cayman -verify-machineinstrs | FileCheck -enable-var-scope --check-prefix=EG --check-prefix=FUNC %s
6
7; FUNC-LABEL: {{^}}i8_arg:
8; HSA-VI: kernarg_segment_byte_size = 12
9; HSA-VI: kernarg_segment_alignment = 4
10; EG: AND_INT {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
11; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
12; MESA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
13; MESA-GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff
14; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
15; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
16; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
17; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
18; FIXME: Should be using s_load_dword
19; HSA-VI: flat_load_ubyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]{{$}}
20
; Plain i8 kernel arg: zero-extend to i32 and store to a global pointer.
define amdgpu_kernel void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind {
entry:
  %0 = zext i8 %in to i32
  store i32 %0, i32 addrspace(1)* %out, align 4
  ret void
}
27
28; FUNC-LABEL: {{^}}i8_zext_arg:
29; HSA-VI: kernarg_segment_byte_size = 12
30; HSA-VI: kernarg_segment_alignment = 4
31; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
32; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
33; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
34; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
35; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
36; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
37; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
38; FIXME: Should be using s_load_dword
39; HSA-VI: flat_load_ubyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]{{$}}
40
; Same as @i8_arg but the arg carries the zeroext attribute.
define amdgpu_kernel void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind {
entry:
  %0 = zext i8 %in to i32
  store i32 %0, i32 addrspace(1)* %out, align 4
  ret void
}
47
48; FUNC-LABEL: {{^}}i8_sext_arg:
49; HSA-VI: kernarg_segment_byte_size = 12
50; HSA-VI: kernarg_segment_alignment = 4
51; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
52; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
53; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
54; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
55; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
56; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
57; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
58; FIXME: Should be using s_load_dword
59; HSA-VI: flat_load_sbyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]{{$}}
60
; signext i8 kernel arg: sign-extend to i32 and store to a global pointer.
define amdgpu_kernel void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind {
entry:
  %0 = sext i8 %in to i32
  store i32 %0, i32 addrspace(1)* %out, align 4
  ret void
}
67
68; FUNC-LABEL: {{^}}i16_arg:
69; HSA-VI: kernarg_segment_byte_size = 12
70; HSA-VI: kernarg_segment_alignment = 4
71
72; EG: AND_INT {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
73; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
74; MESA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
; MESA-GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff
76; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
77; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
78; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
79; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
80; FIXME: Should be using s_load_dword
81; HSA-VI: flat_load_ushort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]{{$}}
82
; Plain i16 kernel arg: zero-extend to i32 and store to a global pointer.
define amdgpu_kernel void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind {
entry:
  %0 = zext i16 %in to i32
  store i32 %0, i32 addrspace(1)* %out, align 4
  ret void
}
89
90; FUNC-LABEL: {{^}}i16_zext_arg:
91; HSA-VI: kernarg_segment_byte_size = 12
92; HSA-VI: kernarg_segment_alignment = 4
93
94; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
95; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
96; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
97; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
98; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
99; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
100; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
101; FIXME: Should be using s_load_dword
102; HSA-VI: flat_load_ushort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]{{$}}
103
; Same as @i16_arg but the arg carries the zeroext attribute.
define amdgpu_kernel void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind {
entry:
  %0 = zext i16 %in to i32
  store i32 %0, i32 addrspace(1)* %out, align 4
  ret void
}
110
111; FUNC-LABEL: {{^}}i16_sext_arg:
112; HSA-VI: kernarg_segment_byte_size = 12
113; HSA-VI: kernarg_segment_alignment = 4
114
115; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
116; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
117; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
118; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
119; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
120; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
121; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
122; FIXME: Should be using s_load_dword
123; HSA-VI: flat_load_sshort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]{{$}}
124
; signext i16 kernel arg: sign-extend to i32 and store to a global pointer.
define amdgpu_kernel void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind {
entry:
  %0 = sext i16 %in to i32
  store i32 %0, i32 addrspace(1)* %out, align 4
  ret void
}
131
132; FUNC-LABEL: {{^}}i32_arg:
133; HSA-VI: kernarg_segment_byte_size = 12
134; HSA-VI: kernarg_segment_alignment = 4
135
136; EG: T{{[0-9]\.[XYZW]}}, KC0[2].Z
137; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
138; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
139; HSA-VI: s_load_dword s{{[0-9]}}, s[4:5], 0x8
; Natively-sized i32 arg: no extension needed, stored directly.
define amdgpu_kernel void @i32_arg(i32 addrspace(1)* nocapture %out, i32 %in) nounwind {
entry:
  store i32 %in, i32 addrspace(1)* %out, align 4
  ret void
}
145
146; FUNC-LABEL: {{^}}f32_arg:
147; HSA-VI: kernarg_segment_byte_size = 12
148; HSA-VI: kernarg_segment_alignment = 4
149; EG: T{{[0-9]\.[XYZW]}}, KC0[2].Z
150; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
151; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
152; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8
; Scalar float arg stored directly to a global pointer.
define amdgpu_kernel void @f32_arg(float addrspace(1)* nocapture %out, float %in) nounwind {
entry:
  store float %in, float addrspace(1)* %out, align 4
  ret void
}
158
159; FUNC-LABEL: {{^}}v2i8_arg:
160; HSA-VI: kernarg_segment_byte_size = 12
161; HSA-VI: kernarg_segment_alignment = 4
162
163; EG: VTX_READ_8
164; EG: VTX_READ_8
165
166; SI: buffer_load_ubyte
167; SI: buffer_load_ubyte
168
; HSA-VI: flat_load_ushort
; <2 x i8> arg (sub-dword vector) stored to a global pointer.
define amdgpu_kernel void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) {
entry:
  store <2 x i8> %in, <2 x i8> addrspace(1)* %out
  ret void
}
175
176; FUNC-LABEL: {{^}}v2i16_arg:
177; HSA-VI: kernarg_segment_byte_size = 12
178; HSA-VI: kernarg_segment_alignment = 4
179
180; EG: VTX_READ_16
181; EG: VTX_READ_16
182
183; SI: s_load_dword s{{[0-9]+}}, s[0:1], 0xb
184; MESA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
185; HSA-VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
; <2 x i16> arg (one dword total) stored to a global pointer.
define amdgpu_kernel void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) {
entry:
  store <2 x i16> %in, <2 x i16> addrspace(1)* %out
  ret void
}
191
192; FUNC-LABEL: {{^}}v2i32_arg:
193; HSA-VI: kernarg_segment_byte_size = 16
194; HSA-VI: kernarg_segment_alignment = 4
195
196; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X
197; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W
198; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
199; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c
200; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8
; <2 x i32> arg (two dwords) stored to a global pointer.
define amdgpu_kernel void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2 x i32> %in) nounwind {
entry:
  store <2 x i32> %in, <2 x i32> addrspace(1)* %out, align 4
  ret void
}
206
207; FUNC-LABEL: {{^}}v2f32_arg:
208; HSA-VI: kernarg_segment_byte_size = 16
209; HSA-VI: kernarg_segment_alignment = 4
210
211; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X
212; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W
213; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
214; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c
215; HSA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[4:5], 0x8
; <2 x float> arg (two dwords) stored to a global pointer.
define amdgpu_kernel void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) nounwind {
entry:
  store <2 x float> %in, <2 x float> addrspace(1)* %out, align 4
  ret void
}
221
222; FUNC-LABEL: {{^}}v3i8_arg:
223; HSA-VI: kernarg_segment_byte_size = 12
224; HSA-VI: kernarg_segment_alignment = 4
225
226; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 40
227; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 41
228; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 42
229; SI: buffer_load_ubyte
230; SI: buffer_load_ubyte
231; SI: buffer_load_ubyte
232
233; MESA-VI: buffer_load_ushort
234; MESA-VI: buffer_load_ubyte
235
236; HSA-VI: flat_load_ushort
237; HSA-VI: flat_load_ubyte
; <3 x i8> arg (odd-sized sub-dword vector) stored to a global pointer.
define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind {
entry:
  store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4
  ret void
}
243
244; FUNC-LABEL: {{^}}v3i16_arg:
245; HSA-VI: kernarg_segment_byte_size = 16
246; HSA-VI: kernarg_segment_alignment = 4
247
248; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 44
249; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 46
250; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 48
251
252; GCN-DAG: s_load_dword s
253; GCN-DAG: {{buffer|flat}}_load_ushort
; <3 x i16> arg (6 bytes, dword-padded in the kernarg segment) stored to a global pointer.
define amdgpu_kernel void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind {
entry:
  store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4
  ret void
}
259; FUNC-LABEL: {{^}}v3i32_arg:
260; HSA-VI: kernarg_segment_byte_size = 32
261; HSA-VI: kernarg_segment_alignment = 4
262; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
263; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
264; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
265; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
266; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34
267; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
; <3 x i32> arg (padded to 4 dwords per the checked dwordx4 load) stored to a global pointer.
define amdgpu_kernel void @v3i32_arg(<3 x i32> addrspace(1)* nocapture %out, <3 x i32> %in) nounwind {
entry:
  store <3 x i32> %in, <3 x i32> addrspace(1)* %out, align 4
  ret void
}
273
274; FUNC-LABEL: {{^}}v3f32_arg:
275; HSA-VI: kernarg_segment_byte_size = 32
276; HSA-VI: kernarg_segment_alignment = 4
277; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
278; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
279; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
280; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
281; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34
282; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
; <3 x float> arg (padded to 4 dwords per the checked dwordx4 load) stored to a global pointer.
define amdgpu_kernel void @v3f32_arg(<3 x float> addrspace(1)* nocapture %out, <3 x float> %in) nounwind {
entry:
  store <3 x float> %in, <3 x float> addrspace(1)* %out, align 4
  ret void
}
288
289; FUNC-LABEL: {{^}}v4i8_arg:
290; HSA-VI: kernarg_segment_byte_size = 12
291; HSA-VI: kernarg_segment_alignment = 4
292; EG: VTX_READ_8
293; EG: VTX_READ_8
294; EG: VTX_READ_8
295; EG: VTX_READ_8
296
297; SI: buffer_load_ubyte
298; SI: buffer_load_ubyte
299; SI: buffer_load_ubyte
300; SI: buffer_load_ubyte
301
302; VI: s_load_dword s
; <4 x i8> arg (exactly one dword) stored to a global pointer.
define amdgpu_kernel void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) {
entry:
  store <4 x i8> %in, <4 x i8> addrspace(1)* %out
  ret void
}
308
309; FUNC-LABEL: {{^}}v4i16_arg:
310; HSA-VI: kernarg_segment_byte_size = 16
311; HSA-VI: kernarg_segment_alignment = 4
312; EG: VTX_READ_16
313; EG: VTX_READ_16
314; EG: VTX_READ_16
315; EG: VTX_READ_16
316
317; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0xb
318; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x9
319
320; MESA-VI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x2c
321; MESA-VI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x30
322
323; HSA-VI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x8
324; HSA-VI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xc
; <4 x i16> arg (two dwords) stored to a global pointer.
define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) {
entry:
  store <4 x i16> %in, <4 x i16> addrspace(1)* %out
  ret void
}
330
331; FUNC-LABEL: {{^}}v4i32_arg:
332; HSA-VI: kernarg_segment_byte_size = 32
333; HSA-VI: kernarg_segment_alignment = 4
334; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
335; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
336; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
337; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X
338
339; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
340; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34
341; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
; <4 x i32> arg (four dwords) stored to a global pointer.
define amdgpu_kernel void @v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> %in) nounwind {
entry:
  store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4
  ret void
}
347
348; FUNC-LABEL: {{^}}v4f32_arg:
349; HSA-VI: kernarg_segment_byte_size = 32
350; HSA-VI: kernarg_segment_alignment = 4
351; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
352; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
353; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
354; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X
355; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
356; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34
357; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
; <4 x float> arg (four dwords) stored to a global pointer.
define amdgpu_kernel void @v4f32_arg(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) nounwind {
entry:
  store <4 x float> %in, <4 x float> addrspace(1)* %out, align 4
  ret void
}
363
364; FUNC-LABEL: {{^}}v8i8_arg:
365; HSA-VI: kernarg_segment_byte_size = 16
366; HSA-VI: kernarg_segment_alignment = 4
367; EG: VTX_READ_8
368; EG: VTX_READ_8
369; EG: VTX_READ_8
370; EG: VTX_READ_8
371; EG: VTX_READ_8
372; EG: VTX_READ_8
373; EG: VTX_READ_8
374; EG: VTX_READ_8
375
376; SI: buffer_load_ubyte
377; SI: buffer_load_ubyte
378; SI: buffer_load_ubyte
379; SI: buffer_load_ubyte
380; SI: buffer_load_ubyte
381; SI: buffer_load_ubyte
382; SI: buffer_load_ubyte
383
384; VI: s_load_dwordx2
385; VI: s_load_dwordx2
; <8 x i8> arg (8 bytes) stored to a global pointer.
define amdgpu_kernel void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) {
entry:
  store <8 x i8> %in, <8 x i8> addrspace(1)* %out
  ret void
}
391
392; FUNC-LABEL: {{^}}v8i16_arg:
393; HSA-VI: kernarg_segment_byte_size = 32
394; HSA-VI: kernarg_segment_alignment = 4
395; EG: VTX_READ_16
396; EG: VTX_READ_16
397; EG: VTX_READ_16
398; EG: VTX_READ_16
399; EG: VTX_READ_16
400; EG: VTX_READ_16
401; EG: VTX_READ_16
402; EG: VTX_READ_16
403
404; SI: s_load_dwordx2
405; SI: s_load_dwordx2
406; SI: s_load_dwordx2
407
408; VI: s_load_dwordx2
409; VI: s_load_dword s
410; VI: s_load_dword s
411; VI: s_load_dword s
412; VI: s_load_dword s
; <8 x i16> arg (four dwords) stored to a global pointer.
define amdgpu_kernel void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) {
entry:
  store <8 x i16> %in, <8 x i16> addrspace(1)* %out
  ret void
}
418
419; FUNC-LABEL: {{^}}v8i32_arg:
420; HSA-VI: kernarg_segment_byte_size = 64
421; HSA-VI: kernarg_segment_alignment = 5
422; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
423; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
424; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
425; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X
426; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y
427; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
428; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
429; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X
430; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11
431; MESA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44
432; HSA-VI: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x20
; <8 x i32> arg (eight dwords, 32-byte aligned per the checked segment alignment) stored to a global pointer.
define amdgpu_kernel void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind {
entry:
  store <8 x i32> %in, <8 x i32> addrspace(1)* %out, align 4
  ret void
}
438
439; FUNC-LABEL: {{^}}v8f32_arg:
440; HSA-VI: kernarg_segment_byte_size = 64
441; HSA-VI: kernarg_segment_alignment = 5
442; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
443; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
444; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
445; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X
446; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y
447; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
448; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
449; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X
450; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11
; <8 x float> arg (eight dwords) stored to a global pointer.
define amdgpu_kernel void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind {
entry:
  store <8 x float> %in, <8 x float> addrspace(1)* %out, align 4
  ret void
}
456
457; FUNC-LABEL: {{^}}v16i8_arg:
458; HSA-VI: kernarg_segment_byte_size = 32
459; HSA-VI: kernarg_segment_alignment = 4
460; EG: VTX_READ_8
461; EG: VTX_READ_8
462; EG: VTX_READ_8
463; EG: VTX_READ_8
464; EG: VTX_READ_8
465; EG: VTX_READ_8
466; EG: VTX_READ_8
467; EG: VTX_READ_8
468; EG: VTX_READ_8
469; EG: VTX_READ_8
470; EG: VTX_READ_8
471; EG: VTX_READ_8
472; EG: VTX_READ_8
473; EG: VTX_READ_8
474; EG: VTX_READ_8
475; EG: VTX_READ_8
476
477; SI: buffer_load_ubyte
478; SI: buffer_load_ubyte
479; SI: buffer_load_ubyte
480; SI: buffer_load_ubyte
481; SI: buffer_load_ubyte
482; SI: buffer_load_ubyte
483; SI: buffer_load_ubyte
484; SI: buffer_load_ubyte
485; SI: buffer_load_ubyte
486; SI: buffer_load_ubyte
487; SI: buffer_load_ubyte
488; SI: buffer_load_ubyte
489; SI: buffer_load_ubyte
490; SI: buffer_load_ubyte
491; SI: buffer_load_ubyte
492; SI: buffer_load_ubyte
493
494; VI: s_load_dwordx2
495; VI: s_load_dwordx2
496; VI: s_load_dwordx2
; <16 x i8> arg (16 bytes) stored to a global pointer.
define amdgpu_kernel void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) {
entry:
  store <16 x i8> %in, <16 x i8> addrspace(1)* %out
  ret void
}
502
503; FUNC-LABEL: {{^}}v16i16_arg:
504; HSA-VI: kernarg_segment_byte_size = 64
505; HSA-VI: kernarg_segment_alignment = 5
506; EG: VTX_READ_16
507; EG: VTX_READ_16
508; EG: VTX_READ_16
509; EG: VTX_READ_16
510; EG: VTX_READ_16
511; EG: VTX_READ_16
512; EG: VTX_READ_16
513; EG: VTX_READ_16
514; EG: VTX_READ_16
515; EG: VTX_READ_16
516; EG: VTX_READ_16
517; EG: VTX_READ_16
518; EG: VTX_READ_16
519; EG: VTX_READ_16
520; EG: VTX_READ_16
521; EG: VTX_READ_16
522
523; SI: s_load_dword s
524; SI: s_load_dword s
525; SI: s_load_dword s
526; SI: s_load_dword s
527; SI: s_load_dwordx2
528; SI: s_load_dwordx2
529; SI: s_load_dwordx2
530
531; VI: s_load_dword s
532; VI: s_load_dword s
533; VI: s_load_dword s
534; VI: s_load_dword s
535; VI: s_load_dword s
536; VI: s_load_dword s
537; VI: s_load_dword s
538; VI: s_load_dword s
; <16 x i16> arg (eight dwords) stored to a global pointer.
define amdgpu_kernel void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) {
entry:
  store <16 x i16> %in, <16 x i16> addrspace(1)* %out
  ret void
}
544
545; FUNC-LABEL: {{^}}v16i32_arg:
546; HSA-VI: kernarg_segment_byte_size = 128
547; HSA-VI: kernarg_segment_alignment = 6
548; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
549; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
550; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
551; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X
552; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y
553; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z
554; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W
555; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
556; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
557; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
558; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W
559; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X
560; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y
561; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z
562; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W
563; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
564; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19
565; MESA-VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64
566; HSA-VI: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
; <16 x i32> arg (sixteen dwords, 64-byte aligned per the checked segment alignment) stored to a global pointer.
define amdgpu_kernel void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind {
entry:
  store <16 x i32> %in, <16 x i32> addrspace(1)* %out, align 4
  ret void
}
572
573; FUNC-LABEL: {{^}}v16f32_arg:
574; HSA-VI: kernarg_segment_byte_size = 128
575; HSA-VI: kernarg_segment_alignment = 6
576; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
577; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
578; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
579; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X
580; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y
581; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z
582; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W
583; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
584; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
585; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
586; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W
587; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X
588; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y
589; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z
590; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W
591; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
592; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19
593; MESA-VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64
594; HSA-VI: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
; <16 x float> arg (sixteen dwords) stored to a global pointer.
define amdgpu_kernel void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind {
entry:
  store <16 x float> %in, <16 x float> addrspace(1)* %out, align 4
  ret void
}
600
601; FUNC-LABEL: {{^}}kernel_arg_i64:
602; MESA-GCN: s_load_dwordx2
603; MESA-GCN: s_load_dwordx2
604; MESA-GCN: buffer_store_dwordx2
605; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8
; i64 arg (two dwords) stored to a global pointer.
define amdgpu_kernel void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind {
  store i64 %a, i64 addrspace(1)* %out, align 8
  ret void
}
610
611; FUNC-LABEL: {{^}}f64_kernel_arg:
612; SI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x9
613; SI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0xb
614; MESA-VI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x24
615; MESA-VI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x2c
616; MESA-GCN: buffer_store_dwordx2
617; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8
; double arg (two dwords) stored to a global pointer.
define amdgpu_kernel void @f64_kernel_arg(double addrspace(1)* %out, double  %in) {
entry:
  store double %in, double addrspace(1)* %out
  ret void
}
623
624; XFUNC-LABEL: {{^}}kernel_arg_v1i64:
625; XGCN: s_load_dwordx2
626; XGCN: s_load_dwordx2
627; XGCN: buffer_store_dwordx2
628; define amdgpu_kernel void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind {
629;   store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8
630;   ret void
631; }
632
633; FUNC-LABEL: {{^}}i1_arg:
634; HSA-VI: kernarg_segment_byte_size = 12
635; HSA-VI: kernarg_segment_alignment = 4
636
637; SI: buffer_load_ubyte
638; SI: v_and_b32_e32
639; SI: buffer_store_byte
640; SI: s_endpgm
; i1 arg stored as a single byte to a global pointer.
define amdgpu_kernel void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind {
  store i1 %x, i1 addrspace(1)* %out, align 1
  ret void
}
645
646; FUNC-LABEL: {{^}}i1_arg_zext_i32:
647; HSA-VI: kernarg_segment_byte_size = 12
648; HSA-VI: kernarg_segment_alignment = 4
649
650; SI: buffer_load_ubyte
651; SI: buffer_store_dword
652; SI: s_endpgm
; i1 arg zero-extended to i32 before the store.
define amdgpu_kernel void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
  %ext = zext i1 %x to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}
658
659; FUNC-LABEL: {{^}}i1_arg_zext_i64:
660; HSA-VI: kernarg_segment_byte_size = 12
661; HSA-VI: kernarg_segment_alignment = 4
662
663; SI: buffer_load_ubyte
664; SI: buffer_store_dwordx2
665; SI: s_endpgm
; i1 arg zero-extended to i64 before the store.
define amdgpu_kernel void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
  %ext = zext i1 %x to i64
  store i64 %ext, i64 addrspace(1)* %out, align 8
  ret void
}
671
672; FUNC-LABEL: {{^}}i1_arg_sext_i32:
673; HSA-VI: kernarg_segment_byte_size = 12
674; HSA-VI: kernarg_segment_alignment = 4
675
676; SI: buffer_load_ubyte
677; SI: buffer_store_dword
678; SI: s_endpgm
; i1 arg sign-extended to i32 before the store.
; Fix: the store's pointer type was written "i32addrspace(1)*" (missing space),
; which is not parseable LLVM IR and would make every RUN line fail before
; any FileCheck directive is evaluated.
define amdgpu_kernel void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
  %ext = sext i1 %x to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}
684
685; FUNC-LABEL: {{^}}i1_arg_sext_i64:
686; HSA-VI: kernarg_segment_byte_size = 12
687; HSA-VI: kernarg_segment_alignment = 4
688
689; SI: buffer_load_ubyte
690; SI: v_bfe_i32
691; SI: v_ashrrev_i32
692; SI: buffer_store_dwordx2
693; SI: s_endpgm
; i1 arg sign-extended to i64 before the store.
define amdgpu_kernel void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
  %ext = sext i1 %x to i64
  store i64 %ext, i64 addrspace(1)* %out, align 8
  ret void
}
699