1; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck -enable-var-scope --check-prefixes=SI,GCN,MESA-GCN,FUNC %s
2; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=VI,GCN,MESA-VI,MESA-GCN,FUNC %s
3; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs | FileCheck -enable-var-scope -check-prefixes=VI,GCN,HSA-VI,FUNC %s
4; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck -enable-var-scope -check-prefix=EG --check-prefix=FUNC %s
5; RUN: llc < %s -march=r600 -mcpu=cayman -verify-machineinstrs | FileCheck -enable-var-scope --check-prefix=EG --check-prefix=FUNC %s
6
7; FUNC-LABEL: {{^}}i8_arg:
8; HSA-VI: kernarg_segment_byte_size = 12
9; HSA-VI: kernarg_segment_alignment = 4
10; EG: AND_INT {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
11; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
12; MESA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
13; MESA-GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff
14; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
15; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
16; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
17; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
18; FIXME: Should be using s_load_dword
19; HSA-VI: flat_load_ubyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]{{$}}
20
define amdgpu_kernel void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind {
entry:
  ; Zero-extend the i8 kernel argument and store it as an i32.
  %ext = zext i8 %in to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}
27
28; FUNC-LABEL: {{^}}i8_zext_arg:
29; HSA-VI: kernarg_segment_byte_size = 12
30; HSA-VI: kernarg_segment_alignment = 4
31; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
32; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
33; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
34; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
35; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
36; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
37; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
38; FIXME: Should be using s_load_dword
39; HSA-VI: flat_load_ubyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]{{$}}
40
define amdgpu_kernel void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind {
entry:
  ; Argument is already marked zeroext; widen to i32 and store.
  %ext = zext i8 %in to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}
47
48; FUNC-LABEL: {{^}}i8_sext_arg:
49; HSA-VI: kernarg_segment_byte_size = 12
50; HSA-VI: kernarg_segment_alignment = 4
51; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
52; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
53; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
54; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
55; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
56; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
57; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
58; FIXME: Should be using s_load_dword
59; HSA-VI: flat_load_sbyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]{{$}}
60
define amdgpu_kernel void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind {
entry:
  ; Argument is already marked signext; sign-extend to i32 and store.
  %ext = sext i8 %in to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}
67
68; FUNC-LABEL: {{^}}i16_arg:
69; HSA-VI: kernarg_segment_byte_size = 12
70; HSA-VI: kernarg_segment_alignment = 4
71
72; EG: AND_INT {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
73; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
74; MESA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
; MESA-GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff
76; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
77; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
78; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
79; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
80; FIXME: Should be using s_load_dword
81; HSA-VI: flat_load_ushort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]{{$}}
82
define amdgpu_kernel void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind {
entry:
  ; Zero-extend the i16 kernel argument and store it as an i32.
  %ext = zext i16 %in to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}
89
90; FUNC-LABEL: {{^}}i16_zext_arg:
91; HSA-VI: kernarg_segment_byte_size = 12
92; HSA-VI: kernarg_segment_alignment = 4
93
94; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
95; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
96; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
97; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
98; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
99; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
100; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
101; FIXME: Should be using s_load_dword
102; HSA-VI: flat_load_ushort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]{{$}}
103
define amdgpu_kernel void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind {
entry:
  ; Argument is already marked zeroext; widen to i32 and store.
  %ext = zext i16 %in to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}
110
111; FUNC-LABEL: {{^}}i16_sext_arg:
112; HSA-VI: kernarg_segment_byte_size = 12
113; HSA-VI: kernarg_segment_alignment = 4
114
115; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
116; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
117; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
118; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
119; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
120; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
121; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
122; FIXME: Should be using s_load_dword
123; HSA-VI: flat_load_sshort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]{{$}}
124
define amdgpu_kernel void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind {
entry:
  ; Argument is already marked signext; sign-extend to i32 and store.
  %ext = sext i16 %in to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}
131
132; FUNC-LABEL: {{^}}i32_arg:
133; HSA-VI: kernarg_segment_byte_size = 12
134; HSA-VI: kernarg_segment_alignment = 4
135
136; EG: T{{[0-9]\.[XYZW]}}, KC0[2].Z
137; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
138; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
139; HSA-VI: s_load_dword s{{[0-9]}}, s[4:5], 0x8
define amdgpu_kernel void @i32_arg(i32 addrspace(1)* nocapture %out, i32 %in) nounwind {
entry:
  ; Store the i32 kernel argument straight to the output buffer.
  store i32 %in, i32 addrspace(1)* %out, align 4
  ret void
}
145
146; FUNC-LABEL: {{^}}f32_arg:
147; HSA-VI: kernarg_segment_byte_size = 12
148; HSA-VI: kernarg_segment_alignment = 4
149; EG: T{{[0-9]\.[XYZW]}}, KC0[2].Z
150; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
151; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
152; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8
define amdgpu_kernel void @f32_arg(float addrspace(1)* nocapture %out, float %in) nounwind {
entry:
  ; Store the float kernel argument straight to the output buffer.
  store float %in, float addrspace(1)* %out, align 4
  ret void
}
158
159; FUNC-LABEL: {{^}}v2i8_arg:
160; HSA-VI: kernarg_segment_byte_size = 12
161; HSA-VI: kernarg_segment_alignment = 4
162
163; EG: VTX_READ_8
164; EG: VTX_READ_8
165; MESA-GCN: buffer_load_ubyte
166; MESA-GCN: buffer_load_ubyte
167; HSA-VI: flat_load_ubyte
168; HSA-VI: flat_load_ubyte
define amdgpu_kernel void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) {
entry:
  ; Store the <2 x i8> kernel argument to the output buffer.
  store <2 x i8> %in, <2 x i8> addrspace(1)* %out
  ret void
}
174
175; FUNC-LABEL: {{^}}v2i16_arg:
176; HSA-VI: kernarg_segment_byte_size = 12
177; HSA-VI: kernarg_segment_alignment = 4
178
179; EG: VTX_READ_16
180; EG: VTX_READ_16
181
182; SI: buffer_load_ushort
183; SI: buffer_load_ushort
184
185; VI: s_load_dword s
define amdgpu_kernel void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) {
entry:
  ; Store the <2 x i16> kernel argument to the output buffer.
  store <2 x i16> %in, <2 x i16> addrspace(1)* %out
  ret void
}
191
192; FUNC-LABEL: {{^}}v2i32_arg:
193; HSA-VI: kernarg_segment_byte_size = 16
194; HSA-VI: kernarg_segment_alignment = 4
195
196; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X
197; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W
198; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
199; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c
200; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8
define amdgpu_kernel void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2 x i32> %in) nounwind {
entry:
  ; Store the <2 x i32> kernel argument to the output buffer.
  store <2 x i32> %in, <2 x i32> addrspace(1)* %out, align 4
  ret void
}
206
207; FUNC-LABEL: {{^}}v2f32_arg:
208; HSA-VI: kernarg_segment_byte_size = 16
209; HSA-VI: kernarg_segment_alignment = 4
210
211; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X
212; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W
213; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
214; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c
215; HSA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[4:5], 0x8
define amdgpu_kernel void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) nounwind {
entry:
  ; Store the <2 x float> kernel argument to the output buffer.
  store <2 x float> %in, <2 x float> addrspace(1)* %out, align 4
  ret void
}
221
222; FUNC-LABEL: {{^}}v3i8_arg:
223; HSA-VI: kernarg_segment_byte_size = 12
224; HSA-VI: kernarg_segment_alignment = 4
225
226; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 40
227; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 41
228; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 42
229; MESA-GCN: buffer_load_ubyte
230; MESA-GCN: buffer_load_ubyte
231; MESA-GCN: buffer_load_ubyte
232; HSA-VI: flat_load_ubyte
233; HSA-VI: flat_load_ubyte
234; HSA-VI: flat_load_ubyte
define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind {
entry:
  ; Store the <3 x i8> kernel argument to the output buffer.
  store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4
  ret void
}
240
241; FUNC-LABEL: {{^}}v3i16_arg:
242; HSA-VI: kernarg_segment_byte_size = 16
243; HSA-VI: kernarg_segment_alignment = 4
244
245; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 44
246; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 46
247; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 48
248; MESA-GCN: buffer_load_ushort
249; MESA-GCN: buffer_load_ushort
250; MESA-GCN: buffer_load_ushort
251; HSA-VI: flat_load_ushort
252; HSA-VI: flat_load_ushort
253; HSA-VI: flat_load_ushort
define amdgpu_kernel void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind {
entry:
  ; Store the <3 x i16> kernel argument to the output buffer.
  store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4
  ret void
}
259; FUNC-LABEL: {{^}}v3i32_arg:
260; HSA-VI: kernarg_segment_byte_size = 32
261; HSA-VI: kernarg_segment_alignment = 4
262; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
263; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
264; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
265; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
266; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34
267; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
define amdgpu_kernel void @v3i32_arg(<3 x i32> addrspace(1)* nocapture %out, <3 x i32> %in) nounwind {
entry:
  ; Store the <3 x i32> kernel argument to the output buffer.
  store <3 x i32> %in, <3 x i32> addrspace(1)* %out, align 4
  ret void
}
273
274; FUNC-LABEL: {{^}}v3f32_arg:
275; HSA-VI: kernarg_segment_byte_size = 32
276; HSA-VI: kernarg_segment_alignment = 4
277; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
278; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
279; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
280; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
281; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34
282; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
define amdgpu_kernel void @v3f32_arg(<3 x float> addrspace(1)* nocapture %out, <3 x float> %in) nounwind {
entry:
  ; Store the <3 x float> kernel argument to the output buffer.
  store <3 x float> %in, <3 x float> addrspace(1)* %out, align 4
  ret void
}
288
289; FUNC-LABEL: {{^}}v4i8_arg:
290; HSA-VI: kernarg_segment_byte_size = 12
291; HSA-VI: kernarg_segment_alignment = 4
292; EG: VTX_READ_8
293; EG: VTX_READ_8
294; EG: VTX_READ_8
295; EG: VTX_READ_8
296; MESA-GCN: buffer_load_ubyte
297; MESA-GCN: buffer_load_ubyte
298; MESA-GCN: buffer_load_ubyte
299; MESA-GCN: buffer_load_ubyte
300; HSA-VI: flat_load_ubyte
301; HSA-VI: flat_load_ubyte
302; HSA-VI: flat_load_ubyte
303; HSA-VI: flat_load_ubyte
define amdgpu_kernel void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) {
entry:
  ; Store the <4 x i8> kernel argument to the output buffer.
  store <4 x i8> %in, <4 x i8> addrspace(1)* %out
  ret void
}
309
310; FUNC-LABEL: {{^}}v4i16_arg:
311; HSA-VI: kernarg_segment_byte_size = 16
312; HSA-VI: kernarg_segment_alignment = 4
313; EG: VTX_READ_16
314; EG: VTX_READ_16
315; EG: VTX_READ_16
316; EG: VTX_READ_16
317
318; SI: buffer_load_ushort
319; SI: buffer_load_ushort
320; SI: buffer_load_ushort
321; SI: buffer_load_ushort
322
323; VI: s_load_dword s
324; VI: s_load_dword s
define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) {
entry:
  ; Store the <4 x i16> kernel argument to the output buffer.
  store <4 x i16> %in, <4 x i16> addrspace(1)* %out
  ret void
}
330
331; FUNC-LABEL: {{^}}v4i32_arg:
332; HSA-VI: kernarg_segment_byte_size = 32
333; HSA-VI: kernarg_segment_alignment = 4
334; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
335; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
336; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
337; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X
338
339; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
340; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34
341; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
define amdgpu_kernel void @v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> %in) nounwind {
entry:
  ; Store the <4 x i32> kernel argument to the output buffer.
  store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4
  ret void
}
347
348; FUNC-LABEL: {{^}}v4f32_arg:
349; HSA-VI: kernarg_segment_byte_size = 32
350; HSA-VI: kernarg_segment_alignment = 4
351; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
352; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
353; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
354; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X
355; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
356; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34
357; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
define amdgpu_kernel void @v4f32_arg(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) nounwind {
entry:
  ; Store the <4 x float> kernel argument to the output buffer.
  store <4 x float> %in, <4 x float> addrspace(1)* %out, align 4
  ret void
}
363
364; FUNC-LABEL: {{^}}v8i8_arg:
365; HSA-VI: kernarg_segment_byte_size = 16
366; HSA-VI: kernarg_segment_alignment = 4
367; EG: VTX_READ_8
368; EG: VTX_READ_8
369; EG: VTX_READ_8
370; EG: VTX_READ_8
371; EG: VTX_READ_8
372; EG: VTX_READ_8
373; EG: VTX_READ_8
374; EG: VTX_READ_8
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
define amdgpu_kernel void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) {
entry:
  ; Store the <8 x i8> kernel argument to the output buffer.
  store <8 x i8> %in, <8 x i8> addrspace(1)* %out
  ret void
}
395
396; FUNC-LABEL: {{^}}v8i16_arg:
397; HSA-VI: kernarg_segment_byte_size = 32
398; HSA-VI: kernarg_segment_alignment = 4
399; EG: VTX_READ_16
400; EG: VTX_READ_16
401; EG: VTX_READ_16
402; EG: VTX_READ_16
403; EG: VTX_READ_16
404; EG: VTX_READ_16
405; EG: VTX_READ_16
406; EG: VTX_READ_16
407
408; SI: buffer_load_ushort
409; SI: buffer_load_ushort
410; SI: buffer_load_ushort
411; SI: buffer_load_ushort
412; SI: buffer_load_ushort
413; SI: buffer_load_ushort
414; SI: buffer_load_ushort
415; SI: buffer_load_ushort
416
417; VI: s_load_dword s
418; VI: s_load_dword s
419; VI: s_load_dword s
420; VI: s_load_dword s
define amdgpu_kernel void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) {
entry:
  ; Store the <8 x i16> kernel argument to the output buffer.
  store <8 x i16> %in, <8 x i16> addrspace(1)* %out
  ret void
}
426
427; FUNC-LABEL: {{^}}v8i32_arg:
428; HSA-VI: kernarg_segment_byte_size = 64
429; HSA-VI: kernarg_segment_alignment = 5
430; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
431; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
432; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
433; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X
434; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y
435; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
436; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
437; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X
438; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11
439; MESA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44
440; HSA-VI: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x20
define amdgpu_kernel void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind {
entry:
  ; Store the <8 x i32> kernel argument to the output buffer.
  store <8 x i32> %in, <8 x i32> addrspace(1)* %out, align 4
  ret void
}
446
447; FUNC-LABEL: {{^}}v8f32_arg:
448; HSA-VI: kernarg_segment_byte_size = 64
449; HSA-VI: kernarg_segment_alignment = 5
450; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
451; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
452; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
453; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X
454; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y
455; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
456; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
457; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X
; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11
; MESA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44
; HSA-VI: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x20
define amdgpu_kernel void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind {
entry:
  ; Store the <8 x float> kernel argument to the output buffer.
  store <8 x float> %in, <8 x float> addrspace(1)* %out, align 4
  ret void
}
464
465; FUNC-LABEL: {{^}}v16i8_arg:
466; HSA-VI: kernarg_segment_byte_size = 32
467; HSA-VI: kernarg_segment_alignment = 4
468; EG: VTX_READ_8
469; EG: VTX_READ_8
470; EG: VTX_READ_8
471; EG: VTX_READ_8
472; EG: VTX_READ_8
473; EG: VTX_READ_8
474; EG: VTX_READ_8
475; EG: VTX_READ_8
476; EG: VTX_READ_8
477; EG: VTX_READ_8
478; EG: VTX_READ_8
479; EG: VTX_READ_8
480; EG: VTX_READ_8
481; EG: VTX_READ_8
482; EG: VTX_READ_8
483; EG: VTX_READ_8
484; MESA-GCN: buffer_load_ubyte
485; MESA-GCN: buffer_load_ubyte
486; MESA-GCN: buffer_load_ubyte
487; MESA-GCN: buffer_load_ubyte
488; MESA-GCN: buffer_load_ubyte
489; MESA-GCN: buffer_load_ubyte
490; MESA-GCN: buffer_load_ubyte
491; MESA-GCN: buffer_load_ubyte
492; MESA-GCN: buffer_load_ubyte
493; MESA-GCN: buffer_load_ubyte
494; MESA-GCN: buffer_load_ubyte
495; MESA-GCN: buffer_load_ubyte
496; MESA-GCN: buffer_load_ubyte
497; MESA-GCN: buffer_load_ubyte
498; MESA-GCN: buffer_load_ubyte
499; MESA-GCN: buffer_load_ubyte
500; HSA-VI: flat_load_ubyte
501; HSA-VI: flat_load_ubyte
502; HSA-VI: flat_load_ubyte
503; HSA-VI: flat_load_ubyte
504; HSA-VI: flat_load_ubyte
505; HSA-VI: flat_load_ubyte
506; HSA-VI: flat_load_ubyte
507; HSA-VI: flat_load_ubyte
508; HSA-VI: flat_load_ubyte
509; HSA-VI: flat_load_ubyte
510; HSA-VI: flat_load_ubyte
511; HSA-VI: flat_load_ubyte
512; HSA-VI: flat_load_ubyte
513; HSA-VI: flat_load_ubyte
514; HSA-VI: flat_load_ubyte
515; HSA-VI: flat_load_ubyte
define amdgpu_kernel void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) {
entry:
  ; Store the <16 x i8> kernel argument to the output buffer.
  store <16 x i8> %in, <16 x i8> addrspace(1)* %out
  ret void
}
521
522; FUNC-LABEL: {{^}}v16i16_arg:
523; HSA-VI: kernarg_segment_byte_size = 64
524; HSA-VI: kernarg_segment_alignment = 5
525; EG: VTX_READ_16
526; EG: VTX_READ_16
527; EG: VTX_READ_16
528; EG: VTX_READ_16
529; EG: VTX_READ_16
530; EG: VTX_READ_16
531; EG: VTX_READ_16
532; EG: VTX_READ_16
533; EG: VTX_READ_16
534; EG: VTX_READ_16
535; EG: VTX_READ_16
536; EG: VTX_READ_16
537; EG: VTX_READ_16
538; EG: VTX_READ_16
539; EG: VTX_READ_16
540; EG: VTX_READ_16
541
542; SI: buffer_load_ushort
543; SI: buffer_load_ushort
544; SI: buffer_load_ushort
545; SI: buffer_load_ushort
546; SI: buffer_load_ushort
547; SI: buffer_load_ushort
548; SI: buffer_load_ushort
549; SI: buffer_load_ushort
550; SI: buffer_load_ushort
551; SI: buffer_load_ushort
552; SI: buffer_load_ushort
553; SI: buffer_load_ushort
554; SI: buffer_load_ushort
555; SI: buffer_load_ushort
556; SI: buffer_load_ushort
557; SI: buffer_load_ushort
558
559; VI: s_load_dword s
560; VI: s_load_dword s
561; VI: s_load_dword s
562; VI: s_load_dword s
563; VI: s_load_dword s
564; VI: s_load_dword s
565; VI: s_load_dword s
566; VI: s_load_dword s
define amdgpu_kernel void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) {
entry:
  ; Store the <16 x i16> kernel argument to the output buffer.
  store <16 x i16> %in, <16 x i16> addrspace(1)* %out
  ret void
}
572
573; FUNC-LABEL: {{^}}v16i32_arg:
574; HSA-VI: kernarg_segment_byte_size = 128
575; HSA-VI: kernarg_segment_alignment = 6
576; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
577; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
578; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
579; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X
580; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y
581; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z
582; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W
583; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
584; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
585; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
586; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W
587; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X
588; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y
589; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z
590; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W
591; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
592; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19
593; MESA-VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64
594; HSA-VI: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
define amdgpu_kernel void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind {
entry:
  ; Store the <16 x i32> kernel argument to the output buffer.
  store <16 x i32> %in, <16 x i32> addrspace(1)* %out, align 4
  ret void
}
600
601; FUNC-LABEL: {{^}}v16f32_arg:
602; HSA-VI: kernarg_segment_byte_size = 128
603; HSA-VI: kernarg_segment_alignment = 6
604; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
605; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
606; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
607; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X
608; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y
609; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z
610; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W
611; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
612; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
613; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
614; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W
615; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X
616; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y
617; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z
618; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W
619; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
620; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19
621; MESA-VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64
622; HSA-VI: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
define amdgpu_kernel void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind {
entry:
  ; Store the <16 x float> kernel argument to the output buffer.
  store <16 x float> %in, <16 x float> addrspace(1)* %out, align 4
  ret void
}
628
629; FUNC-LABEL: {{^}}kernel_arg_i64:
630; MESA-GCN: s_load_dwordx2
631; MESA-GCN: s_load_dwordx2
632; MESA-GCN: buffer_store_dwordx2
633; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8
define amdgpu_kernel void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind {
  ; Store the i64 kernel argument to the output buffer.
  store i64 %a, i64 addrspace(1)* %out, align 8
  ret void
}
638
639; FUNC-LABEL: {{^}}f64_kernel_arg:
640; SI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x9
641; SI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0xb
642; MESA-VI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x24
643; MESA-VI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x2c
644; MESA-GCN: buffer_store_dwordx2
645; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8
define amdgpu_kernel void @f64_kernel_arg(double addrspace(1)* %out, double  %in) {
entry:
  ; Store the double kernel argument to the output buffer.
  store double %in, double addrspace(1)* %out
  ret void
}
651
652; XFUNC-LABEL: {{^}}kernel_arg_v1i64:
653; XGCN: s_load_dwordx2
654; XGCN: s_load_dwordx2
655; XGCN: buffer_store_dwordx2
656; define amdgpu_kernel void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind {
657;   store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8
658;   ret void
659; }
660
661; FUNC-LABEL: {{^}}i1_arg:
662; HSA-VI: kernarg_segment_byte_size = 12
663; HSA-VI: kernarg_segment_alignment = 4
664
665; SI: buffer_load_ubyte
666; SI: v_and_b32_e32
667; SI: buffer_store_byte
668; SI: s_endpgm
define amdgpu_kernel void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind {
  ; Store the i1 kernel argument to the output buffer.
  store i1 %x, i1 addrspace(1)* %out, align 1
  ret void
}
673
674; FUNC-LABEL: {{^}}i1_arg_zext_i32:
675; HSA-VI: kernarg_segment_byte_size = 12
676; HSA-VI: kernarg_segment_alignment = 4
677
678; SI: buffer_load_ubyte
679; SI: buffer_store_dword
680; SI: s_endpgm
define amdgpu_kernel void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
  ; Zero-extend the i1 kernel argument (0 or 1) to i32 and store it.
  %zext.val = zext i1 %x to i32
  store i32 %zext.val, i32 addrspace(1)* %out, align 4
  ret void
}
686
687; FUNC-LABEL: {{^}}i1_arg_zext_i64:
688; HSA-VI: kernarg_segment_byte_size = 12
689; HSA-VI: kernarg_segment_alignment = 4
690
691; SI: buffer_load_ubyte
692; SI: buffer_store_dwordx2
693; SI: s_endpgm
define amdgpu_kernel void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
  ; Zero-extend the i1 kernel argument (0 or 1) to i64 and store it.
  %zext.val = zext i1 %x to i64
  store i64 %zext.val, i64 addrspace(1)* %out, align 8
  ret void
}
699
700; FUNC-LABEL: {{^}}i1_arg_sext_i32:
701; HSA-VI: kernarg_segment_byte_size = 12
702; HSA-VI: kernarg_segment_alignment = 4
703
704; SI: buffer_load_ubyte
705; SI: buffer_store_dword
706; SI: s_endpgm
define amdgpu_kernel void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
  ; Sign-extend the i1 kernel argument (result is 0 or -1) and store it.
  %ext = sext i1 %x to i32
  ; Fixed: original read "i32addrspace(1)*" (missing space), which is not a
  ; valid LLVM IR pointer type and fails to parse.
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}
712
713; FUNC-LABEL: {{^}}i1_arg_sext_i64:
714; HSA-VI: kernarg_segment_byte_size = 12
715; HSA-VI: kernarg_segment_alignment = 4
716
717; SI: buffer_load_ubyte
718; SI: v_bfe_i32
719; SI: v_ashrrev_i32
720; SI: buffer_store_dwordx2
721; SI: s_endpgm
define amdgpu_kernel void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
  ; Sign-extend the i1 kernel argument (result is 0 or -1) to i64 and store it.
  %sext.val = sext i1 %x to i64
  store i64 %sext.val, i64 addrspace(1)* %out, align 8
  ret void
}
727