; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s --check-prefixes=SI,GCN,MESA-GCN,FUNC
; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefixes=VI,GCN,MESA-VI,MESA-GCN,FUNC
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs | FileCheck %s --check-prefixes=VI,GCN,HSA-VI,FUNC
; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck %s --check-prefix=EG --check-prefix=FUNC
; RUN: llc < %s -march=r600 -mcpu=cayman -verify-machineinstrs | FileCheck %s --check-prefix=EG --check-prefix=FUNC

; FUNC-LABEL: {{^}}i8_arg:
; HSA-VI: kernarg_segment_alignment = 4
; EG: AND_INT {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
; MESA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
; MESA-GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff
; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
; FIXME: Should be using s_load_dword
; HSA-VI: flat_load_ubyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]

define amdgpu_kernel void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind {
entry:
  %0 = zext i8 %in to i32
  store i32 %0, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i8_zext_arg:
; HSA-VI: kernarg_segment_alignment = 4
; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
; FIXME: Should be using s_load_dword
; HSA-VI: flat_load_ubyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]

define amdgpu_kernel void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind {
entry:
  %0 = zext i8 %in to i32
  store i32 %0, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i8_sext_arg:
; HSA-VI: kernarg_segment_alignment = 4
; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
; FIXME: Should be using s_load_dword
; HSA-VI: flat_load_sbyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]

define amdgpu_kernel void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind {
entry:
  %0 = sext i8 %in to i32
  store i32 %0, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i16_arg:
; HSA-VI: kernarg_segment_alignment = 4
; EG: AND_INT {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
; MESA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
; MESA-GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff
; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
; FIXME: Should be using s_load_dword
; HSA-VI: flat_load_ushort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]

define amdgpu_kernel void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind {
entry:
  %0 = zext i16 %in to i32
  store i32 %0, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i16_zext_arg:
; HSA-VI: kernarg_segment_alignment = 4
; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
; FIXME: Should be using s_load_dword
; HSA-VI: flat_load_ushort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]

define amdgpu_kernel void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind {
entry:
  %0 = zext i16 %in to i32
  store i32 %0, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i16_sext_arg:
; HSA-VI: kernarg_segment_alignment = 4
; EG: MOV {{[ *]*}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
; HSA-VI: s_add_u32 [[SPTR_LO:s[0-9]+]], s4, 8
; HSA-VI: s_addc_u32 [[SPTR_HI:s[0-9]+]], s5, 0
; HSA-VI: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], [[SPTR_LO]]
; HSA-VI: v_mov_b32_e32 v[[VPTR_HI:[0-9]+]], [[SPTR_HI]]
; FIXME: Should be using s_load_dword
; HSA-VI: flat_load_sshort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]

define amdgpu_kernel void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind {
entry:
  %0 = sext i16 %in to i32
  store i32 %0, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i32_arg:
; HSA-VI: kernarg_segment_alignment = 4
; EG: T{{[0-9]\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
; HSA-VI: s_load_dword s{{[0-9]}}, s[4:5], 0x8
define amdgpu_kernel void @i32_arg(i32 addrspace(1)* nocapture %out, i32 %in) nounwind {
entry:
  store i32 %in, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}f32_arg:
; HSA-VI: kernarg_segment_alignment = 4
; EG: T{{[0-9]\.[XYZW]}}, KC0[2].Z
; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8
define amdgpu_kernel void @f32_arg(float addrspace(1)* nocapture %out, float %in) nounwind {
entry:
  store float %in, float addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v2i8_arg:
; HSA-VI: kernarg_segment_alignment = 4
; EG: VTX_READ_8
; EG: VTX_READ_8
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
define amdgpu_kernel void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) {
entry:
  store <2 x i8> %in, <2 x i8> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v2i16_arg:
; HSA-VI: kernarg_segment_alignment = 4
; EG: VTX_READ_16
; EG: VTX_READ_16
; MESA-GCN: buffer_load_ushort
; MESA-GCN: buffer_load_ushort
; HSA-VI: flat_load_ushort
; HSA-VI: flat_load_ushort
define amdgpu_kernel void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) {
entry:
  store <2 x i16> %in, <2 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v2i32_arg:
; HSA-VI: kernarg_segment_alignment = 4
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W
; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c
; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8
define amdgpu_kernel void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2 x i32> %in) nounwind {
entry:
  store <2 x i32> %in, <2 x i32> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v2f32_arg:
; HSA-VI: kernarg_segment_alignment = 4
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[2].W
; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c
; HSA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[4:5], 0x8
define amdgpu_kernel void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) nounwind {
entry:
  store <2 x float> %in, <2 x float> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v3i8_arg:
; HSA-VI: kernarg_segment_alignment = 4
; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 40
; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 41
; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 42
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind {
entry:
  store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v3i16_arg:
; HSA-VI: kernarg_segment_alignment = 4
; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 44
; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 46
; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 48
; MESA-GCN: buffer_load_ushort
; MESA-GCN: buffer_load_ushort
; MESA-GCN: buffer_load_ushort
; HSA-VI: flat_load_ushort
; HSA-VI: flat_load_ushort
; HSA-VI: flat_load_ushort
define amdgpu_kernel void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind {
entry:
  store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v3i32_arg:
; HSA-VI: kernarg_segment_alignment = 4
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34
; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
define amdgpu_kernel void @v3i32_arg(<3 x i32> addrspace(1)* nocapture %out, <3 x i32> %in) nounwind {
entry:
  store <3 x i32> %in, <3 x i32> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v3f32_arg:
; HSA-VI: kernarg_segment_alignment = 4
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34
; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
define amdgpu_kernel void @v3f32_arg(<3 x float> addrspace(1)* nocapture %out, <3 x float> %in) nounwind {
entry:
  store <3 x float> %in, <3 x float> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v4i8_arg:
; HSA-VI: kernarg_segment_alignment = 4
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
define amdgpu_kernel void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) {
entry:
  store <4 x i8> %in, <4 x i8> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v4i16_arg:
; HSA-VI: kernarg_segment_alignment = 4
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; MESA-GCN: buffer_load_ushort
; MESA-GCN: buffer_load_ushort
; MESA-GCN: buffer_load_ushort
; MESA-GCN: buffer_load_ushort
; HSA-VI: flat_load_ushort
; HSA-VI: flat_load_ushort
; HSA-VI: flat_load_ushort
; HSA-VI: flat_load_ushort
define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) {
entry:
  store <4 x i16> %in, <4 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v4i32_arg:
; HSA-VI: kernarg_segment_alignment = 4
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X
; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34
; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
define amdgpu_kernel void @v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> %in) nounwind {
entry:
  store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v4f32_arg:
; HSA-VI: kernarg_segment_alignment = 4
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[3].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].X
; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34
; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
define amdgpu_kernel void @v4f32_arg(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) nounwind {
entry:
  store <4 x float> %in, <4 x float> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v8i8_arg:
; HSA-VI: kernarg_segment_alignment = 4
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
define amdgpu_kernel void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) {
entry:
  store <8 x i8> %in, <8 x i8> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v8i16_arg:
; HSA-VI: kernarg_segment_alignment = 4
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; MESA-GCN: buffer_load_ushort
; MESA-GCN: buffer_load_ushort
; MESA-GCN: buffer_load_ushort
; MESA-GCN: buffer_load_ushort
; MESA-GCN: buffer_load_ushort
; MESA-GCN: buffer_load_ushort
; MESA-GCN: buffer_load_ushort
; MESA-GCN: buffer_load_ushort
; HSA-VI: flat_load_ushort
; HSA-VI: flat_load_ushort
; HSA-VI: flat_load_ushort
; HSA-VI: flat_load_ushort
; HSA-VI: flat_load_ushort
; HSA-VI: flat_load_ushort
; HSA-VI: flat_load_ushort
; HSA-VI: flat_load_ushort
define amdgpu_kernel void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) {
entry:
  store <8 x i16> %in, <8 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v8i32_arg:
; HSA-VI: kernarg_segment_alignment = 5
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X
; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11
; MESA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44
; HSA-VI: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x20
define amdgpu_kernel void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind {
entry:
  store <8 x i32> %in, <8 x i32> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v8f32_arg:
; HSA-VI: kernarg_segment_alignment = 5
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[4].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X
; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11
define amdgpu_kernel void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind {
entry:
  store <8 x float> %in, <8 x float> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v16i8_arg:
; HSA-VI: kernarg_segment_alignment = 4
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; EG: VTX_READ_8
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; MESA-GCN: buffer_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
; HSA-VI: flat_load_ubyte
define amdgpu_kernel void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) {
entry:
  store <16 x i8> %in, <16 x i8> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v16i16_arg:
; HSA-VI: kernarg_segment_alignment = 5
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; EG: VTX_READ_16
; MESA-GCN: buffer_load_ushort
; MESA-GCN: buffer_load_ushort
; MESA-GCN: buffer_load_ushort
; MESA-GCN: buffer_load_ushort
; MESA-GCN: buffer_load_ushort
; MESA-GCN: buffer_load_ushort
; MESA-GCN: buffer_load_ushort
; MESA-GCN: buffer_load_ushort
; MESA-GCN: buffer_load_ushort
; MESA-GCN: buffer_load_ushort
; MESA-GCN: buffer_load_ushort
; MESA-GCN: buffer_load_ushort
; MESA-GCN: buffer_load_ushort
; MESA-GCN: buffer_load_ushort
; MESA-GCN: buffer_load_ushort
; MESA-GCN: buffer_load_ushort
; HSA-VI: flat_load_ushort
; HSA-VI: flat_load_ushort
; HSA-VI: flat_load_ushort
; HSA-VI: flat_load_ushort
; HSA-VI: flat_load_ushort
; HSA-VI: flat_load_ushort
; HSA-VI: flat_load_ushort
; HSA-VI: flat_load_ushort
; HSA-VI: flat_load_ushort
; HSA-VI: flat_load_ushort
; HSA-VI: flat_load_ushort
; HSA-VI: flat_load_ushort
; HSA-VI: flat_load_ushort
; HSA-VI: flat_load_ushort
; HSA-VI: flat_load_ushort
; HSA-VI: flat_load_ushort
define amdgpu_kernel void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) {
entry:
  store <16 x i16> %in, <16 x i16> addrspace(1)* %out
  ret void
}

; FUNC-LABEL: {{^}}v16i32_arg:
; HSA-VI: kernarg_segment_alignment = 6
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19
; MESA-VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64
; HSA-VI: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
define amdgpu_kernel void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind {
entry:
  store <16 x i32> %in, <16 x i32> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}v16f32_arg:
; HSA-VI: kernarg_segment_alignment = 6
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[7].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[8].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].X
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Y
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19
; MESA-VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64
; HSA-VI: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
define amdgpu_kernel void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind {
entry:
  store <16 x float> %in, <16 x float> addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}kernel_arg_i64:
; MESA-GCN: s_load_dwordx2
; MESA-GCN: s_load_dwordx2
; MESA-GCN: buffer_store_dwordx2
; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8
define amdgpu_kernel void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind {
  store i64 %a, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}f64_kernel_arg:
; SI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x9
; SI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0xb
; MESA-VI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x24
; MESA-VI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x2c
; MESA-GCN: buffer_store_dwordx2
; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8
define amdgpu_kernel void @f64_kernel_arg(double addrspace(1)* %out, double %in) {
entry:
  store double %in, double addrspace(1)* %out
  ret void
}

; XFUNC-LABEL: {{^}}kernel_arg_v1i64:
; XGCN: s_load_dwordx2
; XGCN: s_load_dwordx2
; XGCN: buffer_store_dwordx2
; define amdgpu_kernel void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind {
;   store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8
;   ret void
; }

; FUNC-LABEL: {{^}}i1_arg:
; SI: buffer_load_ubyte
; SI: v_and_b32_e32
; SI: buffer_store_byte
; SI: s_endpgm
define amdgpu_kernel void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind {
  store i1 %x, i1 addrspace(1)* %out, align 1
  ret void
}

; FUNC-LABEL: {{^}}i1_arg_zext_i32:
; SI: buffer_load_ubyte
; SI: buffer_store_dword
; SI: s_endpgm
define amdgpu_kernel void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
  %ext = zext i1 %x to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i1_arg_zext_i64:
; SI: buffer_load_ubyte
; SI: buffer_store_dwordx2
; SI: s_endpgm
define amdgpu_kernel void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
  %ext = zext i1 %x to i64
  store i64 %ext, i64 addrspace(1)* %out, align 8
  ret void
}

; FUNC-LABEL: {{^}}i1_arg_sext_i32:
; SI: buffer_load_ubyte
; SI: buffer_store_dword
; SI: s_endpgm
define amdgpu_kernel void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
  %ext = sext i1 %x to i32
  store i32 %ext, i32 addrspace(1)* %out, align 4
  ret void
}

; FUNC-LABEL: {{^}}i1_arg_sext_i64:
; SI: buffer_load_ubyte
; SI: v_bfe_i32
; SI: v_ashrrev_i32
; SI: buffer_store_dwordx2
; SI: s_endpgm
define amdgpu_kernel void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
  %ext = sext i1 %x to i64
  store i64 %ext, i64 addrspace(1)* %out, align 8
  ret void
}