1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -enable-var-scope -check-prefix=MUBUF %s
3; RUN: llc -global-isel -mattr=+enable-flat-scratch -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -enable-var-scope -check-prefix=FLATSCR %s
4
5; Test end-to-end codegen for outgoing arguments passed on the
6; stack. This test is likely redundant when all DAG and GlobalISel
7; tests are unified.
8
; External callees used by the tests below; they are only declared here.
; The first takes two <16 x i32> vectors followed by a <4 x i32>; the second
; takes a single byval([16 x i32]) pointer in addrspace(5).
9declare hidden void @external_void_func_v16i32_v16i32_v4i32(<16 x i32>, <16 x i32>, <4 x i32>) #0
10declare hidden void @external_void_func_byval([16 x i32] addrspace(5)* byval([16 x i32])) #0
11
; Kernel entry point that calls @external_void_func_v16i32_v16i32_v4i32 with
; two undef <16 x i32> operands and the constant vector <9, 10, 11, 12>.
; For both RUN lines the expected output sets s32 to 0 and stores 9, 10, 11
; and 12 at offsets 4, 8, 12 and 16 from s32 before the s_swappc_b64 call,
; i.e. the <4 x i32> operand is the part passed in memory (presumably because
; the two <16 x i32> operands use up the argument registers -- confirm against
; the calling convention).
12define amdgpu_kernel void @kernel_caller_stack() {
13; MUBUF-LABEL: kernel_caller_stack:
14; MUBUF:       ; %bb.0:
15; MUBUF-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
16; MUBUF-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
17; MUBUF-NEXT:    s_add_u32 s0, s0, s7
18; MUBUF-NEXT:    s_mov_b32 s32, 0
19; MUBUF-NEXT:    s_addc_u32 s1, s1, 0
20; MUBUF-NEXT:    v_mov_b32_e32 v0, 9
21; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
22; MUBUF-NEXT:    v_mov_b32_e32 v0, 10
23; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
24; MUBUF-NEXT:    v_mov_b32_e32 v0, 11
25; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
26; MUBUF-NEXT:    v_mov_b32_e32 v0, 12
27; MUBUF-NEXT:    s_getpc_b64 s[4:5]
28; MUBUF-NEXT:    s_add_u32 s4, s4, external_void_func_v16i32_v16i32_v4i32@rel32@lo+4
29; MUBUF-NEXT:    s_addc_u32 s5, s5, external_void_func_v16i32_v16i32_v4i32@rel32@hi+12
30; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
31; MUBUF-NEXT:    s_swappc_b64 s[30:31], s[4:5]
32; MUBUF-NEXT:    s_endpgm
33;
34; FLATSCR-LABEL: kernel_caller_stack:
35; FLATSCR:       ; %bb.0:
36; FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
37; FLATSCR-NEXT:    s_mov_b32 s32, 0
38; FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
39; FLATSCR-NEXT:    v_mov_b32_e32 v0, 9
40; FLATSCR-NEXT:    scratch_store_dword off, v0, s32 offset:4
41; FLATSCR-NEXT:    v_mov_b32_e32 v0, 10
42; FLATSCR-NEXT:    scratch_store_dword off, v0, s32 offset:8
43; FLATSCR-NEXT:    v_mov_b32_e32 v0, 11
44; FLATSCR-NEXT:    scratch_store_dword off, v0, s32 offset:12
45; FLATSCR-NEXT:    v_mov_b32_e32 v0, 12
46; FLATSCR-NEXT:    s_getpc_b64 s[0:1]
47; FLATSCR-NEXT:    s_add_u32 s0, s0, external_void_func_v16i32_v16i32_v4i32@rel32@lo+4
48; FLATSCR-NEXT:    s_addc_u32 s1, s1, external_void_func_v16i32_v16i32_v4i32@rel32@hi+12
49; FLATSCR-NEXT:    scratch_store_dword off, v0, s32 offset:16
50; FLATSCR-NEXT:    s_swappc_b64 s[30:31], s[0:1]
51; FLATSCR-NEXT:    s_endpgm
; Only the trailing <4 x i32> carries defined values; both wide vectors are undef.
52  call void @external_void_func_v16i32_v16i32_v4i32(<16 x i32> undef, <16 x i32> undef, <4 x i32> <i32 9, i32 10, i32 11, i32 12>)
53  ret void
54}
55
; Kernel entry point that zero-fills an addrspace(5) [16 x i32] alloca with
; llvm.memset and then passes that alloca byval to @external_void_func_byval.
; The expected output first emits the zero stores produced by the memset,
; then loads 64 bytes back starting at offset 8 and stores them at s32 + 0
; onwards as the outgoing byval copy before the s_swappc_b64 call.
; NOTE(review): the memset length below is 128 bytes, but [16 x i32] is only
; 64 bytes. The checks accordingly contain 128 bytes of zero stores (offsets
; 8 through 132) while only 64 bytes are copied for the call. Confirm whether
; the over-long memset is intentional before regenerating the assertions.
56define amdgpu_kernel void @kernel_caller_byval() {
57; MUBUF-LABEL: kernel_caller_byval:
58; MUBUF:       ; %bb.0:
59; MUBUF-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
60; MUBUF-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
61; MUBUF-NEXT:    s_add_u32 s0, s0, s7
62; MUBUF-NEXT:    s_addc_u32 s1, s1, 0
63; MUBUF-NEXT:    v_mov_b32_e32 v0, 0
64; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:8
65; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:12
66; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:16
67; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:20
68; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:24
69; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:28
70; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:32
71; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:36
72; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:40
73; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:44
74; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:48
75; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:52
76; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:56
77; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:60
78; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:64
79; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:68
80; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:72
81; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:76
82; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:80
83; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:84
84; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:88
85; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:92
86; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:96
87; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:100
88; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:104
89; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:108
90; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:112
91; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:116
92; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:120
93; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:124
94; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:128
95; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:132
96; MUBUF-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8
97; MUBUF-NEXT:    s_nop 0
98; MUBUF-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:12
99; MUBUF-NEXT:    buffer_load_dword v2, off, s[0:3], 0 offset:16
100; MUBUF-NEXT:    buffer_load_dword v3, off, s[0:3], 0 offset:20
101; MUBUF-NEXT:    buffer_load_dword v4, off, s[0:3], 0 offset:24
102; MUBUF-NEXT:    buffer_load_dword v5, off, s[0:3], 0 offset:28
103; MUBUF-NEXT:    buffer_load_dword v6, off, s[0:3], 0 offset:32
104; MUBUF-NEXT:    buffer_load_dword v7, off, s[0:3], 0 offset:36
105; MUBUF-NEXT:    buffer_load_dword v8, off, s[0:3], 0 offset:40
106; MUBUF-NEXT:    buffer_load_dword v9, off, s[0:3], 0 offset:44
107; MUBUF-NEXT:    buffer_load_dword v10, off, s[0:3], 0 offset:48
108; MUBUF-NEXT:    buffer_load_dword v11, off, s[0:3], 0 offset:52
109; MUBUF-NEXT:    buffer_load_dword v12, off, s[0:3], 0 offset:56
110; MUBUF-NEXT:    buffer_load_dword v13, off, s[0:3], 0 offset:60
111; MUBUF-NEXT:    buffer_load_dword v14, off, s[0:3], 0 offset:64
112; MUBUF-NEXT:    buffer_load_dword v15, off, s[0:3], 0 offset:68
113; MUBUF-NEXT:    s_movk_i32 s32, 0x1400
114; MUBUF-NEXT:    s_getpc_b64 s[4:5]
115; MUBUF-NEXT:    s_add_u32 s4, s4, external_void_func_byval@rel32@lo+4
116; MUBUF-NEXT:    s_addc_u32 s5, s5, external_void_func_byval@rel32@hi+12
117; MUBUF-NEXT:    s_waitcnt vmcnt(15)
118; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32
119; MUBUF-NEXT:    s_waitcnt vmcnt(15)
120; MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:4
121; MUBUF-NEXT:    s_waitcnt vmcnt(15)
122; MUBUF-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:8
123; MUBUF-NEXT:    s_waitcnt vmcnt(15)
124; MUBUF-NEXT:    buffer_store_dword v3, off, s[0:3], s32 offset:12
125; MUBUF-NEXT:    s_waitcnt vmcnt(15)
126; MUBUF-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:16
127; MUBUF-NEXT:    s_waitcnt vmcnt(15)
128; MUBUF-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:20
129; MUBUF-NEXT:    s_waitcnt vmcnt(15)
130; MUBUF-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:24
131; MUBUF-NEXT:    s_waitcnt vmcnt(15)
132; MUBUF-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:28
133; MUBUF-NEXT:    s_waitcnt vmcnt(15)
134; MUBUF-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:32
135; MUBUF-NEXT:    s_waitcnt vmcnt(15)
136; MUBUF-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:36
137; MUBUF-NEXT:    s_waitcnt vmcnt(15)
138; MUBUF-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:40
139; MUBUF-NEXT:    s_waitcnt vmcnt(15)
140; MUBUF-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:44
141; MUBUF-NEXT:    s_waitcnt vmcnt(15)
142; MUBUF-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:48
143; MUBUF-NEXT:    s_waitcnt vmcnt(15)
144; MUBUF-NEXT:    buffer_store_dword v13, off, s[0:3], s32 offset:52
145; MUBUF-NEXT:    s_waitcnt vmcnt(15)
146; MUBUF-NEXT:    buffer_store_dword v14, off, s[0:3], s32 offset:56
147; MUBUF-NEXT:    s_waitcnt vmcnt(15)
148; MUBUF-NEXT:    buffer_store_dword v15, off, s[0:3], s32 offset:60
149; MUBUF-NEXT:    s_swappc_b64 s[30:31], s[4:5]
150; MUBUF-NEXT:    s_endpgm
151;
152; FLATSCR-LABEL: kernel_caller_byval:
153; FLATSCR:       ; %bb.0:
154; FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
155; FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
156; FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
157; FLATSCR-NEXT:    v_mov_b32_e32 v1, 0
158; FLATSCR-NEXT:    s_mov_b32 vcc_hi, 0
159; FLATSCR-NEXT:    s_mov_b32 s33, 0
160; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], vcc_hi offset:8
161; FLATSCR-NEXT:    s_mov_b32 vcc_hi, 0
162; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s33 offset:72
163; FLATSCR-NEXT:    s_mov_b32 s33, 0
164; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], vcc_hi offset:16
165; FLATSCR-NEXT:    s_mov_b32 vcc_hi, 0
166; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s33 offset:80
167; FLATSCR-NEXT:    s_mov_b32 s33, 0
168; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], vcc_hi offset:24
169; FLATSCR-NEXT:    s_mov_b32 vcc_hi, 0
170; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s33 offset:88
171; FLATSCR-NEXT:    s_mov_b32 s33, 0
172; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], vcc_hi offset:32
173; FLATSCR-NEXT:    s_mov_b32 vcc_hi, 0
174; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s33 offset:96
175; FLATSCR-NEXT:    s_mov_b32 s33, 0
176; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], vcc_hi offset:40
177; FLATSCR-NEXT:    s_mov_b32 vcc_hi, 0
178; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s33 offset:104
179; FLATSCR-NEXT:    s_mov_b32 s33, 0
180; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], vcc_hi offset:48
181; FLATSCR-NEXT:    s_mov_b32 vcc_hi, 0
182; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s33 offset:112
183; FLATSCR-NEXT:    s_mov_b32 s33, 0
184; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], vcc_hi offset:56
185; FLATSCR-NEXT:    s_mov_b32 vcc_hi, 0
186; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s33 offset:120
187; FLATSCR-NEXT:    s_mov_b32 s33, 0
188; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], vcc_hi offset:64
189; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s33 offset:128
190; FLATSCR-NEXT:    s_mov_b32 s33, 0
191; FLATSCR-NEXT:    scratch_load_dwordx2 v[0:1], off, s33 offset:8
192; FLATSCR-NEXT:    s_mov_b32 s33, 0
193; FLATSCR-NEXT:    scratch_load_dwordx2 v[2:3], off, s33 offset:16
194; FLATSCR-NEXT:    s_mov_b32 s33, 0
195; FLATSCR-NEXT:    scratch_load_dwordx2 v[4:5], off, s33 offset:24
196; FLATSCR-NEXT:    s_mov_b32 s33, 0
197; FLATSCR-NEXT:    scratch_load_dwordx2 v[6:7], off, s33 offset:32
198; FLATSCR-NEXT:    s_mov_b32 s33, 0
199; FLATSCR-NEXT:    scratch_load_dwordx2 v[8:9], off, s33 offset:40
200; FLATSCR-NEXT:    s_mov_b32 s33, 0
201; FLATSCR-NEXT:    scratch_load_dwordx2 v[10:11], off, s33 offset:48
202; FLATSCR-NEXT:    s_mov_b32 s33, 0
203; FLATSCR-NEXT:    scratch_load_dwordx2 v[12:13], off, s33 offset:56
204; FLATSCR-NEXT:    s_mov_b32 s33, 0
205; FLATSCR-NEXT:    scratch_load_dwordx2 v[14:15], off, s33 offset:64
206; FLATSCR-NEXT:    s_movk_i32 s32, 0x50
207; FLATSCR-NEXT:    s_getpc_b64 s[0:1]
208; FLATSCR-NEXT:    s_add_u32 s0, s0, external_void_func_byval@rel32@lo+4
209; FLATSCR-NEXT:    s_addc_u32 s1, s1, external_void_func_byval@rel32@hi+12
210; FLATSCR-NEXT:    s_waitcnt vmcnt(7)
211; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s32
212; FLATSCR-NEXT:    s_waitcnt vmcnt(7)
213; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[2:3], s32 offset:8
214; FLATSCR-NEXT:    s_waitcnt vmcnt(7)
215; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[4:5], s32 offset:16
216; FLATSCR-NEXT:    s_waitcnt vmcnt(7)
217; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[6:7], s32 offset:24
218; FLATSCR-NEXT:    s_waitcnt vmcnt(7)
219; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[8:9], s32 offset:32
220; FLATSCR-NEXT:    s_waitcnt vmcnt(7)
221; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[10:11], s32 offset:40
222; FLATSCR-NEXT:    s_waitcnt vmcnt(7)
223; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[12:13], s32 offset:48
224; FLATSCR-NEXT:    s_waitcnt vmcnt(7)
225; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[14:15], s32 offset:56
226; FLATSCR-NEXT:    s_swappc_b64 s[30:31], s[0:1]
227; FLATSCR-NEXT:    s_endpgm
; The memset goes through an i8 pointer to the same alloca; the byval call
; below uses the original [16 x i32] pointer.
228  %alloca = alloca [16 x i32], align 4, addrspace(5)
229  %cast = bitcast [16 x i32] addrspace(5)* %alloca to i8 addrspace(5)*
230  call void @llvm.memset.p5i8.i32(i8 addrspace(5)* align 4 %cast, i8 0, i32 128, i1 false)
231  call void @external_void_func_byval([16 x i32] addrspace(5)* byval([16 x i32]) %alloca)
232  ret void
233}
234
; Non-kernel function that makes the same call as @kernel_caller_stack.
; As in the kernel variant, the expected output stores 9, 10, 11 and 12 at
; offsets 4, 8, 12 and 16 from s32. In addition it spills and reloads v40,
; saves s33, s30 and s31 into lanes 2, 0 and 1 of v40 and restores them after
; the call, moves s32 up before the stores (by 0x400 in the MUBUF run, by 16
; in the FLATSCR run) and back down afterwards, and returns via s_setpc_b64.
235define void @func_caller_stack() {
236; MUBUF-LABEL: func_caller_stack:
237; MUBUF:       ; %bb.0:
238; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
239; MUBUF-NEXT:    s_or_saveexec_b64 s[4:5], -1
240; MUBUF-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
241; MUBUF-NEXT:    s_mov_b64 exec, s[4:5]
242; MUBUF-NEXT:    v_writelane_b32 v40, s33, 2
243; MUBUF-NEXT:    s_mov_b32 s33, s32
244; MUBUF-NEXT:    s_addk_i32 s32, 0x400
245; MUBUF-NEXT:    v_mov_b32_e32 v0, 9
246; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
247; MUBUF-NEXT:    v_mov_b32_e32 v0, 10
248; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
249; MUBUF-NEXT:    v_mov_b32_e32 v0, 11
250; MUBUF-NEXT:    v_writelane_b32 v40, s30, 0
251; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
252; MUBUF-NEXT:    v_mov_b32_e32 v0, 12
253; MUBUF-NEXT:    v_writelane_b32 v40, s31, 1
254; MUBUF-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
255; MUBUF-NEXT:    s_getpc_b64 s[4:5]
256; MUBUF-NEXT:    s_add_u32 s4, s4, external_void_func_v16i32_v16i32_v4i32@rel32@lo+4
257; MUBUF-NEXT:    s_addc_u32 s5, s5, external_void_func_v16i32_v16i32_v4i32@rel32@hi+12
258; MUBUF-NEXT:    s_swappc_b64 s[30:31], s[4:5]
259; MUBUF-NEXT:    v_readlane_b32 s31, v40, 1
260; MUBUF-NEXT:    v_readlane_b32 s30, v40, 0
261; MUBUF-NEXT:    s_addk_i32 s32, 0xfc00
262; MUBUF-NEXT:    v_readlane_b32 s33, v40, 2
263; MUBUF-NEXT:    s_or_saveexec_b64 s[4:5], -1
264; MUBUF-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
265; MUBUF-NEXT:    s_mov_b64 exec, s[4:5]
266; MUBUF-NEXT:    s_waitcnt vmcnt(0)
267; MUBUF-NEXT:    s_setpc_b64 s[30:31]
268;
269; FLATSCR-LABEL: func_caller_stack:
270; FLATSCR:       ; %bb.0:
271; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
272; FLATSCR-NEXT:    s_or_saveexec_b64 s[0:1], -1
273; FLATSCR-NEXT:    scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
274; FLATSCR-NEXT:    s_mov_b64 exec, s[0:1]
275; FLATSCR-NEXT:    v_writelane_b32 v40, s33, 2
276; FLATSCR-NEXT:    s_mov_b32 s33, s32
277; FLATSCR-NEXT:    s_add_i32 s32, s32, 16
278; FLATSCR-NEXT:    v_mov_b32_e32 v0, 9
279; FLATSCR-NEXT:    scratch_store_dword off, v0, s32 offset:4
280; FLATSCR-NEXT:    v_mov_b32_e32 v0, 10
281; FLATSCR-NEXT:    scratch_store_dword off, v0, s32 offset:8
282; FLATSCR-NEXT:    v_mov_b32_e32 v0, 11
283; FLATSCR-NEXT:    v_writelane_b32 v40, s30, 0
284; FLATSCR-NEXT:    scratch_store_dword off, v0, s32 offset:12
285; FLATSCR-NEXT:    v_mov_b32_e32 v0, 12
286; FLATSCR-NEXT:    v_writelane_b32 v40, s31, 1
287; FLATSCR-NEXT:    scratch_store_dword off, v0, s32 offset:16
288; FLATSCR-NEXT:    s_getpc_b64 s[0:1]
289; FLATSCR-NEXT:    s_add_u32 s0, s0, external_void_func_v16i32_v16i32_v4i32@rel32@lo+4
290; FLATSCR-NEXT:    s_addc_u32 s1, s1, external_void_func_v16i32_v16i32_v4i32@rel32@hi+12
291; FLATSCR-NEXT:    s_swappc_b64 s[30:31], s[0:1]
292; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 1
293; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 0
294; FLATSCR-NEXT:    s_add_i32 s32, s32, -16
295; FLATSCR-NEXT:    v_readlane_b32 s33, v40, 2
296; FLATSCR-NEXT:    s_or_saveexec_b64 s[0:1], -1
297; FLATSCR-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
298; FLATSCR-NEXT:    s_mov_b64 exec, s[0:1]
299; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
300; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
; Same operands as the call in @kernel_caller_stack.
301  call void @external_void_func_v16i32_v16i32_v4i32(<16 x i32> undef, <16 x i32> undef, <4 x i32> <i32 9, i32 10, i32 11, i32 12>)
302  ret void
303}
304
; Non-kernel function that forwards its incoming addrspace(5) pointer %argptr
; as a byval([16 x i32]) argument to @external_void_func_byval.
; The expected output copies 64 bytes from the address held in v0 to s32 + 0
; onwards before the call: as pairs of dword loads/stores in the MUBUF run and
; as dwordx2 loads/stores in the FLATSCR run. The surrounding v40 spill,
; s33/s30/s31 lane saves and s32 adjustment match @func_caller_stack.
; NOTE(review): %cast in the body is never used. It looks like a leftover;
; removing it should not affect the output, but verify by regenerating the
; assertions rather than editing by hand.
305define void @func_caller_byval([16 x i32] addrspace(5)* %argptr) {
306; MUBUF-LABEL: func_caller_byval:
307; MUBUF:       ; %bb.0:
308; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
309; MUBUF-NEXT:    s_or_saveexec_b64 s[4:5], -1
310; MUBUF-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
311; MUBUF-NEXT:    s_mov_b64 exec, s[4:5]
312; MUBUF-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen
313; MUBUF-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen offset:4
314; MUBUF-NEXT:    v_writelane_b32 v40, s33, 2
315; MUBUF-NEXT:    s_mov_b32 s33, s32
316; MUBUF-NEXT:    s_addk_i32 s32, 0x400
317; MUBUF-NEXT:    v_writelane_b32 v40, s30, 0
318; MUBUF-NEXT:    v_writelane_b32 v40, s31, 1
319; MUBUF-NEXT:    s_getpc_b64 s[4:5]
320; MUBUF-NEXT:    s_add_u32 s4, s4, external_void_func_byval@rel32@lo+4
321; MUBUF-NEXT:    s_addc_u32 s5, s5, external_void_func_byval@rel32@hi+12
322; MUBUF-NEXT:    s_waitcnt vmcnt(1)
323; MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32
324; MUBUF-NEXT:    s_waitcnt vmcnt(1)
325; MUBUF-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:4
326; MUBUF-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen offset:8
327; MUBUF-NEXT:    s_nop 0
328; MUBUF-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen offset:12
329; MUBUF-NEXT:    s_waitcnt vmcnt(1)
330; MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:8
331; MUBUF-NEXT:    s_waitcnt vmcnt(1)
332; MUBUF-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:12
333; MUBUF-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen offset:16
334; MUBUF-NEXT:    s_nop 0
335; MUBUF-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen offset:20
336; MUBUF-NEXT:    s_waitcnt vmcnt(1)
337; MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:16
338; MUBUF-NEXT:    s_waitcnt vmcnt(1)
339; MUBUF-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:20
340; MUBUF-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen offset:24
341; MUBUF-NEXT:    s_nop 0
342; MUBUF-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen offset:28
343; MUBUF-NEXT:    s_waitcnt vmcnt(1)
344; MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:24
345; MUBUF-NEXT:    s_waitcnt vmcnt(1)
346; MUBUF-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:28
347; MUBUF-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen offset:32
348; MUBUF-NEXT:    s_nop 0
349; MUBUF-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen offset:36
350; MUBUF-NEXT:    s_waitcnt vmcnt(1)
351; MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:32
352; MUBUF-NEXT:    s_waitcnt vmcnt(1)
353; MUBUF-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:36
354; MUBUF-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen offset:40
355; MUBUF-NEXT:    s_nop 0
356; MUBUF-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen offset:44
357; MUBUF-NEXT:    s_waitcnt vmcnt(1)
358; MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:40
359; MUBUF-NEXT:    s_waitcnt vmcnt(1)
360; MUBUF-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:44
361; MUBUF-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen offset:48
362; MUBUF-NEXT:    s_nop 0
363; MUBUF-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen offset:52
364; MUBUF-NEXT:    s_waitcnt vmcnt(1)
365; MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:48
366; MUBUF-NEXT:    s_waitcnt vmcnt(1)
367; MUBUF-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:52
368; MUBUF-NEXT:    buffer_load_dword v1, v0, s[0:3], 0 offen offset:56
369; MUBUF-NEXT:    s_nop 0
370; MUBUF-NEXT:    buffer_load_dword v2, v0, s[0:3], 0 offen offset:60
371; MUBUF-NEXT:    s_waitcnt vmcnt(1)
372; MUBUF-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:56
373; MUBUF-NEXT:    s_waitcnt vmcnt(1)
374; MUBUF-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:60
375; MUBUF-NEXT:    s_swappc_b64 s[30:31], s[4:5]
376; MUBUF-NEXT:    v_readlane_b32 s31, v40, 1
377; MUBUF-NEXT:    v_readlane_b32 s30, v40, 0
378; MUBUF-NEXT:    s_addk_i32 s32, 0xfc00
379; MUBUF-NEXT:    v_readlane_b32 s33, v40, 2
380; MUBUF-NEXT:    s_or_saveexec_b64 s[4:5], -1
381; MUBUF-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
382; MUBUF-NEXT:    s_mov_b64 exec, s[4:5]
383; MUBUF-NEXT:    s_waitcnt vmcnt(0)
384; MUBUF-NEXT:    s_setpc_b64 s[30:31]
385;
386; FLATSCR-LABEL: func_caller_byval:
387; FLATSCR:       ; %bb.0:
388; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
389; FLATSCR-NEXT:    s_or_saveexec_b64 s[0:1], -1
390; FLATSCR-NEXT:    scratch_store_dword off, v40, s32 ; 4-byte Folded Spill
391; FLATSCR-NEXT:    s_mov_b64 exec, s[0:1]
392; FLATSCR-NEXT:    scratch_load_dwordx2 v[1:2], v0, off
393; FLATSCR-NEXT:    v_writelane_b32 v40, s33, 2
394; FLATSCR-NEXT:    s_mov_b32 s33, s32
395; FLATSCR-NEXT:    s_add_i32 s32, s32, 16
396; FLATSCR-NEXT:    v_writelane_b32 v40, s30, 0
397; FLATSCR-NEXT:    v_writelane_b32 v40, s31, 1
398; FLATSCR-NEXT:    s_getpc_b64 s[0:1]
399; FLATSCR-NEXT:    s_add_u32 s0, s0, external_void_func_byval@rel32@lo+4
400; FLATSCR-NEXT:    s_addc_u32 s1, s1, external_void_func_byval@rel32@hi+12
401; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
402; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[1:2], s32
403; FLATSCR-NEXT:    scratch_load_dwordx2 v[1:2], v0, off offset:8
404; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
405; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[1:2], s32 offset:8
406; FLATSCR-NEXT:    scratch_load_dwordx2 v[1:2], v0, off offset:16
407; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
408; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[1:2], s32 offset:16
409; FLATSCR-NEXT:    scratch_load_dwordx2 v[1:2], v0, off offset:24
410; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
411; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[1:2], s32 offset:24
412; FLATSCR-NEXT:    scratch_load_dwordx2 v[1:2], v0, off offset:32
413; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
414; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[1:2], s32 offset:32
415; FLATSCR-NEXT:    scratch_load_dwordx2 v[1:2], v0, off offset:40
416; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
417; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[1:2], s32 offset:40
418; FLATSCR-NEXT:    scratch_load_dwordx2 v[1:2], v0, off offset:48
419; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
420; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[1:2], s32 offset:48
421; FLATSCR-NEXT:    scratch_load_dwordx2 v[0:1], v0, off offset:56
422; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
423; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[0:1], s32 offset:56
424; FLATSCR-NEXT:    s_swappc_b64 s[30:31], s[0:1]
425; FLATSCR-NEXT:    v_readlane_b32 s31, v40, 1
426; FLATSCR-NEXT:    v_readlane_b32 s30, v40, 0
427; FLATSCR-NEXT:    s_add_i32 s32, s32, -16
428; FLATSCR-NEXT:    v_readlane_b32 s33, v40, 2
429; FLATSCR-NEXT:    s_or_saveexec_b64 s[0:1], -1
430; FLATSCR-NEXT:    scratch_load_dword v40, off, s32 ; 4-byte Folded Reload
431; FLATSCR-NEXT:    s_mov_b64 exec, s[0:1]
432; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
433; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
; The call passes %argptr directly; %cast has no users.
434  %cast = bitcast [16 x i32] addrspace(5)* %argptr to i8 addrspace(5)*
435  call void @external_void_func_byval([16 x i32] addrspace(5)* byval([16 x i32]) %argptr)
436  ret void
437}
438
; memset intrinsic used by @kernel_caller_byval to zero-fill its alloca.
439declare void @llvm.memset.p5i8.i32(i8 addrspace(5)* nocapture writeonly, i8, i32, i1 immarg) #1
440
; Attribute group #0 is attached to the two external callee declarations;
; group #1 is attached to the memset intrinsic declaration.
441attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
442attributes #1 = { argmemonly nofree nounwind willreturn writeonly }
443