1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck --check-prefix=GFX803 %s
3; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX900 %s
4; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX1010 %s
5
; Baseline: a kernel with an empty body and no stack objects or calls.
; The checks for every target expect s_endpgm as the only instruction.
6define amdgpu_kernel void @test_kern_empty() local_unnamed_addr #0 {
7; GFX803-LABEL: test_kern_empty:
8; GFX803:       ; %bb.0: ; %entry
9; GFX803-NEXT:    s_endpgm
10;
11; GFX900-LABEL: test_kern_empty:
12; GFX900:       ; %bb.0: ; %entry
13; GFX900-NEXT:    s_endpgm
14;
15; GFX1010-LABEL: test_kern_empty:
16; GFX1010:       ; %bb.0: ; %entry
17; GFX1010-NEXT:    s_endpgm
18entry:
19  ret void
20}
21
; Kernel with one private (addrspace(5)) stack object and no call.
; The checks cover the emitted prologue (flat_scratch setup and the s[0:1]
; adjustment) and expect the volatile store at offset:4 with a constant 0 as
; the SGPR offset operand.
22define amdgpu_kernel void @test_kern_stack() local_unnamed_addr #0 {
23; GFX803-LABEL: test_kern_stack:
24; GFX803:       ; %bb.0: ; %entry
25; GFX803-NEXT:    s_add_u32 s4, s4, s7
26; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
27; GFX803-NEXT:    s_add_u32 s0, s0, s7
28; GFX803-NEXT:    s_addc_u32 s1, s1, 0
29; GFX803-NEXT:    v_mov_b32_e32 v0, 0
30; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s5
31; GFX803-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
32; GFX803-NEXT:    s_endpgm
33;
34; GFX900-LABEL: test_kern_stack:
35; GFX900:       ; %bb.0: ; %entry
36; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
37; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
38; GFX900-NEXT:    s_add_u32 s0, s0, s7
39; GFX900-NEXT:    s_addc_u32 s1, s1, 0
40; GFX900-NEXT:    v_mov_b32_e32 v0, 0
41; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
42; GFX900-NEXT:    s_endpgm
43;
44; GFX1010-LABEL: test_kern_stack:
45; GFX1010:       ; %bb.0: ; %entry
46; GFX1010-NEXT:    s_add_u32 s4, s4, s7
47; GFX1010-NEXT:    s_addc_u32 s5, s5, 0
48; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
49; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
50; GFX1010-NEXT:    s_add_u32 s0, s0, s7
51; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
52; GFX1010-NEXT:    v_mov_b32_e32 v0, 0
53; GFX1010-NEXT:    ; implicit-def: $vcc_hi
54; GFX1010-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
55; GFX1010-NEXT:    s_endpgm
56entry:
57  %x = alloca i32, align 4, addrspace(5)
58  store volatile i32 0, i32 addrspace(5)* %x, align 4
59  ret void
60}
61
; Kernel that calls @ex and has no stack objects of its own.
; The checks expect s32 to be initialized to 0 before the s_swappc_b64 call,
; and the callee address to be formed with s_getpc_b64 plus ex@rel32 offsets.
62define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 {
63; GFX803-LABEL: test_kern_call:
64; GFX803:       ; %bb.0: ; %entry
65; GFX803-NEXT:    s_add_u32 s4, s4, s7
66; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
67; GFX803-NEXT:    s_add_u32 s0, s0, s7
68; GFX803-NEXT:    s_addc_u32 s1, s1, 0
69; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s5
70; GFX803-NEXT:    s_getpc_b64 s[4:5]
71; GFX803-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
72; GFX803-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+4
73; GFX803-NEXT:    s_mov_b32 s32, 0
74; GFX803-NEXT:    s_swappc_b64 s[30:31], s[4:5]
75; GFX803-NEXT:    s_endpgm
76;
77; GFX900-LABEL: test_kern_call:
78; GFX900:       ; %bb.0: ; %entry
79; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
80; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
81; GFX900-NEXT:    s_add_u32 s0, s0, s7
82; GFX900-NEXT:    s_addc_u32 s1, s1, 0
83; GFX900-NEXT:    s_getpc_b64 s[4:5]
84; GFX900-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
85; GFX900-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+4
86; GFX900-NEXT:    s_mov_b32 s32, 0
87; GFX900-NEXT:    s_swappc_b64 s[30:31], s[4:5]
88; GFX900-NEXT:    s_endpgm
89;
90; GFX1010-LABEL: test_kern_call:
91; GFX1010:       ; %bb.0: ; %entry
92; GFX1010-NEXT:    s_add_u32 s4, s4, s7
93; GFX1010-NEXT:    s_mov_b32 s32, 0
94; GFX1010-NEXT:    s_addc_u32 s5, s5, 0
95; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
96; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
97; GFX1010-NEXT:    s_add_u32 s0, s0, s7
98; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
99; GFX1010-NEXT:    s_getpc_b64 s[4:5]
100; GFX1010-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
101; GFX1010-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+4
102; GFX1010-NEXT:    ; implicit-def: $vcc_hi
103; GFX1010-NEXT:    s_swappc_b64 s[30:31], s[4:5]
104; GFX1010-NEXT:    s_endpgm
105entry:
106  tail call void @ex() #0
107  ret void
108}
109
; Kernel with both a private stack object and a call to @ex.
; The checks expect the volatile store at offset:4 and s32 set to 0x400
; (GFX803/GFX900) or 0x200 (GFX1010) before the s_swappc_b64 call.
110define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
111; GFX803-LABEL: test_kern_stack_and_call:
112; GFX803:       ; %bb.0: ; %entry
113; GFX803-NEXT:    s_add_u32 s4, s4, s7
114; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
115; GFX803-NEXT:    s_add_u32 s0, s0, s7
116; GFX803-NEXT:    s_addc_u32 s1, s1, 0
117; GFX803-NEXT:    v_mov_b32_e32 v0, 0
118; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s5
119; GFX803-NEXT:    s_getpc_b64 s[4:5]
120; GFX803-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
121; GFX803-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+4
122; GFX803-NEXT:    s_movk_i32 s32, 0x400
123; GFX803-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
124; GFX803-NEXT:    s_swappc_b64 s[30:31], s[4:5]
125; GFX803-NEXT:    s_endpgm
126;
127; GFX900-LABEL: test_kern_stack_and_call:
128; GFX900:       ; %bb.0: ; %entry
129; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
130; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
131; GFX900-NEXT:    s_add_u32 s0, s0, s7
132; GFX900-NEXT:    s_addc_u32 s1, s1, 0
133; GFX900-NEXT:    v_mov_b32_e32 v0, 0
134; GFX900-NEXT:    s_getpc_b64 s[4:5]
135; GFX900-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
136; GFX900-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+4
137; GFX900-NEXT:    s_movk_i32 s32, 0x400
138; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
139; GFX900-NEXT:    s_swappc_b64 s[30:31], s[4:5]
140; GFX900-NEXT:    s_endpgm
141;
142; GFX1010-LABEL: test_kern_stack_and_call:
143; GFX1010:       ; %bb.0: ; %entry
144; GFX1010-NEXT:    s_add_u32 s4, s4, s7
145; GFX1010-NEXT:    s_movk_i32 s32, 0x200
146; GFX1010-NEXT:    s_addc_u32 s5, s5, 0
147; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
148; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
149; GFX1010-NEXT:    s_add_u32 s0, s0, s7
150; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
151; GFX1010-NEXT:    v_mov_b32_e32 v0, 0
152; GFX1010-NEXT:    s_getpc_b64 s[4:5]
153; GFX1010-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
154; GFX1010-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+4
155; GFX1010-NEXT:    ; implicit-def: $vcc_hi
156; GFX1010-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
157; GFX1010-NEXT:    s_swappc_b64 s[30:31], s[4:5]
158; GFX1010-NEXT:    s_endpgm
159entry:
160  %x = alloca i32, align 4, addrspace(5)
161  store volatile i32 0, i32 addrspace(5)* %x, align 4
162  tail call void @ex() #0
163  ret void
164}
165
; Empty kernel compiled with attribute group #2 ("frame-pointer"="all").
; The checks for every target expect s33 to be set to 0 before s_endpgm, even
; though the body has no stack objects or calls.
166define amdgpu_kernel void @test_force_fp_kern_empty() local_unnamed_addr #2 {
167; GFX803-LABEL: test_force_fp_kern_empty:
168; GFX803:       ; %bb.0: ; %entry
169; GFX803-NEXT:    s_mov_b32 s33, 0
170; GFX803-NEXT:    s_endpgm
171;
172; GFX900-LABEL: test_force_fp_kern_empty:
173; GFX900:       ; %bb.0: ; %entry
174; GFX900-NEXT:    s_mov_b32 s33, 0
175; GFX900-NEXT:    s_endpgm
176;
177; GFX1010-LABEL: test_force_fp_kern_empty:
178; GFX1010:       ; %bb.0: ; %entry
179; GFX1010-NEXT:    s_mov_b32 s33, 0
180; GFX1010-NEXT:    s_endpgm
181entry:
182  ret void
183}
184
; Kernel with one private stack object, compiled with attribute group #2
; ("frame-pointer"="all").
; The checks expect s33 to be set to 0 and then used as the SGPR offset operand
; of the volatile store (s33 offset:4).
185define amdgpu_kernel void @test_force_fp_kern_stack() local_unnamed_addr #2 {
186; GFX803-LABEL: test_force_fp_kern_stack:
187; GFX803:       ; %bb.0: ; %entry
188; GFX803-NEXT:    s_add_u32 s4, s4, s7
189; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
190; GFX803-NEXT:    s_add_u32 s0, s0, s7
191; GFX803-NEXT:    s_mov_b32 s33, 0
192; GFX803-NEXT:    s_addc_u32 s1, s1, 0
193; GFX803-NEXT:    v_mov_b32_e32 v0, 0
194; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s5
195; GFX803-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4
196; GFX803-NEXT:    s_endpgm
197;
198; GFX900-LABEL: test_force_fp_kern_stack:
199; GFX900:       ; %bb.0: ; %entry
200; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
201; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
202; GFX900-NEXT:    s_add_u32 s0, s0, s7
203; GFX900-NEXT:    s_mov_b32 s33, 0
204; GFX900-NEXT:    s_addc_u32 s1, s1, 0
205; GFX900-NEXT:    v_mov_b32_e32 v0, 0
206; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4
207; GFX900-NEXT:    s_endpgm
208;
209; GFX1010-LABEL: test_force_fp_kern_stack:
210; GFX1010:       ; %bb.0: ; %entry
211; GFX1010-NEXT:    s_add_u32 s4, s4, s7
212; GFX1010-NEXT:    s_mov_b32 s33, 0
213; GFX1010-NEXT:    s_addc_u32 s5, s5, 0
214; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
215; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
216; GFX1010-NEXT:    s_add_u32 s0, s0, s7
217; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
218; GFX1010-NEXT:    v_mov_b32_e32 v0, 0
219; GFX1010-NEXT:    ; implicit-def: $vcc_hi
220; GFX1010-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4
221; GFX1010-NEXT:    s_endpgm
222entry:
223  %x = alloca i32, align 4, addrspace(5)
224  store volatile i32 0, i32 addrspace(5)* %x, align 4
225  ret void
226}
227
; Kernel that only calls @ex, compiled with attribute group #2
; ("frame-pointer"="all").
; The checks expect both s32 and s33 to be set to 0 before the s_swappc_b64
; call.
228define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 {
229; GFX803-LABEL: test_force_fp_kern_call:
230; GFX803:       ; %bb.0: ; %entry
231; GFX803-NEXT:    s_add_u32 s4, s4, s7
232; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
233; GFX803-NEXT:    s_add_u32 s0, s0, s7
234; GFX803-NEXT:    s_addc_u32 s1, s1, 0
235; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s5
236; GFX803-NEXT:    s_getpc_b64 s[4:5]
237; GFX803-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
238; GFX803-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+4
239; GFX803-NEXT:    s_mov_b32 s32, 0
240; GFX803-NEXT:    s_mov_b32 s33, 0
241; GFX803-NEXT:    s_swappc_b64 s[30:31], s[4:5]
242; GFX803-NEXT:    s_endpgm
243;
244; GFX900-LABEL: test_force_fp_kern_call:
245; GFX900:       ; %bb.0: ; %entry
246; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
247; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
248; GFX900-NEXT:    s_add_u32 s0, s0, s7
249; GFX900-NEXT:    s_addc_u32 s1, s1, 0
250; GFX900-NEXT:    s_getpc_b64 s[4:5]
251; GFX900-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
252; GFX900-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+4
253; GFX900-NEXT:    s_mov_b32 s32, 0
254; GFX900-NEXT:    s_mov_b32 s33, 0
255; GFX900-NEXT:    s_swappc_b64 s[30:31], s[4:5]
256; GFX900-NEXT:    s_endpgm
257;
258; GFX1010-LABEL: test_force_fp_kern_call:
259; GFX1010:       ; %bb.0: ; %entry
260; GFX1010-NEXT:    s_add_u32 s4, s4, s7
261; GFX1010-NEXT:    s_mov_b32 s32, 0
262; GFX1010-NEXT:    s_mov_b32 s33, 0
263; GFX1010-NEXT:    s_addc_u32 s5, s5, 0
264; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
265; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
266; GFX1010-NEXT:    s_add_u32 s0, s0, s7
267; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
268; GFX1010-NEXT:    s_getpc_b64 s[4:5]
269; GFX1010-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
270; GFX1010-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+4
271; GFX1010-NEXT:    ; implicit-def: $vcc_hi
272; GFX1010-NEXT:    s_swappc_b64 s[30:31], s[4:5]
273; GFX1010-NEXT:    s_endpgm
274entry:
275  tail call void @ex() #2
276  ret void
277}
278
; Kernel with a private stack object and a call to @ex, compiled with attribute
; group #2 ("frame-pointer"="all").
; The checks expect s33 = 0 used as the SGPR offset of the volatile store, and
; s32 set to 0x400 (GFX803/GFX900) or 0x200 (GFX1010) before the call.
279define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_addr #2 {
280; GFX803-LABEL: test_force_fp_kern_stack_and_call:
281; GFX803:       ; %bb.0: ; %entry
282; GFX803-NEXT:    s_add_u32 s4, s4, s7
283; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
284; GFX803-NEXT:    s_add_u32 s0, s0, s7
285; GFX803-NEXT:    s_mov_b32 s33, 0
286; GFX803-NEXT:    s_addc_u32 s1, s1, 0
287; GFX803-NEXT:    v_mov_b32_e32 v0, 0
288; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s5
289; GFX803-NEXT:    s_getpc_b64 s[4:5]
290; GFX803-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
291; GFX803-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+4
292; GFX803-NEXT:    s_movk_i32 s32, 0x400
293; GFX803-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4
294; GFX803-NEXT:    s_swappc_b64 s[30:31], s[4:5]
295; GFX803-NEXT:    s_endpgm
296;
297; GFX900-LABEL: test_force_fp_kern_stack_and_call:
298; GFX900:       ; %bb.0: ; %entry
299; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
300; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
301; GFX900-NEXT:    s_add_u32 s0, s0, s7
302; GFX900-NEXT:    s_addc_u32 s1, s1, 0
303; GFX900-NEXT:    s_mov_b32 s33, 0
304; GFX900-NEXT:    v_mov_b32_e32 v0, 0
305; GFX900-NEXT:    s_getpc_b64 s[4:5]
306; GFX900-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
307; GFX900-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+4
308; GFX900-NEXT:    s_movk_i32 s32, 0x400
309; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4
310; GFX900-NEXT:    s_swappc_b64 s[30:31], s[4:5]
311; GFX900-NEXT:    s_endpgm
312;
313; GFX1010-LABEL: test_force_fp_kern_stack_and_call:
314; GFX1010:       ; %bb.0: ; %entry
315; GFX1010-NEXT:    s_add_u32 s4, s4, s7
316; GFX1010-NEXT:    s_movk_i32 s32, 0x200
317; GFX1010-NEXT:    s_mov_b32 s33, 0
318; GFX1010-NEXT:    s_addc_u32 s5, s5, 0
319; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
320; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
321; GFX1010-NEXT:    s_add_u32 s0, s0, s7
322; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
323; GFX1010-NEXT:    v_mov_b32_e32 v0, 0
324; GFX1010-NEXT:    s_getpc_b64 s[4:5]
325; GFX1010-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
326; GFX1010-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+4
327; GFX1010-NEXT:    ; implicit-def: $vcc_hi
328; GFX1010-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4
329; GFX1010-NEXT:    s_swappc_b64 s[30:31], s[4:5]
330; GFX1010-NEXT:    s_endpgm
331entry:
332  %x = alloca i32, align 4, addrspace(5)
333  store volatile i32 0, i32 addrspace(5)* %x, align 4
334  tail call void @ex() #2
335  ret void
336}
337
; Kernel (attribute group #1, "amdgpu-num-vgpr"="8") whose VGPR spill slot sits
; behind a 4092-byte private alloca. The checks expect the spill/reload to use
; an SGPR holding the offset (0x40000 on GFX803/GFX900, 0x20000 on GFX1010)
; rather than an immediate offset on the buffer instruction.
338define amdgpu_kernel void @test_sgpr_offset_kernel() #1 {
339; GFX803-LABEL: test_sgpr_offset_kernel:
340; GFX803:       ; %bb.0: ; %entry
341; GFX803-NEXT:    s_add_u32 s4, s4, s7
342; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
343; GFX803-NEXT:    s_add_u32 s0, s0, s7
344; GFX803-NEXT:    s_addc_u32 s1, s1, 0
345; GFX803-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8
346; GFX803-NEXT:    s_mov_b32 s4, 0x40000
347; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s5
348; GFX803-NEXT:    s_waitcnt vmcnt(0)
349; GFX803-NEXT:    buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
350; GFX803-NEXT:    ;;#ASMSTART
351; GFX803-NEXT:    ;;#ASMEND
352; GFX803-NEXT:    s_mov_b32 s4, 0x40000
353; GFX803-NEXT:    buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
354; GFX803-NEXT:    s_waitcnt vmcnt(0)
355; GFX803-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:8
356; GFX803-NEXT:    s_endpgm
357;
358; GFX900-LABEL: test_sgpr_offset_kernel:
359; GFX900:       ; %bb.0: ; %entry
360; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
361; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
362; GFX900-NEXT:    s_add_u32 s0, s0, s7
363; GFX900-NEXT:    s_addc_u32 s1, s1, 0
364; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8
365; GFX900-NEXT:    s_mov_b32 s6, 0x40000
366; GFX900-NEXT:    s_waitcnt vmcnt(0)
367; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
368; GFX900-NEXT:    ;;#ASMSTART
369; GFX900-NEXT:    ;;#ASMEND
370; GFX900-NEXT:    s_mov_b32 s6, 0x40000
371; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload
372; GFX900-NEXT:    s_waitcnt vmcnt(0)
373; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:8
374; GFX900-NEXT:    s_endpgm
375;
376; GFX1010-LABEL: test_sgpr_offset_kernel:
377; GFX1010:       ; %bb.0: ; %entry
378; GFX1010-NEXT:    s_add_u32 s4, s4, s7
379; GFX1010-NEXT:    s_addc_u32 s5, s5, 0
380; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
381; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
382; GFX1010-NEXT:    s_add_u32 s0, s0, s7
383; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
384; GFX1010-NEXT:    s_mov_b32 s6, 0x20000
385; GFX1010-NEXT:    ; implicit-def: $vcc_hi
386; GFX1010-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8
387; GFX1010-NEXT:    s_waitcnt vmcnt(0)
388; GFX1010-NEXT:    buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
389; GFX1010-NEXT:    v_nop
390; GFX1010-NEXT:    s_mov_b32 s6, 0x20000
391; GFX1010-NEXT:    ;;#ASMSTART
392; GFX1010-NEXT:    ;;#ASMEND
393; GFX1010-NEXT:    buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload
394; GFX1010-NEXT:    s_waitcnt vmcnt(0)
395; GFX1010-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:8
396; GFX1010-NEXT:    s_endpgm
397entry:
398  ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
399  ; fit in the instruction, and has to live in the SGPR offset.
400  %alloca = alloca i8, i32 4092, align 4, addrspace(5)
401  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
402
403  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
404  ; 0x40000 / 64 = 4096 (for wave64)
405  ; The GFX1010 checks use 0x20000 instead (presumably 4096 * 32 for wave32 - TODO confirm).
406  ; The SGPR spill offset is verified by the autogenerated "Folded Spill" / "Folded Reload" lines in the check blocks above.
407  %a = load volatile i32, i32 addrspace(5)* %aptr
408
409  ; Force %a to spill
410  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
411
412  %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
413  store volatile i32 %a, i32 addrspace(5)* %outptr
414
415  ret void
416}
417
; External callee used by the *_call tests; only declared here, never defined.
418declare hidden void @ex() local_unnamed_addr #0
419
; #0: default attributes for the plain test_kern_* kernels and for @ex.
420attributes #0 = { nounwind }
; #1: caps the kernel at 8 VGPRs; used by test_sgpr_offset_kernel, whose inline
; asm clobbers v0-v7 to force a spill.
421attributes #1 = { nounwind "amdgpu-num-vgpr"="8" }
; #2: requests a frame pointer everywhere; used by the test_force_fp_* kernels.
422attributes #2 = { nounwind "frame-pointer"="all" }
423