1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck --check-prefix=GFX803 %s
3; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX900 %s
4; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX1010 %s
5
; Baseline case: a kernel whose body is just `ret void`. On all three targets
; the expected output is a lone s_endpgm, i.e. no flat_scratch or s[0:3]
; setup is emitted when the kernel has neither stack objects nor calls.
6define amdgpu_kernel void @test_kern_empty() local_unnamed_addr #0 {
7; GFX803-LABEL: test_kern_empty:
8; GFX803:       ; %bb.0: ; %entry
9; GFX803-NEXT:    s_endpgm
10;
11; GFX900-LABEL: test_kern_empty:
12; GFX900:       ; %bb.0: ; %entry
13; GFX900-NEXT:    s_endpgm
14;
15; GFX1010-LABEL: test_kern_empty:
16; GFX1010:       ; %bb.0: ; %entry
17; GFX1010-NEXT:    s_endpgm
18entry:
19  ret void
20}
21
; Kernel with one private stack object and no call. The expected prologue
; initializes flat_scratch (registers on gfx803/gfx900, s_setreg_b32 on
; gfx1010) and adds s7 into s[0:1]; the slot is then written with
; buffer_store_dword using a zero SGPR offset and immediate offset:4.
22define amdgpu_kernel void @test_kern_stack() local_unnamed_addr #0 {
23; GFX803-LABEL: test_kern_stack:
24; GFX803:       ; %bb.0: ; %entry
25; GFX803-NEXT:    s_add_i32 s4, s4, s7
26; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
27; GFX803-NEXT:    s_add_u32 s0, s0, s7
28; GFX803-NEXT:    s_addc_u32 s1, s1, 0
29; GFX803-NEXT:    v_mov_b32_e32 v0, 0
30; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s5
31; GFX803-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
32; GFX803-NEXT:    s_waitcnt vmcnt(0)
33; GFX803-NEXT:    s_endpgm
34;
35; GFX900-LABEL: test_kern_stack:
36; GFX900:       ; %bb.0: ; %entry
37; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
38; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
39; GFX900-NEXT:    s_add_u32 s0, s0, s7
40; GFX900-NEXT:    s_addc_u32 s1, s1, 0
41; GFX900-NEXT:    v_mov_b32_e32 v0, 0
42; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
43; GFX900-NEXT:    s_waitcnt vmcnt(0)
44; GFX900-NEXT:    s_endpgm
45;
46; GFX1010-LABEL: test_kern_stack:
47; GFX1010:       ; %bb.0: ; %entry
48; GFX1010-NEXT:    s_add_u32 s4, s4, s7
49; GFX1010-NEXT:    s_addc_u32 s5, s5, 0
50; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
51; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
52; GFX1010-NEXT:    v_mov_b32_e32 v0, 0
53; GFX1010-NEXT:    s_add_u32 s0, s0, s7
54; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
55; GFX1010-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
56; GFX1010-NEXT:    s_waitcnt_vscnt null, 0x0
57; GFX1010-NEXT:    s_endpgm
58entry:
  ; A 4-byte private (addrspace(5)) slot written by a volatile store; this is
  ; the only stack object in the kernel.
59  %x = alloca i32, align 4, addrspace(5)
60  store volatile i32 0, i32 addrspace(5)* %x, align 4
61  ret void
62}
63
; Kernel that calls the external function @ex but has no stack object of its
; own. The expected output still performs the flat_scratch / s[0:1] setup,
; sets s32 to 0 before the call, forms the callee address with s_getpc_b64
; plus the ex@rel32@lo/hi relocations, and calls through s_swappc_b64.
64define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 {
65; GFX803-LABEL: test_kern_call:
66; GFX803:       ; %bb.0: ; %entry
67; GFX803-NEXT:    s_add_i32 s4, s4, s7
68; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
69; GFX803-NEXT:    s_add_u32 s0, s0, s7
70; GFX803-NEXT:    s_addc_u32 s1, s1, 0
71; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s5
72; GFX803-NEXT:    s_getpc_b64 s[4:5]
73; GFX803-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
74; GFX803-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+12
75; GFX803-NEXT:    s_mov_b32 s32, 0
76; GFX803-NEXT:    s_swappc_b64 s[30:31], s[4:5]
77; GFX803-NEXT:    s_endpgm
78;
79; GFX900-LABEL: test_kern_call:
80; GFX900:       ; %bb.0: ; %entry
81; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
82; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
83; GFX900-NEXT:    s_add_u32 s0, s0, s7
84; GFX900-NEXT:    s_addc_u32 s1, s1, 0
85; GFX900-NEXT:    s_getpc_b64 s[4:5]
86; GFX900-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
87; GFX900-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+12
88; GFX900-NEXT:    s_mov_b32 s32, 0
89; GFX900-NEXT:    s_swappc_b64 s[30:31], s[4:5]
90; GFX900-NEXT:    s_endpgm
91;
92; GFX1010-LABEL: test_kern_call:
93; GFX1010:       ; %bb.0: ; %entry
94; GFX1010-NEXT:    s_add_u32 s4, s4, s7
95; GFX1010-NEXT:    s_mov_b32 s32, 0
96; GFX1010-NEXT:    s_addc_u32 s5, s5, 0
97; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
98; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
99; GFX1010-NEXT:    s_add_u32 s0, s0, s7
100; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
101; GFX1010-NEXT:    s_getpc_b64 s[4:5]
102; GFX1010-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
103; GFX1010-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+12
104; GFX1010-NEXT:    s_swappc_b64 s[30:31], s[4:5]
105; GFX1010-NEXT:    s_endpgm
106entry:
107  tail call void @ex() #0
108  ret void
109}
110
; Combination of the two previous cases: one private stack object plus a call
; to @ex. Compared with test_kern_call, the expected value written to s32
; before the call is non-zero (0x400 on gfx803/gfx900, 0x200 on gfx1010), and
; the store to the local slot is completed (waitcnt) before s_swappc_b64.
111define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
112; GFX803-LABEL: test_kern_stack_and_call:
113; GFX803:       ; %bb.0: ; %entry
114; GFX803-NEXT:    s_add_i32 s4, s4, s7
115; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
116; GFX803-NEXT:    s_add_u32 s0, s0, s7
117; GFX803-NEXT:    s_addc_u32 s1, s1, 0
118; GFX803-NEXT:    v_mov_b32_e32 v0, 0
119; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s5
120; GFX803-NEXT:    s_getpc_b64 s[4:5]
121; GFX803-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
122; GFX803-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+12
123; GFX803-NEXT:    s_movk_i32 s32, 0x400
124; GFX803-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
125; GFX803-NEXT:    s_waitcnt vmcnt(0)
126; GFX803-NEXT:    s_swappc_b64 s[30:31], s[4:5]
127; GFX803-NEXT:    s_endpgm
128;
129; GFX900-LABEL: test_kern_stack_and_call:
130; GFX900:       ; %bb.0: ; %entry
131; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
132; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
133; GFX900-NEXT:    s_add_u32 s0, s0, s7
134; GFX900-NEXT:    s_addc_u32 s1, s1, 0
135; GFX900-NEXT:    v_mov_b32_e32 v0, 0
136; GFX900-NEXT:    s_getpc_b64 s[4:5]
137; GFX900-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
138; GFX900-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+12
139; GFX900-NEXT:    s_movk_i32 s32, 0x400
140; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
141; GFX900-NEXT:    s_waitcnt vmcnt(0)
142; GFX900-NEXT:    s_swappc_b64 s[30:31], s[4:5]
143; GFX900-NEXT:    s_endpgm
144;
145; GFX1010-LABEL: test_kern_stack_and_call:
146; GFX1010:       ; %bb.0: ; %entry
147; GFX1010-NEXT:    s_add_u32 s4, s4, s7
148; GFX1010-NEXT:    s_movk_i32 s32, 0x200
149; GFX1010-NEXT:    s_addc_u32 s5, s5, 0
150; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
151; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
152; GFX1010-NEXT:    v_mov_b32_e32 v0, 0
153; GFX1010-NEXT:    s_add_u32 s0, s0, s7
154; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
155; GFX1010-NEXT:    s_getpc_b64 s[4:5]
156; GFX1010-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
157; GFX1010-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+12
158; GFX1010-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
159; GFX1010-NEXT:    s_waitcnt_vscnt null, 0x0
160; GFX1010-NEXT:    s_swappc_b64 s[30:31], s[4:5]
161; GFX1010-NEXT:    s_endpgm
162entry:
  ; Local 4-byte private slot, written before the call to @ex.
163  %x = alloca i32, align 4, addrspace(5)
164  store volatile i32 0, i32 addrspace(5)* %x, align 4
165  tail call void @ex() #0
166  ret void
167}
168
; Same empty body as test_kern_empty, but built with attribute group #2
; ("frame-pointer"="all"). The only expected difference is that s33 is
; set to 0 before s_endpgm on every target.
169define amdgpu_kernel void @test_force_fp_kern_empty() local_unnamed_addr #2 {
170; GFX803-LABEL: test_force_fp_kern_empty:
171; GFX803:       ; %bb.0: ; %entry
172; GFX803-NEXT:    s_mov_b32 s33, 0
173; GFX803-NEXT:    s_endpgm
174;
175; GFX900-LABEL: test_force_fp_kern_empty:
176; GFX900:       ; %bb.0: ; %entry
177; GFX900-NEXT:    s_mov_b32 s33, 0
178; GFX900-NEXT:    s_endpgm
179;
180; GFX1010-LABEL: test_force_fp_kern_empty:
181; GFX1010:       ; %bb.0: ; %entry
182; GFX1010-NEXT:    s_mov_b32 s33, 0
183; GFX1010-NEXT:    s_endpgm
184entry:
185  ret void
186}
187
; Frame-pointer variant (attribute group #2) of test_kern_stack. The expected
; output sets s33 to 0 and uses s33, rather than a literal 0, as the SGPR
; offset operand of the buffer_store_dword that writes the local slot.
188define amdgpu_kernel void @test_force_fp_kern_stack() local_unnamed_addr #2 {
189; GFX803-LABEL: test_force_fp_kern_stack:
190; GFX803:       ; %bb.0: ; %entry
191; GFX803-NEXT:    s_add_i32 s4, s4, s7
192; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
193; GFX803-NEXT:    s_add_u32 s0, s0, s7
194; GFX803-NEXT:    s_mov_b32 s33, 0
195; GFX803-NEXT:    s_addc_u32 s1, s1, 0
196; GFX803-NEXT:    v_mov_b32_e32 v0, 0
197; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s5
198; GFX803-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4
199; GFX803-NEXT:    s_waitcnt vmcnt(0)
200; GFX803-NEXT:    s_endpgm
201;
202; GFX900-LABEL: test_force_fp_kern_stack:
203; GFX900:       ; %bb.0: ; %entry
204; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
205; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
206; GFX900-NEXT:    s_add_u32 s0, s0, s7
207; GFX900-NEXT:    s_mov_b32 s33, 0
208; GFX900-NEXT:    s_addc_u32 s1, s1, 0
209; GFX900-NEXT:    v_mov_b32_e32 v0, 0
210; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4
211; GFX900-NEXT:    s_waitcnt vmcnt(0)
212; GFX900-NEXT:    s_endpgm
213;
214; GFX1010-LABEL: test_force_fp_kern_stack:
215; GFX1010:       ; %bb.0: ; %entry
216; GFX1010-NEXT:    s_add_u32 s4, s4, s7
217; GFX1010-NEXT:    s_mov_b32 s33, 0
218; GFX1010-NEXT:    s_addc_u32 s5, s5, 0
219; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
220; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
221; GFX1010-NEXT:    v_mov_b32_e32 v0, 0
222; GFX1010-NEXT:    s_add_u32 s0, s0, s7
223; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
224; GFX1010-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4
225; GFX1010-NEXT:    s_waitcnt_vscnt null, 0x0
226; GFX1010-NEXT:    s_endpgm
227entry:
  ; Local 4-byte private slot; the expected store addresses it relative to s33.
228  %x = alloca i32, align 4, addrspace(5)
229  store volatile i32 0, i32 addrspace(5)* %x, align 4
230  ret void
231}
232
; Frame-pointer variant (attribute group #2) of test_kern_call: a call to @ex
; with no local stack object. The expected output sets both s32 and s33 to 0
; before the s_swappc_b64.
233define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 {
234; GFX803-LABEL: test_force_fp_kern_call:
235; GFX803:       ; %bb.0: ; %entry
236; GFX803-NEXT:    s_add_i32 s4, s4, s7
237; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
238; GFX803-NEXT:    s_add_u32 s0, s0, s7
239; GFX803-NEXT:    s_addc_u32 s1, s1, 0
240; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s5
241; GFX803-NEXT:    s_getpc_b64 s[4:5]
242; GFX803-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
243; GFX803-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+12
244; GFX803-NEXT:    s_mov_b32 s32, 0
245; GFX803-NEXT:    s_mov_b32 s33, 0
246; GFX803-NEXT:    s_swappc_b64 s[30:31], s[4:5]
247; GFX803-NEXT:    s_endpgm
248;
249; GFX900-LABEL: test_force_fp_kern_call:
250; GFX900:       ; %bb.0: ; %entry
251; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
252; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
253; GFX900-NEXT:    s_add_u32 s0, s0, s7
254; GFX900-NEXT:    s_addc_u32 s1, s1, 0
255; GFX900-NEXT:    s_getpc_b64 s[4:5]
256; GFX900-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
257; GFX900-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+12
258; GFX900-NEXT:    s_mov_b32 s32, 0
259; GFX900-NEXT:    s_mov_b32 s33, 0
260; GFX900-NEXT:    s_swappc_b64 s[30:31], s[4:5]
261; GFX900-NEXT:    s_endpgm
262;
263; GFX1010-LABEL: test_force_fp_kern_call:
264; GFX1010:       ; %bb.0: ; %entry
265; GFX1010-NEXT:    s_add_u32 s4, s4, s7
266; GFX1010-NEXT:    s_mov_b32 s32, 0
267; GFX1010-NEXT:    s_mov_b32 s33, 0
268; GFX1010-NEXT:    s_addc_u32 s5, s5, 0
269; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
270; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
271; GFX1010-NEXT:    s_add_u32 s0, s0, s7
272; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
273; GFX1010-NEXT:    s_getpc_b64 s[4:5]
274; GFX1010-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
275; GFX1010-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+12
276; GFX1010-NEXT:    s_swappc_b64 s[30:31], s[4:5]
277; GFX1010-NEXT:    s_endpgm
278entry:
279  tail call void @ex() #2
280  ret void
281}
282
; Frame-pointer variant (attribute group #2) of test_kern_stack_and_call: one
; private stack object plus a call to @ex. The expected output sets s33 to 0,
; addresses the local slot through s33, and writes a non-zero value to s32
; (0x400 on gfx803/gfx900, 0x200 on gfx1010) before the s_swappc_b64.
283define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_addr #2 {
284; GFX803-LABEL: test_force_fp_kern_stack_and_call:
285; GFX803:       ; %bb.0: ; %entry
286; GFX803-NEXT:    s_add_i32 s4, s4, s7
287; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
288; GFX803-NEXT:    s_add_u32 s0, s0, s7
289; GFX803-NEXT:    s_mov_b32 s33, 0
290; GFX803-NEXT:    s_addc_u32 s1, s1, 0
291; GFX803-NEXT:    v_mov_b32_e32 v0, 0
292; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s5
293; GFX803-NEXT:    s_getpc_b64 s[4:5]
294; GFX803-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
295; GFX803-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+12
296; GFX803-NEXT:    s_movk_i32 s32, 0x400
297; GFX803-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4
298; GFX803-NEXT:    s_waitcnt vmcnt(0)
299; GFX803-NEXT:    s_swappc_b64 s[30:31], s[4:5]
300; GFX803-NEXT:    s_endpgm
301;
302; GFX900-LABEL: test_force_fp_kern_stack_and_call:
303; GFX900:       ; %bb.0: ; %entry
304; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
305; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
306; GFX900-NEXT:    s_add_u32 s0, s0, s7
307; GFX900-NEXT:    s_addc_u32 s1, s1, 0
308; GFX900-NEXT:    s_mov_b32 s33, 0
309; GFX900-NEXT:    v_mov_b32_e32 v0, 0
310; GFX900-NEXT:    s_getpc_b64 s[4:5]
311; GFX900-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
312; GFX900-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+12
313; GFX900-NEXT:    s_movk_i32 s32, 0x400
314; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4
315; GFX900-NEXT:    s_waitcnt vmcnt(0)
316; GFX900-NEXT:    s_swappc_b64 s[30:31], s[4:5]
317; GFX900-NEXT:    s_endpgm
318;
319; GFX1010-LABEL: test_force_fp_kern_stack_and_call:
320; GFX1010:       ; %bb.0: ; %entry
321; GFX1010-NEXT:    s_add_u32 s4, s4, s7
322; GFX1010-NEXT:    s_movk_i32 s32, 0x200
323; GFX1010-NEXT:    s_mov_b32 s33, 0
324; GFX1010-NEXT:    s_addc_u32 s5, s5, 0
325; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
326; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
327; GFX1010-NEXT:    v_mov_b32_e32 v0, 0
328; GFX1010-NEXT:    s_add_u32 s0, s0, s7
329; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
330; GFX1010-NEXT:    s_getpc_b64 s[4:5]
331; GFX1010-NEXT:    s_add_u32 s4, s4, ex@rel32@lo+4
332; GFX1010-NEXT:    s_addc_u32 s5, s5, ex@rel32@hi+12
333; GFX1010-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4
334; GFX1010-NEXT:    s_waitcnt_vscnt null, 0x0
335; GFX1010-NEXT:    s_swappc_b64 s[30:31], s[4:5]
336; GFX1010-NEXT:    s_endpgm
337entry:
  ; Local 4-byte private slot, written before the call to @ex.
338  %x = alloca i32, align 4, addrspace(5)
339  store volatile i32 0, i32 addrspace(5)* %x, align 4
340  tail call void @ex() #2
341  ret void
342}
343
; Spill whose stack offset must be carried in an SGPR. The kernel uses
; attribute group #1 ("amdgpu-num-vgpr"="8") and an inline asm that clobbers
; v0-v7, so the loaded value %a has to be spilled around the asm. A 4092-byte
; alloca sits in front of the spill slot; the expected output places the
; scaled slot offset (0x40000 on gfx803/gfx900, 0x20000 on gfx1010) in an SGPR
; with s_mov_b32 and uses that SGPR as the offset operand of the
; "Folded Spill" store and "Folded Reload" load.
344define amdgpu_kernel void @test_sgpr_offset_kernel() #1 {
345; GFX803-LABEL: test_sgpr_offset_kernel:
346; GFX803:       ; %bb.0: ; %entry
347; GFX803-NEXT:    s_add_i32 s4, s4, s7
348; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
349; GFX803-NEXT:    s_add_u32 s0, s0, s7
350; GFX803-NEXT:    s_addc_u32 s1, s1, 0
351; GFX803-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
352; GFX803-NEXT:    s_waitcnt vmcnt(0)
353; GFX803-NEXT:    s_mov_b32 s4, 0x40000
354; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s5
355; GFX803-NEXT:    buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
356; GFX803-NEXT:    ;;#ASMSTART
357; GFX803-NEXT:    ;;#ASMEND
358; GFX803-NEXT:    s_mov_b32 s4, 0x40000
359; GFX803-NEXT:    buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
360; GFX803-NEXT:    s_waitcnt vmcnt(0)
361; GFX803-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:8
362; GFX803-NEXT:    s_waitcnt vmcnt(0)
363; GFX803-NEXT:    s_endpgm
364;
365; GFX900-LABEL: test_sgpr_offset_kernel:
366; GFX900:       ; %bb.0: ; %entry
367; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
368; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
369; GFX900-NEXT:    s_add_u32 s0, s0, s7
370; GFX900-NEXT:    s_addc_u32 s1, s1, 0
371; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
372; GFX900-NEXT:    s_waitcnt vmcnt(0)
373; GFX900-NEXT:    s_mov_b32 s6, 0x40000
374; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
375; GFX900-NEXT:    ;;#ASMSTART
376; GFX900-NEXT:    ;;#ASMEND
377; GFX900-NEXT:    s_mov_b32 s6, 0x40000
378; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload
379; GFX900-NEXT:    s_waitcnt vmcnt(0)
380; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:8
381; GFX900-NEXT:    s_waitcnt vmcnt(0)
382; GFX900-NEXT:    s_endpgm
383;
384; GFX1010-LABEL: test_sgpr_offset_kernel:
385; GFX1010:       ; %bb.0: ; %entry
386; GFX1010-NEXT:    s_add_u32 s4, s4, s7
387; GFX1010-NEXT:    s_addc_u32 s5, s5, 0
388; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
389; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
390; GFX1010-NEXT:    s_add_u32 s0, s0, s7
391; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
392; GFX1010-NEXT:    s_mov_b32 s6, 0x20000
393; GFX1010-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8 glc dlc
394; GFX1010-NEXT:    s_waitcnt vmcnt(0)
395; GFX1010-NEXT:    buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
396; GFX1010-NEXT:    s_waitcnt_depctr 0xffe3
397; GFX1010-NEXT:    s_mov_b32 s6, 0x20000
398; GFX1010-NEXT:    ;;#ASMSTART
399; GFX1010-NEXT:    ;;#ASMEND
400; GFX1010-NEXT:    buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload
401; GFX1010-NEXT:    s_waitcnt vmcnt(0)
402; GFX1010-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:8
403; GFX1010-NEXT:    s_waitcnt_vscnt null, 0x0
404; GFX1010-NEXT:    s_endpgm
405entry:
406  ; The 4092-byte alloca pushes the spill slot of %a out to byte offset 4096,
407  ; which does not fit in the instruction, so it has to live in the SGPR offset.
408  %alloca = alloca i8, i32 4092, align 4, addrspace(5)
409  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
410
411  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
412  ; 0x40000 / 64 = 4096 (for wave64)
413  ; The expected output materializes that scaled offset with s_mov_b32 into an
414  ; SGPR used as the soffset of the spill store and reload (0x20000 on gfx1010).
415  %a = load volatile i32, i32 addrspace(5)* %aptr
416
417  ; Force %a to spill
418  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
419
  ; Same element as %aptr (index 1 of %buf): the reloaded value is stored back.
420  %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
421  store volatile i32 %a, i32 addrspace(5)* %outptr
422
423  ret void
424}
425
; External callee used by the *_call and *_stack_and_call kernels; it is only
; declared here, so the kernels reach it through a real call sequence.
426declare hidden void @ex() local_unnamed_addr #0
427
; #0: default attributes for the plain kernels and for @ex.
; #1: limits the kernel to 8 VGPRs; used by test_sgpr_offset_kernel together
;     with an inline asm clobbering v0-v7 to force a spill.
; #2: requests a frame pointer everywhere; used by the test_force_fp_* kernels.
428attributes #0 = { nounwind }
429attributes #1 = { nounwind "amdgpu-num-vgpr"="8" }
430attributes #2 = { nounwind "frame-pointer"="all" }
431