1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck --check-prefix=GFX803 %s
3; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX900 %s
4; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX1010 %s
5
; Baseline case: an amdgpu_kernel whose body is only `ret void`, with no
; stack object and no call. On all three targets the assertions below expect
; the kernel to consist of a single s_endpgm.
6define amdgpu_kernel void @test_kern_empty() local_unnamed_addr #0 {
7; GFX803-LABEL: test_kern_empty:
8; GFX803:       ; %bb.0: ; %entry
9; GFX803-NEXT:    s_endpgm
10;
11; GFX900-LABEL: test_kern_empty:
12; GFX900:       ; %bb.0: ; %entry
13; GFX900-NEXT:    s_endpgm
14;
15; GFX1010-LABEL: test_kern_empty:
16; GFX1010:       ; %bb.0: ; %entry
17; GFX1010-NEXT:    s_endpgm
18entry:
19  ret void
20}
21
; Kernel with one private (addrspace(5)) i32 alloca that receives a volatile
; store of 0, and no call. The assertions expect s[0:1] to be advanced by s7
; (add with carry) and the store to be emitted as a buffer_store_dword through
; s[0:3] at offset:4 with a literal 0 as the scalar offset.
; NOTE(review): s7 is presumably the per-wave scratch offset - confirm against
; the AMDGPU kernel ABI.
22define amdgpu_kernel void @test_kern_stack() local_unnamed_addr #0 {
23; GFX803-LABEL: test_kern_stack:
24; GFX803:       ; %bb.0: ; %entry
25; GFX803-NEXT:    s_add_u32 s0, s0, s7
26; GFX803-NEXT:    s_addc_u32 s1, s1, 0
27; GFX803-NEXT:    v_mov_b32_e32 v0, 0
28; GFX803-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
29; GFX803-NEXT:    s_waitcnt vmcnt(0)
30; GFX803-NEXT:    s_endpgm
31;
32; GFX900-LABEL: test_kern_stack:
33; GFX900:       ; %bb.0: ; %entry
34; GFX900-NEXT:    s_add_u32 s0, s0, s7
35; GFX900-NEXT:    s_addc_u32 s1, s1, 0
36; GFX900-NEXT:    v_mov_b32_e32 v0, 0
37; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
38; GFX900-NEXT:    s_waitcnt vmcnt(0)
39; GFX900-NEXT:    s_endpgm
40;
41; GFX1010-LABEL: test_kern_stack:
42; GFX1010:       ; %bb.0: ; %entry
43; GFX1010-NEXT:    v_mov_b32_e32 v0, 0
44; GFX1010-NEXT:    s_add_u32 s0, s0, s7
45; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
46; GFX1010-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
47; GFX1010-NEXT:    s_waitcnt_vscnt null, 0x0
48; GFX1010-NEXT:    s_endpgm
49entry:
50  %x = alloca i32, align 4, addrspace(5)
51  store volatile i32 0, i32 addrspace(5)* %x, align 4
52  ret void
53}
54
; Kernel whose only work is a tail call to the external function @ex; it has
; no stack object of its own. The assertions expect, before the call:
;   - flat_scratch to be initialised from s12/s13 and s17 (the exact sequence
;     differs per target),
;   - s[0:1] to be advanced by s17,
;   - v0, v1 << 10 and v2 << 20 to be OR-ed together into v31,
;   - s32 to be set to 0,
; followed by a pc-relative address computation for ex and s_swappc_b64.
55define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 {
56; GFX803-LABEL: test_kern_call:
57; GFX803:       ; %bb.0: ; %entry
58; GFX803-NEXT:    s_add_i32 s12, s12, s17
59; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
60; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
61; GFX803-NEXT:    s_add_u32 s0, s0, s17
62; GFX803-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
63; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
64; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s13
65; GFX803-NEXT:    s_addc_u32 s1, s1, 0
66; GFX803-NEXT:    s_mov_b32 s13, s15
67; GFX803-NEXT:    s_mov_b32 s12, s14
68; GFX803-NEXT:    v_or_b32_e32 v31, v0, v2
69; GFX803-NEXT:    s_mov_b32 s14, s16
70; GFX803-NEXT:    s_mov_b32 s32, 0
71; GFX803-NEXT:    s_getpc_b64 s[18:19]
72; GFX803-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
73; GFX803-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
74; GFX803-NEXT:    s_swappc_b64 s[30:31], s[18:19]
75; GFX803-NEXT:    s_endpgm
76;
77; GFX900-LABEL: test_kern_call:
78; GFX900:       ; %bb.0: ; %entry
79; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
80; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
81; GFX900-NEXT:    s_add_u32 s0, s0, s17
82; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
83; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
84; GFX900-NEXT:    s_addc_u32 s1, s1, 0
85; GFX900-NEXT:    s_mov_b32 s13, s15
86; GFX900-NEXT:    s_mov_b32 s12, s14
87; GFX900-NEXT:    v_or3_b32 v31, v0, v1, v2
88; GFX900-NEXT:    s_mov_b32 s14, s16
89; GFX900-NEXT:    s_mov_b32 s32, 0
90; GFX900-NEXT:    s_getpc_b64 s[18:19]
91; GFX900-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
92; GFX900-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
93; GFX900-NEXT:    s_swappc_b64 s[30:31], s[18:19]
94; GFX900-NEXT:    s_endpgm
95;
96; GFX1010-LABEL: test_kern_call:
97; GFX1010:       ; %bb.0: ; %entry
98; GFX1010-NEXT:    s_add_u32 s12, s12, s17
99; GFX1010-NEXT:    s_mov_b32 s32, 0
100; GFX1010-NEXT:    s_addc_u32 s13, s13, 0
101; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
102; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
103; GFX1010-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
104; GFX1010-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
105; GFX1010-NEXT:    s_add_u32 s0, s0, s17
106; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
107; GFX1010-NEXT:    s_mov_b32 s13, s15
108; GFX1010-NEXT:    s_mov_b32 s12, s14
109; GFX1010-NEXT:    v_or3_b32 v31, v0, v1, v2
110; GFX1010-NEXT:    s_mov_b32 s14, s16
111; GFX1010-NEXT:    s_getpc_b64 s[18:19]
112; GFX1010-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
113; GFX1010-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
114; GFX1010-NEXT:    s_swappc_b64 s[30:31], s[18:19]
115; GFX1010-NEXT:    s_endpgm
116entry:
117  tail call void @ex() #0
118  ret void
119}
120
; Combines the two previous cases: a volatile store of 0 to a private i32
; alloca followed by a tail call to @ex. Compared with the call-only kernel,
; the assertions expect s32 to start at a non-zero value (0x400 in the GFX803
; and GFX900 blocks, 0x200 in the GFX1010 block) and the buffer_store_dword at
; offset:4 plus its wait to be emitted before the call sequence.
; NOTE(review): the 0x400 vs 0x200 difference presumably reflects the wave
; size used to scale the stack pointer - confirm.
121define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
122; GFX803-LABEL: test_kern_stack_and_call:
123; GFX803:       ; %bb.0: ; %entry
124; GFX803-NEXT:    s_add_i32 s12, s12, s17
125; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
126; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
127; GFX803-NEXT:    s_add_u32 s0, s0, s17
128; GFX803-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
129; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
130; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s13
131; GFX803-NEXT:    s_addc_u32 s1, s1, 0
132; GFX803-NEXT:    s_mov_b32 s13, s15
133; GFX803-NEXT:    s_mov_b32 s12, s14
134; GFX803-NEXT:    v_mov_b32_e32 v3, 0
135; GFX803-NEXT:    v_or_b32_e32 v31, v0, v2
136; GFX803-NEXT:    s_mov_b32 s14, s16
137; GFX803-NEXT:    s_movk_i32 s32, 0x400
138; GFX803-NEXT:    buffer_store_dword v3, off, s[0:3], 0 offset:4
139; GFX803-NEXT:    s_waitcnt vmcnt(0)
140; GFX803-NEXT:    s_getpc_b64 s[18:19]
141; GFX803-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
142; GFX803-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
143; GFX803-NEXT:    s_swappc_b64 s[30:31], s[18:19]
144; GFX803-NEXT:    s_endpgm
145;
146; GFX900-LABEL: test_kern_stack_and_call:
147; GFX900:       ; %bb.0: ; %entry
148; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
149; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
150; GFX900-NEXT:    s_add_u32 s0, s0, s17
151; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
152; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
153; GFX900-NEXT:    s_addc_u32 s1, s1, 0
154; GFX900-NEXT:    s_mov_b32 s13, s15
155; GFX900-NEXT:    s_mov_b32 s12, s14
156; GFX900-NEXT:    v_mov_b32_e32 v3, 0
157; GFX900-NEXT:    v_or3_b32 v31, v0, v1, v2
158; GFX900-NEXT:    s_mov_b32 s14, s16
159; GFX900-NEXT:    s_movk_i32 s32, 0x400
160; GFX900-NEXT:    buffer_store_dword v3, off, s[0:3], 0 offset:4
161; GFX900-NEXT:    s_waitcnt vmcnt(0)
162; GFX900-NEXT:    s_getpc_b64 s[18:19]
163; GFX900-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
164; GFX900-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
165; GFX900-NEXT:    s_swappc_b64 s[30:31], s[18:19]
166; GFX900-NEXT:    s_endpgm
167;
168; GFX1010-LABEL: test_kern_stack_and_call:
169; GFX1010:       ; %bb.0: ; %entry
170; GFX1010-NEXT:    s_add_u32 s12, s12, s17
171; GFX1010-NEXT:    s_movk_i32 s32, 0x200
172; GFX1010-NEXT:    s_addc_u32 s13, s13, 0
173; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
174; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
175; GFX1010-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
176; GFX1010-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
177; GFX1010-NEXT:    v_mov_b32_e32 v3, 0
178; GFX1010-NEXT:    s_add_u32 s0, s0, s17
179; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
180; GFX1010-NEXT:    s_mov_b32 s13, s15
181; GFX1010-NEXT:    v_or3_b32 v31, v0, v1, v2
182; GFX1010-NEXT:    s_mov_b32 s12, s14
183; GFX1010-NEXT:    s_mov_b32 s14, s16
184; GFX1010-NEXT:    buffer_store_dword v3, off, s[0:3], 0 offset:4
185; GFX1010-NEXT:    s_waitcnt_vscnt null, 0x0
186; GFX1010-NEXT:    s_getpc_b64 s[18:19]
187; GFX1010-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
188; GFX1010-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
189; GFX1010-NEXT:    s_swappc_b64 s[30:31], s[18:19]
190; GFX1010-NEXT:    s_endpgm
191entry:
192  %x = alloca i32, align 4, addrspace(5)
193  store volatile i32 0, i32 addrspace(5)* %x, align 4
194  tail call void @ex() #0
195  ret void
196}
197
; Same empty body as test_kern_empty, but built with attribute group #2
; ("frame-pointer"="all"). The only expected difference in the output is an
; extra `s_mov_b32 s33, 0` before s_endpgm on every target.
198define amdgpu_kernel void @test_force_fp_kern_empty() local_unnamed_addr #2 {
199; GFX803-LABEL: test_force_fp_kern_empty:
200; GFX803:       ; %bb.0: ; %entry
201; GFX803-NEXT:    s_mov_b32 s33, 0
202; GFX803-NEXT:    s_endpgm
203;
204; GFX900-LABEL: test_force_fp_kern_empty:
205; GFX900:       ; %bb.0: ; %entry
206; GFX900-NEXT:    s_mov_b32 s33, 0
207; GFX900-NEXT:    s_endpgm
208;
209; GFX1010-LABEL: test_force_fp_kern_empty:
210; GFX1010:       ; %bb.0: ; %entry
211; GFX1010-NEXT:    s_mov_b32 s33, 0
212; GFX1010-NEXT:    s_endpgm
213entry:
214  ret void
215}
216
; Same body as test_kern_stack (one volatile store of 0 to a private i32
; alloca), but built with attribute group #2 ("frame-pointer"="all"). The
; assertions expect s33 to be zeroed and then used as the scalar offset
; operand of the buffer_store_dword, in place of the literal 0 seen without
; the attribute.
217define amdgpu_kernel void @test_force_fp_kern_stack() local_unnamed_addr #2 {
218; GFX803-LABEL: test_force_fp_kern_stack:
219; GFX803:       ; %bb.0: ; %entry
220; GFX803-NEXT:    s_add_u32 s0, s0, s7
221; GFX803-NEXT:    s_mov_b32 s33, 0
222; GFX803-NEXT:    s_addc_u32 s1, s1, 0
223; GFX803-NEXT:    v_mov_b32_e32 v0, 0
224; GFX803-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4
225; GFX803-NEXT:    s_waitcnt vmcnt(0)
226; GFX803-NEXT:    s_endpgm
227;
228; GFX900-LABEL: test_force_fp_kern_stack:
229; GFX900:       ; %bb.0: ; %entry
230; GFX900-NEXT:    s_add_u32 s0, s0, s7
231; GFX900-NEXT:    s_mov_b32 s33, 0
232; GFX900-NEXT:    s_addc_u32 s1, s1, 0
233; GFX900-NEXT:    v_mov_b32_e32 v0, 0
234; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4
235; GFX900-NEXT:    s_waitcnt vmcnt(0)
236; GFX900-NEXT:    s_endpgm
237;
238; GFX1010-LABEL: test_force_fp_kern_stack:
239; GFX1010:       ; %bb.0: ; %entry
240; GFX1010-NEXT:    v_mov_b32_e32 v0, 0
241; GFX1010-NEXT:    s_add_u32 s0, s0, s7
242; GFX1010-NEXT:    s_mov_b32 s33, 0
243; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
244; GFX1010-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4
245; GFX1010-NEXT:    s_waitcnt_vscnt null, 0x0
246; GFX1010-NEXT:    s_endpgm
247entry:
248  %x = alloca i32, align 4, addrspace(5)
249  store volatile i32 0, i32 addrspace(5)* %x, align 4
250  ret void
251}
252
; Same body as test_kern_call (a single tail call to @ex), but both the kernel
; and the call site carry attribute group #2 ("frame-pointer"="all"). The
; assertions expect the call-only sequence plus an extra `s_mov_b32 s33, 0`
; next to the `s_mov_b32 s32, 0`.
253define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 {
254; GFX803-LABEL: test_force_fp_kern_call:
255; GFX803:       ; %bb.0: ; %entry
256; GFX803-NEXT:    s_add_i32 s12, s12, s17
257; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
258; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
259; GFX803-NEXT:    s_add_u32 s0, s0, s17
260; GFX803-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
261; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
262; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s13
263; GFX803-NEXT:    s_addc_u32 s1, s1, 0
264; GFX803-NEXT:    s_mov_b32 s13, s15
265; GFX803-NEXT:    s_mov_b32 s12, s14
266; GFX803-NEXT:    v_or_b32_e32 v31, v0, v2
267; GFX803-NEXT:    s_mov_b32 s14, s16
268; GFX803-NEXT:    s_mov_b32 s32, 0
269; GFX803-NEXT:    s_mov_b32 s33, 0
270; GFX803-NEXT:    s_getpc_b64 s[18:19]
271; GFX803-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
272; GFX803-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
273; GFX803-NEXT:    s_swappc_b64 s[30:31], s[18:19]
274; GFX803-NEXT:    s_endpgm
275;
276; GFX900-LABEL: test_force_fp_kern_call:
277; GFX900:       ; %bb.0: ; %entry
278; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
279; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
280; GFX900-NEXT:    s_add_u32 s0, s0, s17
281; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
282; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
283; GFX900-NEXT:    s_addc_u32 s1, s1, 0
284; GFX900-NEXT:    s_mov_b32 s13, s15
285; GFX900-NEXT:    s_mov_b32 s12, s14
286; GFX900-NEXT:    v_or3_b32 v31, v0, v1, v2
287; GFX900-NEXT:    s_mov_b32 s14, s16
288; GFX900-NEXT:    s_mov_b32 s32, 0
289; GFX900-NEXT:    s_mov_b32 s33, 0
290; GFX900-NEXT:    s_getpc_b64 s[18:19]
291; GFX900-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
292; GFX900-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
293; GFX900-NEXT:    s_swappc_b64 s[30:31], s[18:19]
294; GFX900-NEXT:    s_endpgm
295;
296; GFX1010-LABEL: test_force_fp_kern_call:
297; GFX1010:       ; %bb.0: ; %entry
298; GFX1010-NEXT:    s_add_u32 s12, s12, s17
299; GFX1010-NEXT:    s_mov_b32 s32, 0
300; GFX1010-NEXT:    s_mov_b32 s33, 0
301; GFX1010-NEXT:    s_addc_u32 s13, s13, 0
302; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
303; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
304; GFX1010-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
305; GFX1010-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
306; GFX1010-NEXT:    s_add_u32 s0, s0, s17
307; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
308; GFX1010-NEXT:    s_mov_b32 s13, s15
309; GFX1010-NEXT:    s_mov_b32 s12, s14
310; GFX1010-NEXT:    v_or3_b32 v31, v0, v1, v2
311; GFX1010-NEXT:    s_mov_b32 s14, s16
312; GFX1010-NEXT:    s_getpc_b64 s[18:19]
313; GFX1010-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
314; GFX1010-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
315; GFX1010-NEXT:    s_swappc_b64 s[30:31], s[18:19]
316; GFX1010-NEXT:    s_endpgm
317entry:
318  tail call void @ex() #2
319  ret void
320}
321
; Same body as test_kern_stack_and_call (volatile store of 0 to a private i32
; alloca, then a tail call to @ex), built with attribute group #2
; ("frame-pointer"="all"). The assertions expect s33 to be zeroed and used as
; the scalar offset operand of the store, s32 to start at 0x400 (0x200 in the
; GFX1010 block), and the store plus its wait to precede the call sequence.
322define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_addr #2 {
323; GFX803-LABEL: test_force_fp_kern_stack_and_call:
324; GFX803:       ; %bb.0: ; %entry
325; GFX803-NEXT:    s_add_i32 s12, s12, s17
326; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
327; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
328; GFX803-NEXT:    s_add_u32 s0, s0, s17
329; GFX803-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
330; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
331; GFX803-NEXT:    s_mov_b32 s33, 0
332; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s13
333; GFX803-NEXT:    s_addc_u32 s1, s1, 0
334; GFX803-NEXT:    s_mov_b32 s13, s15
335; GFX803-NEXT:    s_mov_b32 s12, s14
336; GFX803-NEXT:    v_mov_b32_e32 v3, 0
337; GFX803-NEXT:    v_or_b32_e32 v31, v0, v2
338; GFX803-NEXT:    s_mov_b32 s14, s16
339; GFX803-NEXT:    s_movk_i32 s32, 0x400
340; GFX803-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:4
341; GFX803-NEXT:    s_waitcnt vmcnt(0)
342; GFX803-NEXT:    s_getpc_b64 s[18:19]
343; GFX803-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
344; GFX803-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
345; GFX803-NEXT:    s_swappc_b64 s[30:31], s[18:19]
346; GFX803-NEXT:    s_endpgm
347;
348; GFX900-LABEL: test_force_fp_kern_stack_and_call:
349; GFX900:       ; %bb.0: ; %entry
350; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
351; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
352; GFX900-NEXT:    s_add_u32 s0, s0, s17
353; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
354; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
355; GFX900-NEXT:    s_mov_b32 s33, 0
356; GFX900-NEXT:    s_addc_u32 s1, s1, 0
357; GFX900-NEXT:    s_mov_b32 s13, s15
358; GFX900-NEXT:    s_mov_b32 s12, s14
359; GFX900-NEXT:    v_mov_b32_e32 v3, 0
360; GFX900-NEXT:    v_or3_b32 v31, v0, v1, v2
361; GFX900-NEXT:    s_mov_b32 s14, s16
362; GFX900-NEXT:    s_movk_i32 s32, 0x400
363; GFX900-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:4
364; GFX900-NEXT:    s_waitcnt vmcnt(0)
365; GFX900-NEXT:    s_getpc_b64 s[18:19]
366; GFX900-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
367; GFX900-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
368; GFX900-NEXT:    s_swappc_b64 s[30:31], s[18:19]
369; GFX900-NEXT:    s_endpgm
370;
371; GFX1010-LABEL: test_force_fp_kern_stack_and_call:
372; GFX1010:       ; %bb.0: ; %entry
373; GFX1010-NEXT:    s_add_u32 s12, s12, s17
374; GFX1010-NEXT:    s_movk_i32 s32, 0x200
375; GFX1010-NEXT:    s_mov_b32 s33, 0
376; GFX1010-NEXT:    s_addc_u32 s13, s13, 0
377; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
378; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
379; GFX1010-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
380; GFX1010-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
381; GFX1010-NEXT:    v_mov_b32_e32 v3, 0
382; GFX1010-NEXT:    s_add_u32 s0, s0, s17
383; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
384; GFX1010-NEXT:    s_mov_b32 s13, s15
385; GFX1010-NEXT:    v_or3_b32 v31, v0, v1, v2
386; GFX1010-NEXT:    s_mov_b32 s12, s14
387; GFX1010-NEXT:    s_mov_b32 s14, s16
388; GFX1010-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:4
389; GFX1010-NEXT:    s_waitcnt_vscnt null, 0x0
390; GFX1010-NEXT:    s_getpc_b64 s[18:19]
391; GFX1010-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
392; GFX1010-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
393; GFX1010-NEXT:    s_swappc_b64 s[30:31], s[18:19]
394; GFX1010-NEXT:    s_endpgm
395entry:
396  %x = alloca i32, align 4, addrspace(5)
397  store volatile i32 0, i32 addrspace(5)* %x, align 4
398  tail call void @ex() #2
399  ret void
400}
401
; Checks that a VGPR spill slot placed beyond a large private alloca is
; addressed through an SGPR offset rather than the instruction's immediate
; offset field.
;
; The kernel uses attribute group #1 ("amdgpu-num-vgpr"="8"). It allocates
; 4092 bytes of private memory, does a volatile load of the i32 at byte 4 of
; that buffer, runs an empty inline asm that clobbers v0-v7 so the loaded
; value has to be spilled, and then stores the value back to the same
; location.
;
; The assertions expect the spill and the reload to use s4 as the scalar
; offset, with s4 materialised as 0x40000 in the GFX803 and GFX900 blocks and
; as 0x20000 in the GFX1010 block. The volatile load and store themselves use
; offset:8.
; NOTE(review): 0x20000 / 32 = 4096, so the GFX1010 value is presumably the
; same byte offset scaled for wave32 - confirm.
402define amdgpu_kernel void @test_sgpr_offset_kernel() #1 {
403; GFX803-LABEL: test_sgpr_offset_kernel:
404; GFX803:       ; %bb.0: ; %entry
405; GFX803-NEXT:    s_add_u32 s0, s0, s7
406; GFX803-NEXT:    s_addc_u32 s1, s1, 0
407; GFX803-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
408; GFX803-NEXT:    s_waitcnt vmcnt(0)
409; GFX803-NEXT:    s_mov_b32 s4, 0x40000
410; GFX803-NEXT:    buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
411; GFX803-NEXT:    ;;#ASMSTART
412; GFX803-NEXT:    ;;#ASMEND
413; GFX803-NEXT:    s_mov_b32 s4, 0x40000
414; GFX803-NEXT:    buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
415; GFX803-NEXT:    s_waitcnt vmcnt(0)
416; GFX803-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:8
417; GFX803-NEXT:    s_waitcnt vmcnt(0)
418; GFX803-NEXT:    s_endpgm
419;
420; GFX900-LABEL: test_sgpr_offset_kernel:
421; GFX900:       ; %bb.0: ; %entry
422; GFX900-NEXT:    s_add_u32 s0, s0, s7
423; GFX900-NEXT:    s_addc_u32 s1, s1, 0
424; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
425; GFX900-NEXT:    s_waitcnt vmcnt(0)
426; GFX900-NEXT:    s_mov_b32 s4, 0x40000
427; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
428; GFX900-NEXT:    ;;#ASMSTART
429; GFX900-NEXT:    ;;#ASMEND
430; GFX900-NEXT:    s_mov_b32 s4, 0x40000
431; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
432; GFX900-NEXT:    s_waitcnt vmcnt(0)
433; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:8
434; GFX900-NEXT:    s_waitcnt vmcnt(0)
435; GFX900-NEXT:    s_endpgm
436;
437; GFX1010-LABEL: test_sgpr_offset_kernel:
438; GFX1010:       ; %bb.0: ; %entry
439; GFX1010-NEXT:    s_add_u32 s0, s0, s7
440; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
441; GFX1010-NEXT:    s_mov_b32 s4, 0x20000
442; GFX1010-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8 glc dlc
443; GFX1010-NEXT:    s_waitcnt vmcnt(0)
444; GFX1010-NEXT:    buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
445; GFX1010-NEXT:    s_waitcnt_depctr 0xffe3
446; GFX1010-NEXT:    s_mov_b32 s4, 0x20000
447; GFX1010-NEXT:    ;;#ASMSTART
448; GFX1010-NEXT:    ;;#ASMEND
449; GFX1010-NEXT:    buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
450; GFX1010-NEXT:    s_waitcnt vmcnt(0)
451; GFX1010-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:8
452; GFX1010-NEXT:    s_waitcnt_vscnt null, 0x0
453; GFX1010-NEXT:    s_endpgm
454entry:
455  ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
456  ; fit in the instruction, and has to live in the SGPR offset.
457  %alloca = alloca i8, i32 4092, align 4, addrspace(5)
458  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*
459
460  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
461  ; 0x40000 / 64 = 4096 (for wave64)
  ; NOTE(review): the two directives below use the CHECK prefix, which none of
  ; the run lines at the top of this file enable (they only pass GFX803,
  ; GFX900 and GFX1010), so FileCheck never evaluates them. They also describe
  ; an s_add_u32 into s6, whereas the autogenerated assertions above expect
  ; s_mov_b32 s4, 0x40000. Treat them as historical; consider removing them.
462  ; CHECK: s_add_u32 s6, s7, 0x40000
463  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 ; 4-byte Folded Spill
464  %a = load volatile i32, i32 addrspace(5)* %aptr
465
466  ; Force %a to spill
467  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()
468
469  %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
470  store volatile i32 %a, i32 addrspace(5)* %outptr
471
472  ret void
473}
474
; External function with hidden visibility, declared here without a body. It
; is the callee of every kernel in this file that contains a call.
475declare hidden void @ex() local_unnamed_addr #0
476
; Attribute groups referenced by the definitions above:
;   #0 - nounwind only; used by the plain test_kern_* kernels and by @ex.
;   #1 - nounwind plus "amdgpu-num-vgpr"="8"; used by test_sgpr_offset_kernel.
;   #2 - nounwind plus "frame-pointer"="all"; used by the test_force_fp_*
;        kernels and their call sites.
477attributes #0 = { nounwind }
478attributes #1 = { nounwind "amdgpu-num-vgpr"="8" }
479attributes #2 = { nounwind "frame-pointer"="all" }
480