1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck --check-prefix=GFX803 %s
3; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX900 %s
4; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX1010 %s
5
; Baseline: a kernel with no stack objects and no calls lowers to a bare
; s_endpgm on all three targets — no flat-scratch or stack-pointer setup.
define amdgpu_kernel void @test_kern_empty() local_unnamed_addr #0 {
; GFX803-LABEL: test_kern_empty:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_endpgm
;
; GFX900-LABEL: test_kern_empty:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_endpgm
;
; GFX1010-LABEL: test_kern_empty:
; GFX1010:       ; %bb.0: ; %entry
; GFX1010-NEXT:    s_endpgm
entry:
  ret void
}
21
; Stack object, no call: the prologue must initialize flat scratch and the
; scratch SRD base (s[0:1]) so the volatile store reaches scratch memory,
; but no stack pointer (s32) or frame pointer (s33) is required.
define amdgpu_kernel void @test_kern_stack() local_unnamed_addr #0 {
; GFX803-LABEL: test_kern_stack:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_add_u32 s4, s4, s7
; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
; GFX803-NEXT:    s_add_u32 s0, s0, s7
; GFX803-NEXT:    s_addc_u32 s1, s1, 0
; GFX803-NEXT:    v_mov_b32_e32 v0, 0
; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s5
; GFX803-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
; GFX803-NEXT:    s_waitcnt vmcnt(0)
; GFX803-NEXT:    s_endpgm
;
; GFX900-LABEL: test_kern_stack:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
; GFX900-NEXT:    s_add_u32 s0, s0, s7
; GFX900-NEXT:    s_addc_u32 s1, s1, 0
; GFX900-NEXT:    v_mov_b32_e32 v0, 0
; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_endpgm
;
; GFX1010-LABEL: test_kern_stack:
; GFX1010:       ; %bb.0: ; %entry
; GFX1010-NEXT:    s_add_u32 s4, s4, s7
; GFX1010-NEXT:    s_addc_u32 s5, s5, 0
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
; GFX1010-NEXT:    v_mov_b32_e32 v0, 0
; GFX1010-NEXT:    s_add_u32 s0, s0, s7
; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
; GFX1010-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
; GFX1010-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-NEXT:    s_endpgm
entry:
  %x = alloca i32, align 4, addrspace(5)
  store volatile i32 0, i32 addrspace(5)* %x, align 4
  ret void
}
63
; Call, no stack objects: flat scratch must be initialized and the stack
; pointer (s32) zeroed before s_swappc_b64 transfers to @ex.
; Fix: the separator before the GFX1010 block was a blank line instead of
; the lone ';' the autogenerated format uses everywhere else in this file;
; GFX1010 '-NEXT:' padding is also normalized to match the other prefixes.
define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 {
; GFX803-LABEL: test_kern_call:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_add_u32 s12, s12, s17
; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
; GFX803-NEXT:    s_add_u32 s0, s0, s17
; GFX803-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s13
; GFX803-NEXT:    s_addc_u32 s1, s1, 0
; GFX803-NEXT:    s_mov_b32 s12, s14
; GFX803-NEXT:    s_mov_b32 s13, s15
; GFX803-NEXT:    v_or_b32_e32 v31, v0, v2
; GFX803-NEXT:    s_mov_b32 s14, s16
; GFX803-NEXT:    s_getpc_b64 s[18:19]
; GFX803-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
; GFX803-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX803-NEXT:    s_mov_b32 s32, 0
; GFX803-NEXT:    s_swappc_b64 s[30:31], s[18:19]
; GFX803-NEXT:    s_endpgm
;
; GFX900-LABEL: test_kern_call:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
; GFX900-NEXT:    s_add_u32 s0, s0, s17
; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
; GFX900-NEXT:    s_addc_u32 s1, s1, 0
; GFX900-NEXT:    s_mov_b32 s12, s14
; GFX900-NEXT:    s_mov_b32 s13, s15
; GFX900-NEXT:    v_or3_b32 v31, v0, v1, v2
; GFX900-NEXT:    s_mov_b32 s14, s16
; GFX900-NEXT:    s_getpc_b64 s[18:19]
; GFX900-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
; GFX900-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX900-NEXT:    s_mov_b32 s32, 0
; GFX900-NEXT:    s_swappc_b64 s[30:31], s[18:19]
; GFX900-NEXT:    s_endpgm
;
; GFX1010-LABEL: test_kern_call:
; GFX1010:       ; %bb.0: ; %entry
; GFX1010-NEXT:    s_add_u32 s12, s12, s17
; GFX1010-NEXT:    s_mov_b32 s32, 0
; GFX1010-NEXT:    s_addc_u32 s13, s13, 0
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX1010-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
; GFX1010-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
; GFX1010-NEXT:    s_add_u32 s0, s0, s17
; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
; GFX1010-NEXT:    s_mov_b32 s12, s14
; GFX1010-NEXT:    s_mov_b32 s13, s15
; GFX1010-NEXT:    v_or3_b32 v31, v0, v1, v2
; GFX1010-NEXT:    s_mov_b32 s14, s16
; GFX1010-NEXT:    s_getpc_b64 s[18:19]
; GFX1010-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
; GFX1010-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX1010-NEXT:    s_swappc_b64 s[30:31], s[18:19]
; GFX1010-NEXT:    s_endpgm
entry:
  tail call void @ex() #0
  ret void
}
129
; Stack object plus a call: SP (s32) starts past the local (0x400 bytes on
; wave64 targets, 0x200 on the wave32-scaled GFX1010 encoding) and the
; volatile store completes before s_swappc_b64.
; Fix: the separator before the GFX1010 block was a blank line instead of
; the lone ';' the autogenerated format uses everywhere else in this file;
; GFX1010 directive padding is also normalized to match the other prefixes.
define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
; GFX803-LABEL: test_kern_stack_and_call:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_add_u32 s12, s12, s17
; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
; GFX803-NEXT:    s_add_u32 s0, s0, s17
; GFX803-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s13
; GFX803-NEXT:    s_addc_u32 s1, s1, 0
; GFX803-NEXT:    s_mov_b32 s12, s14
; GFX803-NEXT:    v_mov_b32_e32 v3, 0
; GFX803-NEXT:    s_mov_b32 s13, s15
; GFX803-NEXT:    v_or_b32_e32 v31, v0, v2
; GFX803-NEXT:    s_mov_b32 s14, s16
; GFX803-NEXT:    s_getpc_b64 s[18:19]
; GFX803-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
; GFX803-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX803-NEXT:    s_movk_i32 s32, 0x400
; GFX803-NEXT:    buffer_store_dword v3, off, s[0:3], 0 offset:4
; GFX803-NEXT:    s_waitcnt vmcnt(0)
; GFX803-NEXT:    s_swappc_b64 s[30:31], s[18:19]
; GFX803-NEXT:    s_endpgm
;
; GFX900-LABEL: test_kern_stack_and_call:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
; GFX900-NEXT:    s_add_u32 s0, s0, s17
; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
; GFX900-NEXT:    s_addc_u32 s1, s1, 0
; GFX900-NEXT:    s_mov_b32 s12, s14
; GFX900-NEXT:    v_mov_b32_e32 v3, 0
; GFX900-NEXT:    s_mov_b32 s13, s15
; GFX900-NEXT:    v_or3_b32 v31, v0, v1, v2
; GFX900-NEXT:    s_mov_b32 s14, s16
; GFX900-NEXT:    s_getpc_b64 s[18:19]
; GFX900-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
; GFX900-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX900-NEXT:    s_movk_i32 s32, 0x400
; GFX900-NEXT:    buffer_store_dword v3, off, s[0:3], 0 offset:4
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_swappc_b64 s[30:31], s[18:19]
; GFX900-NEXT:    s_endpgm
;
; GFX1010-LABEL: test_kern_stack_and_call:
; GFX1010:       ; %bb.0: ; %entry
; GFX1010-NEXT:    s_add_u32 s12, s12, s17
; GFX1010-NEXT:    s_movk_i32 s32, 0x200
; GFX1010-NEXT:    s_addc_u32 s13, s13, 0
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX1010-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
; GFX1010-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
; GFX1010-NEXT:    v_mov_b32_e32 v3, 0
; GFX1010-NEXT:    s_add_u32 s0, s0, s17
; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
; GFX1010-NEXT:    s_mov_b32 s12, s14
; GFX1010-NEXT:    v_or3_b32 v31, v0, v1, v2
; GFX1010-NEXT:    s_mov_b32 s13, s15
; GFX1010-NEXT:    s_mov_b32 s14, s16
; GFX1010-NEXT:    s_getpc_b64 s[18:19]
; GFX1010-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
; GFX1010-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX1010-NEXT:    buffer_store_dword v3, off, s[0:3], 0 offset:4
; GFX1010-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-NEXT:    s_swappc_b64 s[30:31], s[18:19]
; GFX1010-NEXT:    s_endpgm
entry:
  %x = alloca i32, align 4, addrspace(5)
  store volatile i32 0, i32 addrspace(5)* %x, align 4
  tail call void @ex() #0
  ret void
}
206
; Empty kernel with a forced frame pointer (#2): the only extra code over
; test_kern_empty is zero-initializing s33; no SP/flat-scratch setup.
; Fix: the separator before the GFX900 block was a blank line instead of
; the lone ';' the autogenerated format uses everywhere else in this file.
define amdgpu_kernel void @test_force_fp_kern_empty() local_unnamed_addr #2 {
; GFX803-LABEL: test_force_fp_kern_empty:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_mov_b32 s33, 0
; GFX803-NEXT:    s_endpgm
;
; GFX900-LABEL: test_force_fp_kern_empty:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_mov_b32 s33, 0
; GFX900-NEXT:    s_endpgm
;
; GFX1010-LABEL: test_force_fp_kern_empty:
; GFX1010:       ; %bb.0: ; %entry
; GFX1010-NEXT:    s_mov_b32 s33, 0
; GFX1010-NEXT:    s_endpgm
entry:
  ret void
}
225
; Stack object with a forced frame pointer (#2): s33 is zeroed and the
; volatile store is addressed off s33 instead of a literal 0 soffset.
define amdgpu_kernel void @test_force_fp_kern_stack() local_unnamed_addr #2 {
; GFX803-LABEL: test_force_fp_kern_stack:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_add_u32 s4, s4, s7
; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
; GFX803-NEXT:    s_add_u32 s0, s0, s7
; GFX803-NEXT:    s_mov_b32 s33, 0
; GFX803-NEXT:    s_addc_u32 s1, s1, 0
; GFX803-NEXT:    v_mov_b32_e32 v0, 0
; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s5
; GFX803-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4
; GFX803-NEXT:    s_waitcnt vmcnt(0)
; GFX803-NEXT:    s_endpgm
;
; GFX900-LABEL: test_force_fp_kern_stack:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
; GFX900-NEXT:    s_add_u32 s0, s0, s7
; GFX900-NEXT:    s_mov_b32 s33, 0
; GFX900-NEXT:    s_addc_u32 s1, s1, 0
; GFX900-NEXT:    v_mov_b32_e32 v0, 0
; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_endpgm
;
; GFX1010-LABEL: test_force_fp_kern_stack:
; GFX1010:       ; %bb.0: ; %entry
; GFX1010-NEXT:    s_add_u32 s4, s4, s7
; GFX1010-NEXT:    s_mov_b32 s33, 0
; GFX1010-NEXT:    s_addc_u32 s5, s5, 0
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
; GFX1010-NEXT:    v_mov_b32_e32 v0, 0
; GFX1010-NEXT:    s_add_u32 s0, s0, s7
; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
; GFX1010-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4
; GFX1010-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-NEXT:    s_endpgm
entry:
  %x = alloca i32, align 4, addrspace(5)
  store volatile i32 0, i32 addrspace(5)* %x, align 4
  ret void
}
270
; Call with a forced frame pointer (#2): s33 must be zero-initialized in
; addition to the normal SP (s32) and flat-scratch setup before the call.
; Fix: every GFX1010 line in this function read '; GFX1010-NEXT    ...'
; with no colon after '-NEXT', so FileCheck did not recognize them as
; directives and the whole GFX1010 body was silently unchecked. The
; colons are restored (FileCheck requires 'PREFIX-NEXT:' to match).
define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 {
; GFX803-LABEL: test_force_fp_kern_call:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_add_u32 s12, s12, s17
; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
; GFX803-NEXT:    s_add_u32 s0, s0, s17
; GFX803-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s13
; GFX803-NEXT:    s_addc_u32 s1, s1, 0
; GFX803-NEXT:    s_mov_b32 s12, s14
; GFX803-NEXT:    s_mov_b32 s13, s15
; GFX803-NEXT:    v_or_b32_e32 v31, v0, v2
; GFX803-NEXT:    s_mov_b32 s14, s16
; GFX803-NEXT:    s_getpc_b64 s[18:19]
; GFX803-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
; GFX803-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX803-NEXT:    s_mov_b32 s32, 0
; GFX803-NEXT:    s_mov_b32 s33, 0
; GFX803-NEXT:    s_swappc_b64 s[30:31], s[18:19]
; GFX803-NEXT:    s_endpgm
;
; GFX900-LABEL: test_force_fp_kern_call:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
; GFX900-NEXT:    s_add_u32 s0, s0, s17
; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
; GFX900-NEXT:    s_addc_u32 s1, s1, 0
; GFX900-NEXT:    s_mov_b32 s12, s14
; GFX900-NEXT:    s_mov_b32 s13, s15
; GFX900-NEXT:    v_or3_b32 v31, v0, v1, v2
; GFX900-NEXT:    s_mov_b32 s14, s16
; GFX900-NEXT:    s_getpc_b64 s[18:19]
; GFX900-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
; GFX900-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX900-NEXT:    s_mov_b32 s32, 0
; GFX900-NEXT:    s_mov_b32 s33, 0
; GFX900-NEXT:    s_swappc_b64 s[30:31], s[18:19]
; GFX900-NEXT:    s_endpgm
;
; GFX1010-LABEL: test_force_fp_kern_call:
; GFX1010:       ; %bb.0: ; %entry
; GFX1010-NEXT:    s_add_u32 s12, s12, s17
; GFX1010-NEXT:    s_mov_b32 s32, 0
; GFX1010-NEXT:    s_mov_b32 s33, 0
; GFX1010-NEXT:    s_addc_u32 s13, s13, 0
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX1010-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
; GFX1010-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
; GFX1010-NEXT:    s_add_u32 s0, s0, s17
; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
; GFX1010-NEXT:    s_mov_b32 s12, s14
; GFX1010-NEXT:    s_mov_b32 s13, s15
; GFX1010-NEXT:    v_or3_b32 v31, v0, v1, v2
; GFX1010-NEXT:    s_mov_b32 s14, s16
; GFX1010-NEXT:    s_getpc_b64 s[18:19]
; GFX1010-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
; GFX1010-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX1010-NEXT:    s_swappc_b64 s[30:31], s[18:19]
; GFX1010-NEXT:    s_endpgm
entry:
  tail call void @ex() #2
  ret void
}
339
; Stack object plus a call with a forced frame pointer (#2): s33 is zeroed,
; SP (s32) starts past the local (0x400 wave64 / 0x200 on GFX1010's
; wave32-scaled encoding), and the store is addressed off s33.
define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_addr #2 {
; GFX803-LABEL: test_force_fp_kern_stack_and_call:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_add_u32 s12, s12, s17
; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
; GFX803-NEXT:    s_add_u32 s0, s0, s17
; GFX803-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s13
; GFX803-NEXT:    s_addc_u32 s1, s1, 0
; GFX803-NEXT:    s_mov_b32 s12, s14
; GFX803-NEXT:    s_mov_b32 s33, 0
; GFX803-NEXT:    v_mov_b32_e32 v3, 0
; GFX803-NEXT:    s_mov_b32 s13, s15
; GFX803-NEXT:    v_or_b32_e32 v31, v0, v2
; GFX803-NEXT:    s_mov_b32 s14, s16
; GFX803-NEXT:    s_getpc_b64 s[18:19]
; GFX803-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
; GFX803-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX803-NEXT:    s_movk_i32 s32, 0x400
; GFX803-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:4
; GFX803-NEXT:    s_waitcnt vmcnt(0)
; GFX803-NEXT:    s_swappc_b64 s[30:31], s[18:19]
; GFX803-NEXT:    s_endpgm
;
; GFX900-LABEL: test_force_fp_kern_stack_and_call:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
; GFX900-NEXT:    s_add_u32 s0, s0, s17
; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
; GFX900-NEXT:    s_addc_u32 s1, s1, 0
; GFX900-NEXT:    s_mov_b32 s12, s14
; GFX900-NEXT:    s_mov_b32 s33, 0
; GFX900-NEXT:    v_mov_b32_e32 v3, 0
; GFX900-NEXT:    s_mov_b32 s13, s15
; GFX900-NEXT:    v_or3_b32 v31, v0, v1, v2
; GFX900-NEXT:    s_mov_b32 s14, s16
; GFX900-NEXT:    s_getpc_b64 s[18:19]
; GFX900-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
; GFX900-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX900-NEXT:    s_movk_i32 s32, 0x400
; GFX900-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:4
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_swappc_b64 s[30:31], s[18:19]
; GFX900-NEXT:    s_endpgm
;
; GFX1010-LABEL: test_force_fp_kern_stack_and_call:
; GFX1010:       ; %bb.0: ; %entry
; GFX1010-NEXT:   s_add_u32 s12, s12, s17
; GFX1010-NEXT:   s_movk_i32 s32, 0x200
; GFX1010-NEXT:   s_mov_b32 s33, 0
; GFX1010-NEXT:   s_addc_u32 s13, s13, 0
; GFX1010-NEXT:   s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
; GFX1010-NEXT:   s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX1010-NEXT:   v_lshlrev_b32_e32 v2, 20, v2
; GFX1010-NEXT:   v_lshlrev_b32_e32 v1, 10, v1
; GFX1010-NEXT:   v_mov_b32_e32 v3, 0
; GFX1010-NEXT:   s_add_u32 s0, s0, s17
; GFX1010-NEXT:   s_addc_u32 s1, s1, 0
; GFX1010-NEXT:   s_mov_b32 s12, s14
; GFX1010-NEXT:   v_or3_b32 v31, v0, v1, v2
; GFX1010-NEXT:   s_mov_b32 s13, s15
; GFX1010-NEXT:   s_mov_b32 s14, s16
; GFX1010-NEXT:   s_getpc_b64 s[18:19]
; GFX1010-NEXT:   s_add_u32 s18, s18, ex@rel32@lo+4
; GFX1010-NEXT:   s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX1010-NEXT:   buffer_store_dword v3, off, s[0:3], s33 offset:4
; GFX1010-NEXT:   s_waitcnt_vscnt null, 0x0
; GFX1010-NEXT:   s_swappc_b64 s[30:31], s[18:19]
; GFX1010-NEXT:   s_endpgm
entry:
  %x = alloca i32, align 4, addrspace(5)
  store volatile i32 0, i32 addrspace(5)* %x, align 4
  tail call void @ex() #2
  ret void
}
419
; Spill whose frame offset does not fit the 12-bit instruction offset field:
; the offset (0x40000 wave64 / 0x20000 on GFX1010's wave32-scaled encoding)
; must be materialized into an SGPR used as the buffer soffset operand.
; Attribute #1 caps VGPRs at 8 so the inline-asm clobbers force the spill.
define amdgpu_kernel void @test_sgpr_offset_kernel() #1 {
; GFX803-LABEL: test_sgpr_offset_kernel:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_add_u32 s4, s4, s7
; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
; GFX803-NEXT:    s_add_u32 s0, s0, s7
; GFX803-NEXT:    s_addc_u32 s1, s1, 0
; GFX803-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
; GFX803-NEXT:    s_waitcnt vmcnt(0)
; GFX803-NEXT:    s_mov_b32 s4, 0x40000
; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s5
; GFX803-NEXT:    buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
; GFX803-NEXT:    ;;#ASMSTART
; GFX803-NEXT:    ;;#ASMEND
; GFX803-NEXT:    s_mov_b32 s4, 0x40000
; GFX803-NEXT:    buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
; GFX803-NEXT:    s_waitcnt vmcnt(0)
; GFX803-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:8
; GFX803-NEXT:    s_waitcnt vmcnt(0)
; GFX803-NEXT:    s_endpgm
;
; GFX900-LABEL: test_sgpr_offset_kernel:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s4, s7
; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
; GFX900-NEXT:    s_add_u32 s0, s0, s7
; GFX900-NEXT:    s_addc_u32 s1, s1, 0
; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_mov_b32 s6, 0x40000
; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s6, 0x40000
; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:8
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_endpgm
;
; GFX1010-LABEL: test_sgpr_offset_kernel:
; GFX1010:       ; %bb.0: ; %entry
; GFX1010-NEXT:    s_add_u32 s4, s4, s7
; GFX1010-NEXT:    s_addc_u32 s5, s5, 0
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
; GFX1010-NEXT:    s_add_u32 s0, s0, s7
; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
; GFX1010-NEXT:    s_mov_b32 s6, 0x20000
; GFX1010-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8 glc dlc
; GFX1010-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-NEXT:    buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill
; GFX1010-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1010-NEXT:    s_mov_b32 s6, 0x20000
; GFX1010-NEXT:    ;;#ASMSTART
; GFX1010-NEXT:    ;;#ASMEND
; GFX1010-NEXT:    buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload
; GFX1010-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:8
; GFX1010-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-NEXT:    s_endpgm
entry:
  ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not
  ; fit in the instruction, and has to live in the SGPR offset.
  %alloca = alloca i8, i32 4092, align 4, addrspace(5)
  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*

  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  ; 0x40000 / 64 = 4096 (for wave64)
  ; NOTE(review): the two "CHECK:" lines below use a prefix no RUN line in
  ; this file enables (prefixes are GFX803/GFX900/GFX1010), so FileCheck
  ; never evaluates them — presumably leftovers from before the assertions
  ; were autogenerated; confirm and consider removing.
  ; CHECK: s_add_u32 s6, s7, 0x40000
  ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 ; 4-byte Folded Spill
  %a = load volatile i32, i32 addrspace(5)* %aptr

  ; Force %a to spill
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  store volatile i32 %a, i32 addrspace(5)* %outptr

  ret void
}
501
; External callee for the *_call tests; 'hidden' visibility lets the calls
; lower to the direct s_getpc_b64 + rel32 sequence checked above.
declare hidden void @ex() local_unnamed_addr #0

attributes #0 = { nounwind }
; #1: cap VGPR usage at 8 so the inline-asm clobber list in
; test_sgpr_offset_kernel forces %a to spill to scratch.
attributes #1 = { nounwind "amdgpu-num-vgpr"="8" }
; #2: force a frame pointer; the test_force_fp_* kernels check the
; resulting s33 setup.
attributes #2 = { nounwind "frame-pointer"="all" }
507