; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck --check-prefix=GFX803 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX900 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX1010 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefix=GFX1100 %s

; Empty kernel: no stack objects and no calls, so every checked target is
; expected to emit nothing but s_endpgm.
define amdgpu_kernel void @test_kern_empty() local_unnamed_addr #0 {
; GFX803-LABEL: test_kern_empty:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_endpgm
;
; GFX900-LABEL: test_kern_empty:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_endpgm
;
; GFX1010-LABEL: test_kern_empty:
; GFX1010:       ; %bb.0: ; %entry
; GFX1010-NEXT:    s_endpgm
;
; GFX1100-LABEL: test_kern_empty:
; GFX1100:       ; %bb.0: ; %entry
; GFX1100-NEXT:    s_endpgm
entry:
  ret void
}

; Kernel with a single volatile store to a private (addrspace(5)) alloca and
; no calls. gfx803/gfx900/gfx1010 add s7 into s[0:1] and store through the
; s[0:3] buffer descriptor; gfx1100 stores with scratch_store_b32 and needs no
; such setup.
define amdgpu_kernel void @test_kern_stack() local_unnamed_addr #0 {
; GFX803-LABEL: test_kern_stack:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_add_u32 s0, s0, s7
; GFX803-NEXT:    s_addc_u32 s1, s1, 0
; GFX803-NEXT:    v_mov_b32_e32 v0, 0
; GFX803-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
; GFX803-NEXT:    s_waitcnt vmcnt(0)
; GFX803-NEXT:    s_endpgm
;
; GFX900-LABEL: test_kern_stack:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_add_u32 s0, s0, s7
; GFX900-NEXT:    s_addc_u32 s1, s1, 0
; GFX900-NEXT:    v_mov_b32_e32 v0, 0
; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_endpgm
;
; GFX1010-LABEL: test_kern_stack:
; GFX1010:       ; %bb.0: ; %entry
; GFX1010-NEXT:    v_mov_b32_e32 v0, 0
; GFX1010-NEXT:    s_add_u32 s0, s0, s7
; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
; GFX1010-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
; GFX1010-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-NEXT:    s_endpgm
;
; GFX1100-LABEL: test_kern_stack:
; GFX1100:       ; %bb.0: ; %entry
; GFX1100-NEXT:    v_mov_b32_e32 v0, 0
; GFX1100-NEXT:    scratch_store_b32 off, v0, off offset:4 dlc
; GFX1100-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1100-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-NEXT:    s_endpgm
entry:
  %x = alloca i32, align 4, addrspace(5)
  store volatile i32 0, i32 addrspace(5)* %x, align 4
  ret void
}

; Kernel whose only work is a call to the external function @ex. Every target
; sets s32 to 0 and builds v31 from the incoming VGPRs before the
; s_swappc_b64 to @ex.
define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 {
; GFX803-LABEL: test_kern_call:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_add_i32 s12, s12, s17
; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
; GFX803-NEXT:    s_add_u32 s0, s0, s17
; GFX803-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s13
; GFX803-NEXT:    s_addc_u32 s1, s1, 0
; GFX803-NEXT:    s_mov_b32 s13, s15
; GFX803-NEXT:    s_mov_b32 s12, s14
; GFX803-NEXT:    v_or_b32_e32 v31, v0, v2
; GFX803-NEXT:    s_mov_b32 s14, s16
; GFX803-NEXT:    s_mov_b32 s32, 0
; GFX803-NEXT:    s_getpc_b64 s[18:19]
; GFX803-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
; GFX803-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX803-NEXT:    s_swappc_b64 s[30:31], s[18:19]
; GFX803-NEXT:    s_endpgm
;
; GFX900-LABEL: test_kern_call:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
; GFX900-NEXT:    s_add_u32 s0, s0, s17
; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
; GFX900-NEXT:    s_addc_u32 s1, s1, 0
; GFX900-NEXT:    s_mov_b32 s13, s15
; GFX900-NEXT:    s_mov_b32 s12, s14
; GFX900-NEXT:    v_or3_b32 v31, v0, v1, v2
; GFX900-NEXT:    s_mov_b32 s14, s16
; GFX900-NEXT:    s_mov_b32 s32, 0
; GFX900-NEXT:    s_getpc_b64 s[18:19]
; GFX900-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
; GFX900-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX900-NEXT:    s_swappc_b64 s[30:31], s[18:19]
; GFX900-NEXT:    s_endpgm
;
; GFX1010-LABEL: test_kern_call:
; GFX1010:       ; %bb.0: ; %entry
; GFX1010-NEXT:    s_add_u32 s12, s12, s17
; GFX1010-NEXT:    s_mov_b32 s32, 0
; GFX1010-NEXT:    s_addc_u32 s13, s13, 0
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX1010-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
; GFX1010-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
; GFX1010-NEXT:    s_add_u32 s0, s0, s17
; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
; GFX1010-NEXT:    s_mov_b32 s13, s15
; GFX1010-NEXT:    s_mov_b32 s12, s14
; GFX1010-NEXT:    v_or3_b32 v31, v0, v1, v2
; GFX1010-NEXT:    s_mov_b32 s14, s16
; GFX1010-NEXT:    s_getpc_b64 s[18:19]
; GFX1010-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
; GFX1010-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX1010-NEXT:    s_swappc_b64 s[30:31], s[18:19]
; GFX1010-NEXT:    s_endpgm
;
; GFX1100-LABEL: test_kern_call:
; GFX1100:       ; %bb.0: ; %entry
; GFX1100-NEXT:    v_mov_b32_e32 v31, v0
; GFX1100-NEXT:    s_mov_b32 s12, s13
; GFX1100-NEXT:    s_mov_b64 s[10:11], s[6:7]
; GFX1100-NEXT:    s_mov_b64 s[8:9], s[4:5]
; GFX1100-NEXT:    s_mov_b64 s[4:5], s[0:1]
; GFX1100-NEXT:    s_mov_b64 s[6:7], s[2:3]
; GFX1100-NEXT:    s_mov_b32 s13, s14
; GFX1100-NEXT:    s_mov_b32 s14, s15
; GFX1100-NEXT:    s_mov_b32 s32, 0
; GFX1100-NEXT:    s_getpc_b64 s[16:17]
; GFX1100-NEXT:    s_add_u32 s16, s16, ex@rel32@lo+4
; GFX1100-NEXT:    s_addc_u32 s17, s17, ex@rel32@hi+12
; GFX1100-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; GFX1100-NEXT:    s_endpgm

entry:
  tail call void @ex() #0
  ret void
}

; Combines the volatile private store with the call to @ex. Unlike the
; call-only kernel, s32 is set to a non-zero value before the call (0x400 on
; gfx803/gfx900, 0x200 on gfx1010, 16 on gfx1100); s32 is presumably the
; stack pointer handed to the callee.
define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 {
; GFX803-LABEL: test_kern_stack_and_call:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_add_i32 s12, s12, s17
; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
; GFX803-NEXT:    s_add_u32 s0, s0, s17
; GFX803-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s13
; GFX803-NEXT:    s_addc_u32 s1, s1, 0
; GFX803-NEXT:    s_mov_b32 s13, s15
; GFX803-NEXT:    s_mov_b32 s12, s14
; GFX803-NEXT:    v_mov_b32_e32 v3, 0
; GFX803-NEXT:    v_or_b32_e32 v31, v0, v2
; GFX803-NEXT:    s_mov_b32 s14, s16
; GFX803-NEXT:    s_movk_i32 s32, 0x400
; GFX803-NEXT:    buffer_store_dword v3, off, s[0:3], 0 offset:4
; GFX803-NEXT:    s_waitcnt vmcnt(0)
; GFX803-NEXT:    s_getpc_b64 s[18:19]
; GFX803-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
; GFX803-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX803-NEXT:    s_swappc_b64 s[30:31], s[18:19]
; GFX803-NEXT:    s_endpgm
;
; GFX900-LABEL: test_kern_stack_and_call:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
; GFX900-NEXT:    s_add_u32 s0, s0, s17
; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
; GFX900-NEXT:    s_addc_u32 s1, s1, 0
; GFX900-NEXT:    s_mov_b32 s13, s15
; GFX900-NEXT:    s_mov_b32 s12, s14
; GFX900-NEXT:    v_mov_b32_e32 v3, 0
; GFX900-NEXT:    v_or3_b32 v31, v0, v1, v2
; GFX900-NEXT:    s_mov_b32 s14, s16
; GFX900-NEXT:    s_movk_i32 s32, 0x400
; GFX900-NEXT:    buffer_store_dword v3, off, s[0:3], 0 offset:4
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_getpc_b64 s[18:19]
; GFX900-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
; GFX900-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX900-NEXT:    s_swappc_b64 s[30:31], s[18:19]
; GFX900-NEXT:    s_endpgm
;
; GFX1010-LABEL: test_kern_stack_and_call:
; GFX1010:       ; %bb.0: ; %entry
; GFX1010-NEXT:    s_add_u32 s12, s12, s17
; GFX1010-NEXT:    s_movk_i32 s32, 0x200
; GFX1010-NEXT:    s_addc_u32 s13, s13, 0
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX1010-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
; GFX1010-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
; GFX1010-NEXT:    v_mov_b32_e32 v3, 0
; GFX1010-NEXT:    s_add_u32 s0, s0, s17
; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
; GFX1010-NEXT:    s_mov_b32 s13, s15
; GFX1010-NEXT:    v_or3_b32 v31, v0, v1, v2
; GFX1010-NEXT:    s_mov_b32 s12, s14
; GFX1010-NEXT:    s_mov_b32 s14, s16
; GFX1010-NEXT:    buffer_store_dword v3, off, s[0:3], 0 offset:4
; GFX1010-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-NEXT:    s_getpc_b64 s[18:19]
; GFX1010-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
; GFX1010-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX1010-NEXT:    s_swappc_b64 s[30:31], s[18:19]
; GFX1010-NEXT:    s_endpgm
;
; GFX1100-LABEL: test_kern_stack_and_call:
; GFX1100:       ; %bb.0: ; %entry
; GFX1100-NEXT:    v_mov_b32_e32 v1, 0
; GFX1100-NEXT:    v_mov_b32_e32 v31, v0
; GFX1100-NEXT:    s_mov_b32 s12, s13
; GFX1100-NEXT:    s_mov_b64 s[10:11], s[6:7]
; GFX1100-NEXT:    s_mov_b64 s[8:9], s[4:5]
; GFX1100-NEXT:    s_mov_b64 s[4:5], s[0:1]
; GFX1100-NEXT:    s_mov_b64 s[6:7], s[2:3]
; GFX1100-NEXT:    s_mov_b32 s13, s14
; GFX1100-NEXT:    s_mov_b32 s14, s15
; GFX1100-NEXT:    s_mov_b32 s32, 16
; GFX1100-NEXT:    scratch_store_b32 off, v1, off offset:4 dlc
; GFX1100-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1100-NEXT:    s_getpc_b64 s[16:17]
; GFX1100-NEXT:    s_add_u32 s16, s16, ex@rel32@lo+4
; GFX1100-NEXT:    s_addc_u32 s17, s17, ex@rel32@hi+12
; GFX1100-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; GFX1100-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-NEXT:    s_endpgm

entry:
  %x = alloca i32, align 4, addrspace(5)
  store volatile i32 0, i32 addrspace(5)* %x, align 4
  tail call void @ex() #0
  ret void
}

; Same body as test_kern_empty, but built with attribute group #2
; ("frame-pointer"="all"). The only expected difference is the added
; s_mov_b32 s33, 0 ahead of s_endpgm.
define amdgpu_kernel void @test_force_fp_kern_empty() local_unnamed_addr #2 {
; GFX803-LABEL: test_force_fp_kern_empty:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_mov_b32 s33, 0
; GFX803-NEXT:    s_endpgm
;
; GFX900-LABEL: test_force_fp_kern_empty:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_mov_b32 s33, 0
; GFX900-NEXT:    s_endpgm
;
; GFX1010-LABEL: test_force_fp_kern_empty:
; GFX1010:       ; %bb.0: ; %entry
; GFX1010-NEXT:    s_mov_b32 s33, 0
; GFX1010-NEXT:    s_endpgm
;
; GFX1100-LABEL: test_force_fp_kern_empty:
; GFX1100:       ; %bb.0: ; %entry
; GFX1100-NEXT:    s_mov_b32 s33, 0
; GFX1100-NEXT:    s_endpgm

entry:
  ret void
}

; Same body as test_kern_stack, but built with attribute group #2
; ("frame-pointer"="all"). s33 is zeroed and then used as the store's offset
; register in place of the literal 0 / off operand of the non-FP variant.
define amdgpu_kernel void @test_force_fp_kern_stack() local_unnamed_addr #2 {
; GFX803-LABEL: test_force_fp_kern_stack:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_add_u32 s0, s0, s7
; GFX803-NEXT:    s_mov_b32 s33, 0
; GFX803-NEXT:    s_addc_u32 s1, s1, 0
; GFX803-NEXT:    v_mov_b32_e32 v0, 0
; GFX803-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4
; GFX803-NEXT:    s_waitcnt vmcnt(0)
; GFX803-NEXT:    s_endpgm
;
; GFX900-LABEL: test_force_fp_kern_stack:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_add_u32 s0, s0, s7
; GFX900-NEXT:    s_mov_b32 s33, 0
; GFX900-NEXT:    s_addc_u32 s1, s1, 0
; GFX900-NEXT:    v_mov_b32_e32 v0, 0
; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_endpgm
;
; GFX1010-LABEL: test_force_fp_kern_stack:
; GFX1010:       ; %bb.0: ; %entry
; GFX1010-NEXT:    v_mov_b32_e32 v0, 0
; GFX1010-NEXT:    s_add_u32 s0, s0, s7
; GFX1010-NEXT:    s_mov_b32 s33, 0
; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
; GFX1010-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4
; GFX1010-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-NEXT:    s_endpgm
;
; GFX1100-LABEL: test_force_fp_kern_stack:
; GFX1100:       ; %bb.0: ; %entry
; GFX1100-NEXT:    v_mov_b32_e32 v0, 0
; GFX1100-NEXT:    s_mov_b32 s33, 0
; GFX1100-NEXT:    scratch_store_b32 off, v0, s33 offset:4 dlc
; GFX1100-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1100-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-NEXT:    s_endpgm
entry:
  %x = alloca i32, align 4, addrspace(5)
  store volatile i32 0, i32 addrspace(5)* %x, align 4
  ret void
}

; Same body as test_kern_call, but built with attribute group #2
; ("frame-pointer"="all"). In addition to s32, every target also zeroes s33
; before the s_swappc_b64 to @ex.
define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 {
; GFX803-LABEL: test_force_fp_kern_call:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_add_i32 s12, s12, s17
; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
; GFX803-NEXT:    s_add_u32 s0, s0, s17
; GFX803-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s13
; GFX803-NEXT:    s_addc_u32 s1, s1, 0
; GFX803-NEXT:    s_mov_b32 s13, s15
; GFX803-NEXT:    s_mov_b32 s12, s14
; GFX803-NEXT:    v_or_b32_e32 v31, v0, v2
; GFX803-NEXT:    s_mov_b32 s14, s16
; GFX803-NEXT:    s_mov_b32 s32, 0
; GFX803-NEXT:    s_mov_b32 s33, 0
; GFX803-NEXT:    s_getpc_b64 s[18:19]
; GFX803-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
; GFX803-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX803-NEXT:    s_swappc_b64 s[30:31], s[18:19]
; GFX803-NEXT:    s_endpgm
;
; GFX900-LABEL: test_force_fp_kern_call:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
; GFX900-NEXT:    s_add_u32 s0, s0, s17
; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
; GFX900-NEXT:    s_addc_u32 s1, s1, 0
; GFX900-NEXT:    s_mov_b32 s13, s15
; GFX900-NEXT:    s_mov_b32 s12, s14
; GFX900-NEXT:    v_or3_b32 v31, v0, v1, v2
; GFX900-NEXT:    s_mov_b32 s14, s16
; GFX900-NEXT:    s_mov_b32 s32, 0
; GFX900-NEXT:    s_mov_b32 s33, 0
; GFX900-NEXT:    s_getpc_b64 s[18:19]
; GFX900-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
; GFX900-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX900-NEXT:    s_swappc_b64 s[30:31], s[18:19]
; GFX900-NEXT:    s_endpgm
;
; GFX1010-LABEL: test_force_fp_kern_call:
; GFX1010:       ; %bb.0: ; %entry
; GFX1010-NEXT:    s_add_u32 s12, s12, s17
; GFX1010-NEXT:    s_mov_b32 s32, 0
; GFX1010-NEXT:    s_mov_b32 s33, 0
; GFX1010-NEXT:    s_addc_u32 s13, s13, 0
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX1010-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
; GFX1010-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
; GFX1010-NEXT:    s_add_u32 s0, s0, s17
; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
; GFX1010-NEXT:    s_mov_b32 s13, s15
; GFX1010-NEXT:    s_mov_b32 s12, s14
; GFX1010-NEXT:    v_or3_b32 v31, v0, v1, v2
; GFX1010-NEXT:    s_mov_b32 s14, s16
; GFX1010-NEXT:    s_getpc_b64 s[18:19]
; GFX1010-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
; GFX1010-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX1010-NEXT:    s_swappc_b64 s[30:31], s[18:19]
; GFX1010-NEXT:    s_endpgm
;
; GFX1100-LABEL: test_force_fp_kern_call:
; GFX1100:       ; %bb.0: ; %entry
; GFX1100-NEXT:    v_mov_b32_e32 v31, v0
; GFX1100-NEXT:    s_mov_b32 s12, s13
; GFX1100-NEXT:    s_mov_b64 s[10:11], s[6:7]
; GFX1100-NEXT:    s_mov_b64 s[8:9], s[4:5]
; GFX1100-NEXT:    s_mov_b64 s[4:5], s[0:1]
; GFX1100-NEXT:    s_mov_b64 s[6:7], s[2:3]
; GFX1100-NEXT:    s_mov_b32 s13, s14
; GFX1100-NEXT:    s_mov_b32 s14, s15
; GFX1100-NEXT:    s_mov_b32 s32, 0
; GFX1100-NEXT:    s_mov_b32 s33, 0
; GFX1100-NEXT:    s_getpc_b64 s[16:17]
; GFX1100-NEXT:    s_add_u32 s16, s16, ex@rel32@lo+4
; GFX1100-NEXT:    s_addc_u32 s17, s17, ex@rel32@hi+12
; GFX1100-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; GFX1100-NEXT:    s_endpgm
entry:
  tail call void @ex() #2
  ret void
}

; Same body as test_kern_stack_and_call, but built with attribute group #2
; ("frame-pointer"="all"). s33 is zeroed and used as the store's offset
; register, while s32 receives the same non-zero values as in the non-FP
; variant (0x400, 0x200 or 16 depending on target).
define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_addr #2 {
; GFX803-LABEL: test_force_fp_kern_stack_and_call:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_add_i32 s12, s12, s17
; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
; GFX803-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
; GFX803-NEXT:    s_add_u32 s0, s0, s17
; GFX803-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
; GFX803-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX803-NEXT:    s_mov_b32 s33, 0
; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s13
; GFX803-NEXT:    s_addc_u32 s1, s1, 0
; GFX803-NEXT:    s_mov_b32 s13, s15
; GFX803-NEXT:    s_mov_b32 s12, s14
; GFX803-NEXT:    v_mov_b32_e32 v3, 0
; GFX803-NEXT:    v_or_b32_e32 v31, v0, v2
; GFX803-NEXT:    s_mov_b32 s14, s16
; GFX803-NEXT:    s_movk_i32 s32, 0x400
; GFX803-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:4
; GFX803-NEXT:    s_waitcnt vmcnt(0)
; GFX803-NEXT:    s_getpc_b64 s[18:19]
; GFX803-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
; GFX803-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX803-NEXT:    s_swappc_b64 s[30:31], s[18:19]
; GFX803-NEXT:    s_endpgm
;
; GFX900-LABEL: test_force_fp_kern_stack_and_call:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
; GFX900-NEXT:    s_add_u32 s0, s0, s17
; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
; GFX900-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
; GFX900-NEXT:    s_mov_b32 s33, 0
; GFX900-NEXT:    s_addc_u32 s1, s1, 0
; GFX900-NEXT:    s_mov_b32 s13, s15
; GFX900-NEXT:    s_mov_b32 s12, s14
; GFX900-NEXT:    v_mov_b32_e32 v3, 0
; GFX900-NEXT:    v_or3_b32 v31, v0, v1, v2
; GFX900-NEXT:    s_mov_b32 s14, s16
; GFX900-NEXT:    s_movk_i32 s32, 0x400
; GFX900-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:4
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_getpc_b64 s[18:19]
; GFX900-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
; GFX900-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX900-NEXT:    s_swappc_b64 s[30:31], s[18:19]
; GFX900-NEXT:    s_endpgm
;
; GFX1010-LABEL: test_force_fp_kern_stack_and_call:
; GFX1010:       ; %bb.0: ; %entry
; GFX1010-NEXT:    s_add_u32 s12, s12, s17
; GFX1010-NEXT:    s_movk_i32 s32, 0x200
; GFX1010-NEXT:    s_mov_b32 s33, 0
; GFX1010-NEXT:    s_addc_u32 s13, s13, 0
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
; GFX1010-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX1010-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
; GFX1010-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
; GFX1010-NEXT:    v_mov_b32_e32 v3, 0
; GFX1010-NEXT:    s_add_u32 s0, s0, s17
; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
; GFX1010-NEXT:    s_mov_b32 s13, s15
; GFX1010-NEXT:    v_or3_b32 v31, v0, v1, v2
; GFX1010-NEXT:    s_mov_b32 s12, s14
; GFX1010-NEXT:    s_mov_b32 s14, s16
; GFX1010-NEXT:    buffer_store_dword v3, off, s[0:3], s33 offset:4
; GFX1010-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-NEXT:    s_getpc_b64 s[18:19]
; GFX1010-NEXT:    s_add_u32 s18, s18, ex@rel32@lo+4
; GFX1010-NEXT:    s_addc_u32 s19, s19, ex@rel32@hi+12
; GFX1010-NEXT:    s_swappc_b64 s[30:31], s[18:19]
; GFX1010-NEXT:    s_endpgm
;
; GFX1100-LABEL: test_force_fp_kern_stack_and_call:
; GFX1100:       ; %bb.0: ; %entry
; GFX1100-NEXT:    v_mov_b32_e32 v1, 0
; GFX1100-NEXT:    v_mov_b32_e32 v31, v0
; GFX1100-NEXT:    s_mov_b32 s33, 0
; GFX1100-NEXT:    s_mov_b32 s12, s13
; GFX1100-NEXT:    s_mov_b64 s[10:11], s[6:7]
; GFX1100-NEXT:    s_mov_b64 s[8:9], s[4:5]
; GFX1100-NEXT:    s_mov_b64 s[4:5], s[0:1]
; GFX1100-NEXT:    s_mov_b64 s[6:7], s[2:3]
; GFX1100-NEXT:    s_mov_b32 s13, s14
; GFX1100-NEXT:    s_mov_b32 s14, s15
; GFX1100-NEXT:    s_mov_b32 s32, 16
; GFX1100-NEXT:    scratch_store_b32 off, v1, s33 offset:4 dlc
; GFX1100-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1100-NEXT:    s_getpc_b64 s[16:17]
; GFX1100-NEXT:    s_add_u32 s16, s16, ex@rel32@lo+4
; GFX1100-NEXT:    s_addc_u32 s17, s17, ex@rel32@hi+12
; GFX1100-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; GFX1100-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-NEXT:    s_endpgm
entry:
  %x = alloca i32, align 4, addrspace(5)
  store volatile i32 0, i32 addrspace(5)* %x, align 4
  tail call void @ex() #2
  ret void
}

; Kernel built with attribute group #1 ("amdgpu-num-vgpr"="8") whose inline asm
; clobbers v0-v7, so the loaded value %a must be spilled across it. The spill
; slot sits past a large alloca, so its offset is materialized in an SGPR
; rather than encoded in the spill/reload instruction.
define amdgpu_kernel void @test_sgpr_offset_kernel() #1 {
; GFX803-LABEL: test_sgpr_offset_kernel:
; GFX803:       ; %bb.0: ; %entry
; GFX803-NEXT:    s_add_u32 s0, s0, s7
; GFX803-NEXT:    s_addc_u32 s1, s1, 0
; GFX803-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
; GFX803-NEXT:    s_waitcnt vmcnt(0)
; GFX803-NEXT:    s_mov_b32 s4, 0x40000
; GFX803-NEXT:    buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
; GFX803-NEXT:    ;;#ASMSTART
; GFX803-NEXT:    ;;#ASMEND
; GFX803-NEXT:    s_mov_b32 s4, 0x40000
; GFX803-NEXT:    buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
; GFX803-NEXT:    s_waitcnt vmcnt(0)
; GFX803-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:8
; GFX803-NEXT:    s_waitcnt vmcnt(0)
; GFX803-NEXT:    s_endpgm
;
; GFX900-LABEL: test_sgpr_offset_kernel:
; GFX900:       ; %bb.0: ; %entry
; GFX900-NEXT:    s_add_u32 s0, s0, s7
; GFX900-NEXT:    s_addc_u32 s1, s1, 0
; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8 glc
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_mov_b32 s4, 0x40000
; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
; GFX900-NEXT:    ;;#ASMSTART
; GFX900-NEXT:    ;;#ASMEND
; GFX900-NEXT:    s_mov_b32 s4, 0x40000
; GFX900-NEXT:    buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:8
; GFX900-NEXT:    s_waitcnt vmcnt(0)
; GFX900-NEXT:    s_endpgm
;
; GFX1010-LABEL: test_sgpr_offset_kernel:
; GFX1010:       ; %bb.0: ; %entry
; GFX1010-NEXT:    s_add_u32 s0, s0, s7
; GFX1010-NEXT:    s_addc_u32 s1, s1, 0
; GFX1010-NEXT:    s_mov_b32 s4, 0x20000
; GFX1010-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:8 glc dlc
; GFX1010-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-NEXT:    buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill
; GFX1010-NEXT:    s_waitcnt_depctr 0xffe3
; GFX1010-NEXT:    s_mov_b32 s4, 0x20000
; GFX1010-NEXT:    ;;#ASMSTART
; GFX1010-NEXT:    ;;#ASMEND
; GFX1010-NEXT:    buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload
; GFX1010-NEXT:    s_waitcnt vmcnt(0)
; GFX1010-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:8
; GFX1010-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1010-NEXT:    s_endpgm
;
; GFX1100-LABEL: test_sgpr_offset_kernel:
; GFX1100:       ; %bb.0: ; %entry
; GFX1100-NEXT:    scratch_load_b32 v0, off, off offset:8 glc dlc
; GFX1100-NEXT:    s_waitcnt vmcnt(0)
; GFX1100-NEXT:    s_movk_i32 s0, 0x1000
; GFX1100-NEXT:    scratch_store_b32 off, v0, s0 ; 4-byte Folded Spill
; GFX1100-NEXT:    s_movk_i32 s0, 0x1000
; GFX1100-NEXT:    ;;#ASMSTART
; GFX1100-NEXT:    ;;#ASMEND
; GFX1100-NEXT:    scratch_load_b32 v0, off, s0 ; 4-byte Folded Reload
; GFX1100-NEXT:    s_waitcnt vmcnt(0)
; GFX1100-NEXT:    scratch_store_b32 off, v0, off offset:8 dlc
; GFX1100-NEXT:    s_waitcnt_vscnt null, 0x0
; GFX1100-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1100-NEXT:    s_endpgm
entry:
  ; Occupy scratch up to byte 4096 (the 4092-byte alloca starts at offset 4,
  ; as shown by element 1 being accessed at offset:8), so the offset of the
  ; spill of %a does not fit in the instruction, and has to live in the SGPR
  ; offset.
  %alloca = alloca i8, i32 4092, align 4, addrspace(5)
  %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)*

  %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  ; The spill slot is at byte 4096. The SGPR values in the assertions above
  ; all encode that offset: 0x40000 / 64 = 4096 (for wave64),
  ; 0x20000 / 32 = 4096 (presumably wave32 on gfx1010), and the unscaled
  ; 0x1000 = 4096 used by the scratch_* instructions on gfx1100.
  %a = load volatile i32, i32 addrspace(5)* %aptr

  ; Force %a to spill
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" ()

  %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1
  store volatile i32 %a, i32 addrspace(5)* %outptr

  ret void
}

; External callee targeted by the *_call kernels in this file.
declare hidden void @ex() local_unnamed_addr #0

; #0: baseline attributes shared by the plain kernels and @ex.
; #1: additionally requests "amdgpu-num-vgpr"="8" (used by the spill test).
; #2: additionally requests "frame-pointer"="all" (used by the force_fp tests).
attributes #0 = { nounwind }
attributes #1 = { nounwind "amdgpu-num-vgpr"="8" }
attributes #2 = { nounwind "frame-pointer"="all" }