; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,MUBUF %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,MUBUF %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-flat-scratch < %s | FileCheck -enable-var-scope -check-prefixes=GCN,FLATSCR %s
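
; Note: GCN checks are common to all runs; MUBUF covers buffer-based scratch
; addressing, FLATSCR covers flat scratch (-amdgpu-enable-flat-scratch).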

; GCN-LABEL: {{^}}callee_no_stack:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @callee_no_stack() #0 {
  ret void
}

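; With "frame-pointer"="all" (attribute #1), an FP is set up even with no
; stack objects; the old FP in s33 is copied to a free SGPR and restored.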
; GCN-LABEL: {{^}}callee_no_stack_no_fp_elim_all:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt
; MUBUF-NEXT:   s_mov_b32 [[FP_COPY:s4]], s33
; FLATSCR-NEXT: s_mov_b32 [[FP_COPY:s0]], s33
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]]
; GCN-NEXT: s_setpc_b64
define void @callee_no_stack_no_fp_elim_all() #1 {
  ret void
}

; GCN-LABEL: {{^}}callee_no_stack_no_fp_elim_nonleaf:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @callee_no_stack_no_fp_elim_nonleaf() #2 {
  ret void
}

; GCN-LABEL: {{^}}callee_with_stack:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt
; GCN-NEXT: v_mov_b32_e32 v0, 0{{$}}
; MUBUF-NEXT:   buffer_store_dword v0, off, s[0:3], s32{{$}}
; FLATSCR-NEXT: scratch_store_dword off, v0, s32
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @callee_with_stack() #0 {
  %alloca = alloca i32, addrspace(5)
  store volatile i32 0, i32 addrspace(5)* %alloca
  ret void
}

; Can use free call clobbered register to preserve original FP value.

; GCN-LABEL: {{^}}callee_with_stack_no_fp_elim_all:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt
; MUBUF-NEXT:   s_mov_b32 [[FP_COPY:s4]], s33
; FLATSCR-NEXT: s_mov_b32 [[FP_COPY:s0]], s33
; GCN-NEXT: s_mov_b32 s33, s32
; MUBUF-NEXT:   s_addk_i32 s32, 0x200
; FLATSCR-NEXT: s_add_i32 s32, s32, 8
; GCN-NEXT: v_mov_b32_e32 v0, 0{{$}}
; MUBUF-NEXT:   buffer_store_dword v0, off, s[0:3], s33{{$}}
; FLATSCR-NEXT: scratch_store_dword off, v0, s33{{$}}
; GCN-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT:   s_addk_i32 s32, 0xfe00
; FLATSCR-NEXT: s_add_i32 s32, s32, -8
; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]]
; GCN-NEXT: s_setpc_b64
define void @callee_with_stack_no_fp_elim_all() #1 {
  %alloca = alloca i32, addrspace(5)
  store volatile i32 0, i32 addrspace(5)* %alloca
  ret void
}

; GCN-LABEL: {{^}}callee_with_stack_no_fp_elim_non_leaf:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt
; GCN-NEXT: v_mov_b32_e32 v0, 0{{$}}
; MUBUF-NEXT:   buffer_store_dword v0, off, s[0:3], s32{{$}}
; FLATSCR-NEXT: scratch_store_dword off, v0, s32{{$}}
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @callee_with_stack_no_fp_elim_non_leaf() #2 {
  %alloca = alloca i32, addrspace(5)
  store volatile i32 0, i32 addrspace(5)* %alloca
  ret void
}

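; The call clobbers s30:s31 (the return address), so they are spilled to
; lanes of a CSR VGPR along with the old FP; that VGPR is itself saved and
; reloaded around the function with all lanes enabled via s_or_saveexec.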
; GCN-LABEL: {{^}}callee_with_stack_and_call:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt
; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; MUBUF-NEXT:   buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
; GCN: v_writelane_b32 [[CSR_VGPR]], s33, 2
; GCN-DAG: s_mov_b32 s33, s32
; MUBUF-DAG:   s_addk_i32 s32, 0x400{{$}}
; FLATSCR-DAG: s_add_i32 s32, s32, 16{{$}}
; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s30,
; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31,

; MUBUF-DAG:   buffer_store_dword [[ZERO]], off, s[0:3], s33{{$}}
; FLATSCR-DAG: scratch_store_dword off, [[ZERO]], s33{{$}}

; GCN: s_swappc_b64

; MUBUF-DAG: v_readlane_b32 s5, [[CSR_VGPR]]
; MUBUF-DAG: v_readlane_b32 s4, [[CSR_VGPR]]
; FLATSCR-DAG: v_readlane_b32 s0, [[CSR_VGPR]]
; FLATSCR-DAG: v_readlane_b32 s1, [[CSR_VGPR]]

; MUBUF:    s_addk_i32 s32, 0xfc00{{$}}
; FLATSCR:  s_add_i32 s32, s32, -16{{$}}
; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR]], 2
; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; MUBUF-NEXT:   buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
; GCN-NEXT: s_waitcnt vmcnt(0)

; GCN-NEXT: s_setpc_b64
define void @callee_with_stack_and_call() #0 {
  %alloca = alloca i32, addrspace(5)
  store volatile i32 0, i32 addrspace(5)* %alloca
  call void @external_void_func_void()
  ret void
}

; Should be able to copy incoming stack pointer directly to inner
; call's stack pointer argument.

; There is stack usage only because of the need to evict a VGPR for
; spilling CSR SGPRs.

; GCN-LABEL: {{^}}callee_no_stack_with_call:
; GCN: s_waitcnt
; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; MUBUF-NEXT:   buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 ; 4-byte Folded Spill
; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
; MUBUF-DAG:   s_addk_i32 s32, 0x400
; FLATSCR-DAG: s_add_i32 s32, s32, 16
; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s33, [[FP_SPILL_LANE:[0-9]+]]

; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s30, 0
; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31, 1
; GCN: s_swappc_b64

; MUBUF-DAG: v_readlane_b32 s4, v40, 0
; MUBUF-DAG: v_readlane_b32 s5, v40, 1
; FLATSCR-DAG: v_readlane_b32 s0, v40, 0
; FLATSCR-DAG: v_readlane_b32 s1, v40, 1

; MUBUF:   s_addk_i32 s32, 0xfc00
; FLATSCR: s_add_i32 s32, s32, -16
; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR]], [[FP_SPILL_LANE]]
; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; MUBUF-NEXT:   buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 ; 4-byte Folded Reload
; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define void @callee_no_stack_with_call() #0 {
  call void @external_void_func_void()
  ret void
}

declare hidden void @external_void_func_void() #0

; Make sure that if a CSR VGPR is used for SGPR spilling, it is saved and
; restored. No FP is required.
;
; GCN-LABEL: {{^}}callee_func_sgpr_spill_no_calls:
; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; MUBUF-NEXT:   buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 ; 4-byte Folded Spill
; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
; GCN: v_writelane_b32 [[CSR_VGPR]], s
; GCN: v_writelane_b32 [[CSR_VGPR]], s

; GCN: ;;#ASMSTART
; GCN: v_readlane_b32 s{{[0-9]+}}, [[CSR_VGPR]]
; GCN: v_readlane_b32 s{{[0-9]+}}, [[CSR_VGPR]]

; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; MUBUF-NEXT:   buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 ; 4-byte Folded Reload
; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @callee_func_sgpr_spill_no_calls(i32 %in) #0 {
  call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0
  call void asm sideeffect "", "~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15}"() #0
  call void asm sideeffect "", "~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23}"() #0
  call void asm sideeffect "", "~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() #0
  call void asm sideeffect "", "~{v32},~{v33},~{v34},~{v35},~{v36},~{v37},~{v38},~{v39}"() #0

  %wide.sgpr0 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0
  %wide.sgpr1 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0
  %wide.sgpr2 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0
  %wide.sgpr5 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0
  %wide.sgpr3 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
  %wide.sgpr4 = call <2 x i32> asm sideeffect "; def $0", "=s" () #0

  call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr0) #0
  call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr1) #0
  call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr2) #0
  call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr3) #0
  call void asm sideeffect "; use $0", "s"(<2 x i32> %wide.sgpr4) #0
  call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr5) #0
  ret void
}

; Has no spilled CSR VGPRs used for SGPR spilling, so there is no need to
; enable all lanes and restore.

; GCN-LABEL: {{^}}spill_only_csr_sgpr:
; GCN: s_waitcnt
; GCN-NEXT: s_or_saveexec_b64
; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
; FLATSCR-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec,
; GCN-NEXT: v_writelane_b32 v0, s42, 0
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ; clobber s42
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: v_readlane_b32 s42, v0, 0
; GCN-NEXT: s_or_saveexec_b64
; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
; FLATSCR-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec,
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define void @spill_only_csr_sgpr() {
  call void asm sideeffect "; clobber s42", "~{s42}"()
  ret void
}

; TODO: Can the SP inc/dec be removed?
; GCN-LABEL: {{^}}callee_with_stack_no_fp_elim_csr_vgpr:
; GCN: s_waitcnt
; GCN-NEXT: s_mov_b32 [[FP_COPY:s[0-9]+]], s33
; GCN-NEXT: s_mov_b32 s33, s32
; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
; MUBUF-DAG:   buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
; FLATSCR-DAG: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill
; MUBUF-DAG:   buffer_store_dword [[ZERO]], off, s[0:3], s33 offset:4
; FLATSCR-DAG: scratch_store_dword off, [[ZERO]], s33 offset:4

; GCN: ;;#ASMSTART
; GCN-NEXT: ; clobber v41
; GCN-NEXT: ;;#ASMEND

; MUBUF:   buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; FLATSCR: scratch_load_dword v41, off, s33 ; 4-byte Folded Reload
; MUBUF:        s_addk_i32 s32, 0x300
; MUBUF-NEXT:   s_addk_i32 s32, 0xfd00
; MUBUF-NEXT:   s_mov_b32 s33, s4
; FLATSCR:      s_add_i32 s32, s32, 12
; FLATSCR-NEXT: s_add_i32 s32, s32, -12
; FLATSCR-NEXT: s_mov_b32 s33, s0
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define void @callee_with_stack_no_fp_elim_csr_vgpr() #1 {
  %alloca = alloca i32, addrspace(5)
  store volatile i32 0, i32 addrspace(5)* %alloca
  call void asm sideeffect "; clobber v41", "~{v41}"()
  ret void
}

; Use the last lane of the VGPR used for SGPR spilling for the FP spill,
; instead of introducing a second CSR VGPR.
; GCN-LABEL: {{^}}last_lane_vgpr_for_fp_csr:
; GCN: s_waitcnt
; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; MUBUF-NEXT:   buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:8 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
; GCN-NEXT: v_writelane_b32 v0, s33, 63
; GCN-COUNT-60: v_writelane_b32 v0
; GCN: s_mov_b32 s33, s32
; GCN: v_writelane_b32 v0
; MUBUF:   buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
; FLATSCR: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill
; GCN: v_writelane_b32 v0
; MUBUF:   buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:4
; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s33 offset:4
; GCN: ;;#ASMSTART
; GCN: v_writelane_b32 v0

; MUBUF:        s_addk_i32 s32, 0x400
; MUBUF:        s_addk_i32 s32, 0xfc00
; FLATSCR:      s_add_i32 s32, s32, 16
; FLATSCR:      s_add_i32 s32, s32, -16
; GCN-NEXT: v_readlane_b32 s33, v0, 63
; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; MUBUF-NEXT:   buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 offset:8 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define void @last_lane_vgpr_for_fp_csr() #1 {
  %alloca = alloca i32, addrspace(5)
  store volatile i32 0, i32 addrspace(5)* %alloca
  call void asm sideeffect "; clobber v41", "~{v41}"()
  call void asm sideeffect "",
    "~{s40},~{s41},~{s42},~{s43},~{s44},~{s45},~{s46},~{s47},~{s48},~{s49}
    ,~{s50},~{s51},~{s52},~{s53},~{s54},~{s55},~{s56},~{s57},~{s58},~{s59}
    ,~{s60},~{s61},~{s62},~{s63},~{s64},~{s65},~{s66},~{s67},~{s68},~{s69}
    ,~{s70},~{s71},~{s72},~{s73},~{s74},~{s75},~{s76},~{s77},~{s78},~{s79}
    ,~{s80},~{s81},~{s82},~{s83},~{s84},~{s85},~{s86},~{s87},~{s88},~{s89}
    ,~{s90},~{s91},~{s92},~{s93},~{s94},~{s95},~{s96},~{s97},~{s98},~{s99}
    ,~{s100},~{s101},~{s102}"() #1

  ret void
}

; Use a copy to a free SGPR instead of introducing a second CSR VGPR.
; GCN-LABEL: {{^}}no_new_vgpr_for_fp_csr:
; GCN: s_waitcnt
; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; MUBUF-NEXT:   buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:8 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
; GCN-COUNT-61: v_writelane_b32 v0,
; FLATSCR: s_mov_b32 [[FP_COPY:s[0-9]+]], s33
; FLATSCR-NEXT: s_mov_b32 s33, s32
; MUBUF: s_mov_b32 [[FP_COPY:s[0-9]+]], s33
; MUBUF-NEXT: s_mov_b32 s33, s32
; MUBUF:   buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
; FLATSCR: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill
; GCN: v_writelane_b32 v0,
; MUBUF:   buffer_store_dword
; FLATSCR: scratch_store_dword
; GCN: ;;#ASMSTART
; GCN: v_writelane_b32 v0,
; MUBUF:   buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; FLATSCR: scratch_load_dword v41, off, s33 ; 4-byte Folded Reload
; MUBUF:        s_addk_i32 s32, 0x400
; FLATSCR:      s_add_i32 s32, s32, 16
; GCN-COUNT-64: v_readlane_b32 s{{[0-9]+}}, v0
; MUBUF-NEXT:   s_addk_i32 s32, 0xfc00
; FLATSCR-NEXT: s_add_i32 s32, s32, -16
; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]]
; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; MUBUF-NEXT:   buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 offset:8 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define void @no_new_vgpr_for_fp_csr() #1 {
  %alloca = alloca i32, addrspace(5)
  store volatile i32 0, i32 addrspace(5)* %alloca
  call void asm sideeffect "; clobber v41", "~{v41}"()
  call void asm sideeffect "",
    "~{s39},~{s40},~{s41},~{s42},~{s43},~{s44},~{s45},~{s46},~{s47},~{s48},~{s49}
    ,~{s50},~{s51},~{s52},~{s53},~{s54},~{s55},~{s56},~{s57},~{s58},~{s59}
    ,~{s60},~{s61},~{s62},~{s63},~{s64},~{s65},~{s66},~{s67},~{s68},~{s69}
    ,~{s70},~{s71},~{s72},~{s73},~{s74},~{s75},~{s76},~{s77},~{s78},~{s79}
    ,~{s80},~{s81},~{s82},~{s83},~{s84},~{s85},~{s86},~{s87},~{s88},~{s89}
    ,~{s90},~{s91},~{s92},~{s93},~{s94},~{s95},~{s96},~{s97},~{s98},~{s99}
    ,~{s100},~{s101},~{s102}"() #1

  ret void
}

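; The 8192-byte-aligned alloca below forces stack realignment. MUBUF SP/FP
; values are scaled by the 64-lane wave size, hence the 0x7ffc0/0xfff80000
; align-up constants, while flat scratch works in byte offsets
; (0x1fff/0xffffe000).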
; GCN-LABEL: {{^}}realign_stack_no_fp_elim:
; GCN: s_waitcnt
; MUBUF-NEXT:   s_mov_b32 [[FP_COPY:s4]], s33
; FLATSCR-NEXT: s_mov_b32 [[FP_COPY:s0]], s33
; MUBUF-NEXT:   s_add_i32 s33, s32, 0x7ffc0
; FLATSCR-NEXT: s_add_i32 s33, s32, 0x1fff
; MUBUF-NEXT:   s_and_b32 s33, s33, 0xfff80000
; FLATSCR-NEXT: s_and_b32 s33, s33, 0xffffe000
; MUBUF-NEXT:   s_add_i32 s32, s32, 0x180000
; FLATSCR-NEXT: s_addk_i32 s32, 0x6000
; GCN-NEXT:     v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
; MUBUF-NEXT:   v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x2000{{$}}
; MUBUF-NEXT:   buffer_store_dword [[ZERO]], [[OFFSET]], s[0:3], s33 offen{{$}}
; FLATSCR-NEXT: s_add_i32 vcc_hi, s33, 0x2000
; FLATSCR-NEXT: scratch_store_dword off, [[ZERO]], vcc_hi
; GCN-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT:   s_add_i32 s32, s32, 0xffe80000
; FLATSCR-NEXT: s_addk_i32 s32, 0xa000
; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]]
; GCN-NEXT: s_setpc_b64
define void @realign_stack_no_fp_elim() #1 {
  %alloca = alloca i32, align 8192, addrspace(5)
  store volatile i32 0, i32 addrspace(5)* %alloca
  ret void
}

; GCN-LABEL: {{^}}no_unused_non_csr_sgpr_for_fp:
; GCN: s_waitcnt
; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; MUBUF-NEXT:   buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
; GCN-NEXT: v_writelane_b32 v0, s33, 2
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: v_writelane_b32 v0, s30, 0
; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
; GCN: v_writelane_b32 v0, s31, 1
; MUBUF:   buffer_store_dword [[ZERO]], off, s[0:3], s33{{$}}
; FLATSCR: scratch_store_dword off, [[ZERO]], s33{{$}}
; GCN-NEXT:     s_waitcnt vmcnt(0)
; GCN: ;;#ASMSTART
; MUBUF:        s_addk_i32 s32, 0x300
; MUBUF-NEXT:   v_readlane_b32 s4, v0, 0
; MUBUF-NEXT:   v_readlane_b32 s5, v0, 1
; FLATSCR:      s_add_i32 s32, s32, 12
; FLATSCR-NEXT: v_readlane_b32 s0, v0, 0
; FLATSCR-NEXT: v_readlane_b32 s1, v0, 1
; MUBUF-NEXT:   s_addk_i32 s32, 0xfd00
; FLATSCR-NEXT: s_add_i32 s32, s32, -12
; GCN-NEXT:     v_readlane_b32 s33, v0, 2
; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; MUBUF-NEXT:   buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; MUBUF-NEXT:   s_setpc_b64 s[4:5]
; FLATSCR-NEXT: s_setpc_b64 s[0:1]
define void @no_unused_non_csr_sgpr_for_fp() #1 {
  %alloca = alloca i32, addrspace(5)
  store volatile i32 0, i32 addrspace(5)* %alloca

  ; Use all clobberable registers, so FP has to spill to a VGPR.
  call void asm sideeffect "",
    "~{s0},~{s1},~{s2},~{s3},~{s4},~{s5},~{s6},~{s7},~{s8},~{s9}
    ,~{s10},~{s11},~{s12},~{s13},~{s14},~{s15},~{s16},~{s17},~{s18},~{s19}
    ,~{s20},~{s21},~{s22},~{s23},~{s24},~{s25},~{s26},~{s27},~{s28},~{s29}
    ,~{s30},~{s31}"() #0

  ret void
}

; Need a new CSR VGPR to satisfy the FP spill.
; GCN-LABEL: {{^}}no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr:
; GCN: s_waitcnt
; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; MUBUF-NEXT:   buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s33, 2
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s30, 0

; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31, 1
; MUBUF-DAG:   buffer_store_dword
; FLATSCR-DAG: scratch_store_dword
; MUBUF:       s_addk_i32 s32, 0x300{{$}}
; FLATSCR:     s_add_i32 s32, s32, 12{{$}}

; MUBUF:        v_readlane_b32 s4, [[CSR_VGPR]], 0
; FLATSCR:      v_readlane_b32 s0, [[CSR_VGPR]], 0
; GCN: ;;#ASMSTART
; MUBUF:        v_readlane_b32 s5, [[CSR_VGPR]], 1
; FLATSCR:      v_readlane_b32 s1, [[CSR_VGPR]], 1
; MUBUF-NEXT:   s_addk_i32 s32, 0xfd00{{$}}
; FLATSCR-NEXT: s_add_i32 s32, s32, -12{{$}}
; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR]], 2
; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; MUBUF-NEXT:   buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 {
  %alloca = alloca i32, addrspace(5)
  store volatile i32 0, i32 addrspace(5)* %alloca

  ; Use all clobberable registers, so FP has to spill to a VGPR.
  call void asm sideeffect "",
    "~{s0},~{s1},~{s2},~{s3},~{s4},~{s5},~{s6},~{s7},~{s8},~{s9}
    ,~{s10},~{s11},~{s12},~{s13},~{s14},~{s15},~{s16},~{s17},~{s18},~{s19}
    ,~{s20},~{s21},~{s22},~{s23},~{s24},~{s25},~{s26},~{s27},~{s28},~{s29}
    ,~{s30},~{s31}"() #0

  call void asm sideeffect "; clobber nonpreserved initial VGPRs",
    "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9}
    ,~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19}
    ,~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29}
    ,~{v30},~{v31},~{v32},~{v33},~{v34},~{v35},~{v36},~{v37},~{v38},~{v39}"() #1

  ret void
}

; The byval argument exceeds the MUBUF constant offset, so a scratch
; register is needed to access the CSR VGPR slot.
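; (The slot sits at byte offset 4100: flat scratch materializes 0x1004 into
; an SGPR, while MUBUF's wave-scaled stack pointer turns it into
; 4100*64 = 0x40100, likewise out of immediate-offset range.)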
; GCN-LABEL: {{^}}scratch_reg_needed_mubuf_offset:
; GCN: s_waitcnt
; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; MUBUF-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x40100
; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Spill
; FLATSCR-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x1004
; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], [[SCRATCH_SGPR]] ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s33, 2
; GCN-DAG:  v_writelane_b32 [[CSR_VGPR]], s30, 0
; GCN-DAG:  s_mov_b32 s33, s32
; GCN-DAG:  v_writelane_b32 [[CSR_VGPR]], s31, 1
; MUBUF-DAG:   s_add_i32 s32, s32, 0x40300{{$}}
; FLATSCR-DAG: s_addk_i32 s32, 0x100c{{$}}
; MUBUF-DAG:   buffer_store_dword
; FLATSCR-DAG: scratch_store_dword

; MUBUF:   v_readlane_b32 s4, [[CSR_VGPR]], 0
; FLATSCR: v_readlane_b32 s0, [[CSR_VGPR]], 0
; GCN: ;;#ASMSTART
; MUBUF:   v_readlane_b32 s5, [[CSR_VGPR]], 1
; FLATSCR: v_readlane_b32 s1, [[CSR_VGPR]], 1
; MUBUF-NEXT:   s_add_i32 s32, s32, 0xfffbfd00{{$}}
; FLATSCR-NEXT: s_addk_i32 s32, 0xeff4{{$}}
; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR]], 2
; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; MUBUF-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x40100
; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Reload
; FLATSCR-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x1004
; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, [[SCRATCH_SGPR]] ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define void @scratch_reg_needed_mubuf_offset([4096 x i8] addrspace(5)* byval([4096 x i8]) align 4 %arg) #1 {
  %alloca = alloca i32, addrspace(5)
  store volatile i32 0, i32 addrspace(5)* %alloca

  ; Use all clobberable registers, so FP has to spill to a VGPR.
  call void asm sideeffect "; clobber nonpreserved SGPRs",
    "~{s0},~{s1},~{s2},~{s3},~{s4},~{s5},~{s6},~{s7},~{s8},~{s9}
    ,~{s10},~{s11},~{s12},~{s13},~{s14},~{s15},~{s16},~{s17},~{s18},~{s19}
    ,~{s20},~{s21},~{s22},~{s23},~{s24},~{s25},~{s26},~{s27},~{s28},~{s29}
    ,~{s30},~{s31}"() #0

  ; Use all clobberable VGPRs, so the SGPR spills need a CSR VGPR, which
  ; must itself be spilled.
  call void asm sideeffect "; clobber nonpreserved VGPRs",
    "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9}
    ,~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19}
    ,~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29}
    ,~{v30},~{v31},~{v32},~{v33},~{v34},~{v35},~{v36},~{v37},~{v38},~{v39}"() #1

  ret void
}

; GCN-LABEL: {{^}}local_empty_func:
; GCN: s_waitcnt
; GCN-NEXT: s_setpc_b64
define internal void @local_empty_func() #0 {
  ret void
}

; An FP is needed, despite not needing any spills.
; TODO: Could detect that the callee does not use the stack and omit the FP.
; GCN-LABEL: {{^}}ipra_call_with_stack:
; GCN: s_mov_b32 [[FP_COPY:s[0-9]+]], s33
; GCN: s_mov_b32 s33, s32
; MUBUF:   s_addk_i32 s32, 0x400
; FLATSCR: s_add_i32 s32, s32, 16
; MUBUF:   buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33{{$}}
; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s33{{$}}
; GCN:     s_swappc_b64
; MUBUF:   s_addk_i32 s32, 0xfc00
; FLATSCR: s_add_i32 s32, s32, -16
; GCN: s_mov_b32 s33, [[FP_COPY]]
define void @ipra_call_with_stack() #0 {
  %alloca = alloca i32, addrspace(5)
  store volatile i32 0, i32 addrspace(5)* %alloca
  call void @local_empty_func()
  ret void
}

; With no free registers, we must spill the FP to memory.
; GCN-LABEL: {{^}}callee_need_to_spill_fp_to_memory:
; MUBUF:   v_mov_b32_e32 [[TMP_VGPR1:v[0-9]+]], s33
; MUBUF:   buffer_store_dword [[TMP_VGPR1]], off, s[0:3], s32 ; 4-byte Folded Spill
; FLATSCR: s_mov_b32 s0, s33
; GCN:     s_mov_b32 s33, s32
; MUBUF:   buffer_load_dword [[TMP_VGPR2:v[0-9]+]], off, s[0:3], s32 ; 4-byte Folded Reload
; FLATSCR: s_mov_b32 s33, s0
; MUBUF:   s_waitcnt vmcnt(0)
; MUBUF:   v_readfirstlane_b32 s33, [[TMP_VGPR2]]
; GCN:     s_setpc_b64
; MUBUF:   ScratchSize: 8
; FLATSCR: ScratchSize: 0
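; MUBUF needs scratch for the FP spill (ScratchSize of 8); flat scratch
; keeps the FP in s0 and uses no stack at all (ScratchSize of 0).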
define void @callee_need_to_spill_fp_to_memory() #3 {
  call void asm sideeffect "; clobber nonpreserved SGPRs",
    "~{s4},~{s5},~{s6},~{s7},~{s8},~{s9}
    ,~{s10},~{s11},~{s12},~{s13},~{s14},~{s15},~{s16},~{s17},~{s18},~{s19}
    ,~{s20},~{s21},~{s22},~{s23},~{s24},~{s25},~{s26},~{s27},~{s28},~{s29}
    ,~{vcc}"()

  call void asm sideeffect "; clobber all VGPRs",
    "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9}
    ,~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19}
    ,~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29}
    ,~{v30},~{v31},~{v32},~{v33},~{v34},~{v35},~{v36},~{v37},~{v38},~{v39}"()
  ret void
}

; If we have a reserved VGPR that can be used for SGPR spills, we may still
; need to spill the FP to memory if there are no free lanes in the reserved
; VGPR.
; GCN-LABEL: {{^}}callee_need_to_spill_fp_to_memory_full_reserved_vgpr:
; MUBUF:   s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; MUBUF:   s_mov_b64 exec, [[COPY_EXEC1]]
; MUBUF:   v_mov_b32_e32 [[TMP_VGPR1:v[0-9]+]], s33
; MUBUF:   buffer_store_dword [[TMP_VGPR1]], off, s[0:3], s32 offset:[[OFF:[0-9]+]]
; GCN-NOT: v_writelane_b32 v40, s33
; MUBUF:   s_mov_b32 s33, s32
; FLATSCR: s_mov_b32 s33, s0
; GCN-NOT: v_readlane_b32 s33, v40
; MUBUF:   buffer_load_dword [[TMP_VGPR2:v[0-9]+]], off, s[0:3], s32 offset:[[OFF]]
; MUBUF:   v_readfirstlane_b32 s33, [[TMP_VGPR2]]
; MUBUF:   s_or_saveexec_b64 [[COPY_EXEC2:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; MUBUF:   s_mov_b64 exec, [[COPY_EXEC2]]
; GCN:     s_setpc_b64
define void @callee_need_to_spill_fp_to_memory_full_reserved_vgpr() #3 {
  call void asm sideeffect "; clobber nonpreserved SGPRs and 64 CSRs",
    "~{s4},~{s5},~{s6},~{s7},~{s8},~{s9}
    ,~{s10},~{s11},~{s12},~{s13},~{s14},~{s15},~{s16},~{s17},~{s18},~{s19}
    ,~{s20},~{s21},~{s22},~{s23},~{s24},~{s25},~{s26},~{s27},~{s28},~{s29}
    ,~{s40},~{s41},~{s42},~{s43},~{s44},~{s45},~{s46},~{s47},~{s48},~{s49}
    ,~{s50},~{s51},~{s52},~{s53},~{s54},~{s55},~{s56},~{s57},~{s58},~{s59}
    ,~{s60},~{s61},~{s62},~{s63},~{s64},~{s65},~{s66},~{s67},~{s68},~{s69}
    ,~{s70},~{s71},~{s72},~{s73},~{s74},~{s75},~{s76},~{s77},~{s78},~{s79}
    ,~{s80},~{s81},~{s82},~{s83},~{s84},~{s85},~{s86},~{s87},~{s88},~{s89}
    ,~{s90},~{s91},~{s92},~{s93},~{s94},~{s95},~{s96},~{s97},~{s98},~{s99}
    ,~{s100},~{s101},~{s102},~{s39},~{vcc}"()

  call void asm sideeffect "; clobber all VGPRs except CSR v40",
    "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9}
    ,~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19}
    ,~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29}
    ,~{v30},~{v31},~{v32},~{v33},~{v34},~{v35},~{v36},~{v37},~{v38}"()
  ret void
}

; When flat scratch is enabled, we save the FP to s0. At the same time, the
; exec mask is saved to s[0:1] when saving CSRs in the function prologue.
; Make sure that the FP save happens after exec has been restored from those
; same registers.
; GCN-LABEL: {{^}}callee_need_to_spill_fp_to_reg:
; GCN-NOT: v_writelane_b32 v40, s33
; FLATSCR: s_or_saveexec_b64 s[0:1], -1
; FLATSCR: s_mov_b64 exec, s[0:1]
; FLATSCR: s_mov_b32 s0, s33
; FLATSCR: s_mov_b32 s33, s32
; FLATSCR: s_mov_b32 s33, s0
; FLATSCR: s_or_saveexec_b64 s[0:1], -1
; GCN-NOT: v_readlane_b32 s33, v40
; GCN:     s_setpc_b64
define void @callee_need_to_spill_fp_to_reg() #1 {
  call void asm sideeffect "; clobber nonpreserved SGPRs and 64 CSRs",
    "~{s4},~{s5},~{s6},~{s7},~{s8},~{s9}
    ,~{s10},~{s11},~{s12},~{s13},~{s14},~{s15},~{s16},~{s17},~{s18},~{s19}
    ,~{s20},~{s21},~{s22},~{s23},~{s24},~{s25},~{s26},~{s27},~{s28},~{s29}
    ,~{s40},~{s41},~{s42},~{s43},~{s44},~{s45},~{s46},~{s47},~{s48},~{s49}
    ,~{s50},~{s51},~{s52},~{s53},~{s54},~{s55},~{s56},~{s57},~{s58},~{s59}
    ,~{s60},~{s61},~{s62},~{s63},~{s64},~{s65},~{s66},~{s67},~{s68},~{s69}
    ,~{s70},~{s71},~{s72},~{s73},~{s74},~{s75},~{s76},~{s77},~{s78},~{s79}
    ,~{s80},~{s81},~{s82},~{s83},~{s84},~{s85},~{s86},~{s87},~{s88},~{s89}
    ,~{s90},~{s91},~{s92},~{s93},~{s94},~{s95},~{s96},~{s97},~{s98},~{s99}
    ,~{s100},~{s101},~{s102},~{s39},~{vcc}"()

  call void asm sideeffect "; clobber all VGPRs except CSR v40",
    "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9}
    ,~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19}
    ,~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29}
    ,~{v30},~{v31},~{v32},~{v33},~{v34},~{v35},~{v36},~{v37},~{v38},~{v39}"()
  ret void
}

; If the size of the offset exceeds the MUBUF offset field, we need another
; scratch register to hold the offset.
; GCN-LABEL: {{^}}spill_fp_to_memory_scratch_reg_needed_mubuf_offset:
; MUBUF: s_or_saveexec_b64 s[4:5], -1
; MUBUF-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x40100
; MUBUF-NEXT: buffer_store_dword v39, off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Spill
; MUBUF: v_mov_b32_e32 v0, s33
; GCN-NOT: v_mov_b32_e32 v0, 0x100c
; MUBUF-NEXT: s_add_i32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x40200
; MUBUF: buffer_store_dword v0, off, s[0:3], [[SCRATCH_SGPR]] ; 4-byte Folded Spill
; FLATSCR: v_mov_b32_e32 v0, 0
; FLATSCR: s_add_i32 [[SOFF:s[0-9]+]], s33, 0x1000
; FLATSCR: scratch_store_dword off, v0, [[SOFF]]
define void @spill_fp_to_memory_scratch_reg_needed_mubuf_offset([4096 x i8] addrspace(5)* byval([4096 x i8]) align 4 %arg) #3 {
  %alloca = alloca i32, addrspace(5)
  store volatile i32 0, i32 addrspace(5)* %alloca

  call void asm sideeffect "; clobber nonpreserved SGPRs and 64 CSRs",
    "~{s4},~{s5},~{s6},~{s7},~{s8},~{s9}
    ,~{s10},~{s11},~{s12},~{s13},~{s14},~{s15},~{s16},~{s17},~{s18},~{s19}
    ,~{s20},~{s21},~{s22},~{s23},~{s24},~{s25},~{s26},~{s27},~{s28},~{s29}
    ,~{s40},~{s41},~{s42},~{s43},~{s44},~{s45},~{s46},~{s47},~{s48},~{s49}
    ,~{s50},~{s51},~{s52},~{s53},~{s54},~{s55},~{s56},~{s57},~{s58},~{s59}
    ,~{s60},~{s61},~{s62},~{s63},~{s64},~{s65},~{s66},~{s67},~{s68},~{s69}
    ,~{s70},~{s71},~{s72},~{s73},~{s74},~{s75},~{s76},~{s77},~{s78},~{s79}
    ,~{s80},~{s81},~{s82},~{s83},~{s84},~{s85},~{s86},~{s87},~{s88},~{s89}
    ,~{s90},~{s91},~{s92},~{s93},~{s94},~{s95},~{s96},~{s97},~{s98},~{s99}
    ,~{s100},~{s101},~{s102},~{s39},~{vcc}"()

  call void asm sideeffect "; clobber all VGPRs except CSR v40",
    "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9}
    ,~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19}
    ,~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29}
    ,~{v30},~{v31},~{v32},~{v33},~{v34},~{v35},~{v36},~{v37},~{v38}"()
  ret void
}

attributes #0 = { nounwind }
attributes #1 = { nounwind "frame-pointer"="all" }
attributes #2 = { nounwind "frame-pointer"="non-leaf" }
attributes #3 = { nounwind "frame-pointer"="all" "amdgpu-waves-per-eu"="6,6" }
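
; Note on attributes: #0 is the default (FP eliminated where possible); #1
; ("frame-pointer"="all") forces an FP everywhere; #2 ("non-leaf") forces an
; FP only in non-leaf functions; #3 additionally restricts the register
; budget via "amdgpu-waves-per-eu"="6,6".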