1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=DEFAULTSIZE,MUBUF %s
3; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-assume-dynamic-stack-object-size=1024 < %s | FileCheck -check-prefixes=ASSUME1024,MUBUF %s
4; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=DEFAULTSIZE,FLATSCR %s
5; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -mattr=+enable-flat-scratch -amdgpu-assume-dynamic-stack-object-size=1024 < %s | FileCheck -check-prefixes=ASSUME1024,FLATSCR %s
6
7; FIXME: Generated test checks do not check metadata at the end of the
8; function, so this also includes manually added checks.
9
10; Test that we can select a statically sized alloca outside of the
11; entry block.
12
13; FIXME: FunctionLoweringInfo unhelpfully doesn't preserve an
14; alignment less than the stack alignment.
define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align4(i32 addrspace(1)* %out, i32 %arg.cond0, i32 %arg.cond1, i32 %in) #1 {
; MUBUF-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align4:
; MUBUF:       ; %bb.0: ; %entry
; MUBUF-NEXT:    s_add_u32 s0, s0, s9
; MUBUF-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x8
; MUBUF-NEXT:    s_addc_u32 s1, s1, 0
; MUBUF-NEXT:    s_movk_i32 s32, 0x400
; MUBUF-NEXT:    s_mov_b32 s33, 0
; MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
; MUBUF-NEXT:    s_cmp_lg_u32 s8, 0
; MUBUF-NEXT:    s_cbranch_scc1 .LBB0_3
; MUBUF-NEXT:  ; %bb.1: ; %bb.0
; MUBUF-NEXT:    s_cmp_lg_u32 s9, 0
; MUBUF-NEXT:    s_cbranch_scc1 .LBB0_3
; MUBUF-NEXT:  ; %bb.2: ; %bb.1
; MUBUF-NEXT:    s_add_i32 s6, s32, 0x1000
; MUBUF-NEXT:    s_lshl_b32 s7, s10, 2
; MUBUF-NEXT:    s_mov_b32 s32, s6
; MUBUF-NEXT:    v_mov_b32_e32 v1, 0
; MUBUF-NEXT:    v_mov_b32_e32 v2, s6
; MUBUF-NEXT:    v_mov_b32_e32 v3, 1
; MUBUF-NEXT:    s_add_i32 s6, s6, s7
; MUBUF-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
; MUBUF-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
; MUBUF-NEXT:    v_mov_b32_e32 v2, s6
; MUBUF-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
; MUBUF-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    v_add_u32_e32 v0, v2, v0
; MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
; MUBUF-NEXT:    global_store_dword v1, v0, s[4:5]
; MUBUF-NEXT:  .LBB0_3: ; %bb.2
; MUBUF-NEXT:    v_mov_b32_e32 v0, 0
; MUBUF-NEXT:    global_store_dword v[0:1], v0, off
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    s_endpgm
;
; FLATSCR-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align4:
; FLATSCR:       ; %bb.0: ; %entry
; FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
; FLATSCR-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x8
; FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; FLATSCR-NEXT:    s_mov_b32 s32, 16
; FLATSCR-NEXT:    s_mov_b32 s33, 0
; FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
; FLATSCR-NEXT:    s_cmp_lg_u32 s4, 0
; FLATSCR-NEXT:    s_cbranch_scc1 .LBB0_3
; FLATSCR-NEXT:  ; %bb.1: ; %bb.0
; FLATSCR-NEXT:    s_cmp_lg_u32 s5, 0
; FLATSCR-NEXT:    s_cbranch_scc1 .LBB0_3
; FLATSCR-NEXT:  ; %bb.2: ; %bb.1
; FLATSCR-NEXT:    v_mov_b32_e32 v1, 0
; FLATSCR-NEXT:    s_add_i32 s2, s32, 0x1000
; FLATSCR-NEXT:    v_mov_b32_e32 v2, 1
; FLATSCR-NEXT:    s_lshl_b32 s3, s6, 2
; FLATSCR-NEXT:    s_mov_b32 s32, s2
; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[1:2], s2
; FLATSCR-NEXT:    s_add_i32 s2, s2, s3
; FLATSCR-NEXT:    scratch_load_dword v2, off, s2
; FLATSCR-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    v_add_u32_e32 v0, v2, v0
; FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
; FLATSCR-NEXT:    global_store_dword v1, v0, s[0:1]
; FLATSCR-NEXT:  .LBB0_3: ; %bb.2
; FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
; FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    s_endpgm

entry:
  ; Uniform (scalar kernel-argument) condition: when %arg.cond0 != 0 the
  ; alloca block is skipped entirely, so the stack object is only
  ; conditionally reached.
  %cond0 = icmp eq i32 %arg.cond0, 0
  br i1 %cond0, label %bb.0, label %bb.2

bb.0:
  ; Statically sized alloca in a non-entry block, private address space (5).
  ; Requested align 4; per the FIXME above, FunctionLoweringInfo does not
  ; preserve an alignment below the stack alignment.
  %alloca = alloca [16 x i32], align 4, addrspace(5)
  %gep0 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 0
  %gep1 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 1
  %cond1 = icmp eq i32 %arg.cond1, 0
  br i1 %cond1, label %bb.1, label %bb.2

bb.1:
  ; Use the alloca outside of the defining block.
  store i32 0, i32 addrspace(5)* %gep0
  store i32 1, i32 addrspace(5)* %gep1
  ; Index the alloca with the runtime kernel argument %in.
  %gep2 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %in
  %load = load i32, i32 addrspace(5)* %gep2
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %add = add i32 %load, %tid
  store i32 %add, i32 addrspace(1)* %out
  br label %bb.2

bb.2:
  ; Join block reached from entry, bb.0 and bb.1; the volatile store gives
  ; it an observable side effect.
  store volatile i32 0, i32 addrspace(1)* undef
  ret void
}
111; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 4112
112; DEFAULTSIZE: ; ScratchSize: 4112
113
114; ASSUME1024: .amdhsa_private_segment_fixed_size 1040
115; ASSUME1024: ; ScratchSize: 1040
116
define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align64(i32 addrspace(1)* %out, i32 %arg.cond, i32 %in) {
; MUBUF-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64:
; MUBUF:       ; %bb.0: ; %entry
; MUBUF-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x8
; MUBUF-NEXT:    s_add_u32 s0, s0, s9
; MUBUF-NEXT:    s_addc_u32 s1, s1, 0
; MUBUF-NEXT:    s_movk_i32 s32, 0x1000
; MUBUF-NEXT:    s_mov_b32 s33, 0
; MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
; MUBUF-NEXT:    s_cmp_lg_u32 s6, 0
; MUBUF-NEXT:    s_cbranch_scc1 .LBB1_2
; MUBUF-NEXT:  ; %bb.1: ; %bb.0
; MUBUF-NEXT:    s_add_i32 s6, s32, 0x1000
; MUBUF-NEXT:    s_and_b32 s6, s6, 0xfffff000
; MUBUF-NEXT:    s_lshl_b32 s7, s7, 2
; MUBUF-NEXT:    s_mov_b32 s32, s6
; MUBUF-NEXT:    v_mov_b32_e32 v1, 0
; MUBUF-NEXT:    v_mov_b32_e32 v2, s6
; MUBUF-NEXT:    v_mov_b32_e32 v3, 1
; MUBUF-NEXT:    s_add_i32 s6, s6, s7
; MUBUF-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
; MUBUF-NEXT:    buffer_store_dword v3, v2, s[0:3], 0 offen offset:4
; MUBUF-NEXT:    v_mov_b32_e32 v2, s6
; MUBUF-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
; MUBUF-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    v_add_u32_e32 v0, v2, v0
; MUBUF-NEXT:    s_waitcnt lgkmcnt(0)
; MUBUF-NEXT:    global_store_dword v1, v0, s[4:5]
; MUBUF-NEXT:  .LBB1_2: ; %bb.1
; MUBUF-NEXT:    v_mov_b32_e32 v0, 0
; MUBUF-NEXT:    global_store_dword v[0:1], v0, off
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    s_endpgm
;
; FLATSCR-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64:
; FLATSCR:       ; %bb.0: ; %entry
; FLATSCR-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
; FLATSCR-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
; FLATSCR-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x8
; FLATSCR-NEXT:    s_mov_b32 s32, 64
; FLATSCR-NEXT:    s_mov_b32 s33, 0
; FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
; FLATSCR-NEXT:    s_cmp_lg_u32 s2, 0
; FLATSCR-NEXT:    s_cbranch_scc1 .LBB1_2
; FLATSCR-NEXT:  ; %bb.1: ; %bb.0
; FLATSCR-NEXT:    s_add_i32 s2, s32, 0x1000
; FLATSCR-NEXT:    v_mov_b32_e32 v1, 0
; FLATSCR-NEXT:    s_and_b32 s2, s2, 0xfffff000
; FLATSCR-NEXT:    v_mov_b32_e32 v2, 1
; FLATSCR-NEXT:    s_lshl_b32 s3, s3, 2
; FLATSCR-NEXT:    s_mov_b32 s32, s2
; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[1:2], s2
; FLATSCR-NEXT:    s_add_i32 s2, s2, s3
; FLATSCR-NEXT:    scratch_load_dword v2, off, s2
; FLATSCR-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    v_add_u32_e32 v0, v2, v0
; FLATSCR-NEXT:    s_waitcnt lgkmcnt(0)
; FLATSCR-NEXT:    global_store_dword v1, v0, s[0:1]
; FLATSCR-NEXT:  .LBB1_2: ; %bb.1
; FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
; FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    s_endpgm
entry:
  ; Single uniform condition guarding the alloca block.
  %cond = icmp eq i32 %arg.cond, 0
  br i1 %cond, label %bb.0, label %bb.1

bb.0:
  ; Over-aligned (align 64) static alloca in a non-entry block; the CHECK
  ; lines above show the stack pointer being rounded up and masked to
  ; satisfy the requested alignment.
  %alloca = alloca [16 x i32], align 64, addrspace(5)
  %gep0 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 0
  %gep1 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 1
  store i32 0, i32 addrspace(5)* %gep0
  store i32 1, i32 addrspace(5)* %gep1
  ; Index the alloca with the runtime kernel argument %in.
  %gep2 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %in
  %load = load i32, i32 addrspace(5)* %gep2
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %add = add i32 %load, %tid
  store i32 %add, i32 addrspace(1)* %out
  br label %bb.1

bb.1:
  ; Join block; the volatile store gives it an observable side effect.
  store volatile i32 0, i32 addrspace(1)* undef
  ret void
}
203
204; DEFAULTSIZE: .amdhsa_private_segment_fixed_size 4160
205; DEFAULTSIZE: ; ScratchSize: 4160
206
207; ASSUME1024: .amdhsa_private_segment_fixed_size 1088
208; ASSUME1024: ; ScratchSize: 1088
209
210
define void @func_non_entry_block_static_alloca_align4(i32 addrspace(1)* %out, i32 %arg.cond0, i32 %arg.cond1, i32 %in) {
; MUBUF-LABEL: func_non_entry_block_static_alloca_align4:
; MUBUF:       ; %bb.0: ; %entry
; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; MUBUF-NEXT:    s_mov_b32 s7, s33
; MUBUF-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
; MUBUF-NEXT:    s_mov_b32 s33, s32
; MUBUF-NEXT:    s_addk_i32 s32, 0x400
; MUBUF-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; MUBUF-NEXT:    s_cbranch_execz .LBB2_3
; MUBUF-NEXT:  ; %bb.1: ; %bb.0
; MUBUF-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
; MUBUF-NEXT:    s_and_b64 exec, exec, vcc
; MUBUF-NEXT:    s_cbranch_execz .LBB2_3
; MUBUF-NEXT:  ; %bb.2: ; %bb.1
; MUBUF-NEXT:    s_add_i32 s6, s32, 0x1000
; MUBUF-NEXT:    v_mov_b32_e32 v2, 0
; MUBUF-NEXT:    v_mov_b32_e32 v3, s6
; MUBUF-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
; MUBUF-NEXT:    v_mov_b32_e32 v2, 1
; MUBUF-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen offset:4
; MUBUF-NEXT:    v_lshl_add_u32 v2, v4, 2, s6
; MUBUF-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
; MUBUF-NEXT:    v_and_b32_e32 v3, 0x3ff, v31
; MUBUF-NEXT:    s_mov_b32 s32, s6
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    v_add_u32_e32 v2, v2, v3
; MUBUF-NEXT:    global_store_dword v[0:1], v2, off
; MUBUF-NEXT:  .LBB2_3: ; %bb.2
; MUBUF-NEXT:    s_or_b64 exec, exec, s[4:5]
; MUBUF-NEXT:    v_mov_b32_e32 v0, 0
; MUBUF-NEXT:    global_store_dword v[0:1], v0, off
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    s_addk_i32 s32, 0xfc00
; MUBUF-NEXT:    s_mov_b32 s33, s7
; MUBUF-NEXT:    s_setpc_b64 s[30:31]
;
; FLATSCR-LABEL: func_non_entry_block_static_alloca_align4:
; FLATSCR:       ; %bb.0: ; %entry
; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLATSCR-NEXT:    s_mov_b32 s3, s33
; FLATSCR-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
; FLATSCR-NEXT:    s_mov_b32 s33, s32
; FLATSCR-NEXT:    s_add_i32 s32, s32, 16
; FLATSCR-NEXT:    s_and_saveexec_b64 s[0:1], vcc
; FLATSCR-NEXT:    s_cbranch_execz .LBB2_3
; FLATSCR-NEXT:  ; %bb.1: ; %bb.0
; FLATSCR-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
; FLATSCR-NEXT:    s_and_b64 exec, exec, vcc
; FLATSCR-NEXT:    s_cbranch_execz .LBB2_3
; FLATSCR-NEXT:  ; %bb.2: ; %bb.1
; FLATSCR-NEXT:    s_add_i32 s2, s32, 0x1000
; FLATSCR-NEXT:    v_mov_b32_e32 v2, 0
; FLATSCR-NEXT:    v_mov_b32_e32 v3, 1
; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[2:3], s2
; FLATSCR-NEXT:    v_lshl_add_u32 v2, v4, 2, s2
; FLATSCR-NEXT:    scratch_load_dword v2, v2, off
; FLATSCR-NEXT:    v_and_b32_e32 v3, 0x3ff, v31
; FLATSCR-NEXT:    s_mov_b32 s32, s2
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    v_add_u32_e32 v2, v2, v3
; FLATSCR-NEXT:    global_store_dword v[0:1], v2, off
; FLATSCR-NEXT:  .LBB2_3: ; %bb.2
; FLATSCR-NEXT:    s_or_b64 exec, exec, s[0:1]
; FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
; FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    s_add_i32 s32, s32, -16
; FLATSCR-NEXT:    s_mov_b32 s33, s3
; FLATSCR-NEXT:    s_setpc_b64 s[30:31]

entry:
  ; Non-kernel function: the conditions are on regular (VGPR) arguments, so
  ; the CHECK lines above show exec-mask based (divergent) control flow
  ; rather than scalar branches.
  %cond0 = icmp eq i32 %arg.cond0, 0
  br i1 %cond0, label %bb.0, label %bb.2

bb.0:
  ; Statically sized alloca in a non-entry block, private address space (5).
  %alloca = alloca [16 x i32], align 4, addrspace(5)
  %gep0 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 0
  %gep1 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 1
  %cond1 = icmp eq i32 %arg.cond1, 0
  br i1 %cond1, label %bb.1, label %bb.2

bb.1:
  ; Use the alloca outside of the defining block.
  store i32 0, i32 addrspace(5)* %gep0
  store i32 1, i32 addrspace(5)* %gep1
  ; Index the alloca with the runtime argument %in.
  %gep2 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %in
  %load = load i32, i32 addrspace(5)* %gep2
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %add = add i32 %load, %tid
  store i32 %add, i32 addrspace(1)* %out
  br label %bb.2

bb.2:
  ; Join block; the volatile store gives it an observable side effect.
  store volatile i32 0, i32 addrspace(1)* undef
  ret void
}
308
define void @func_non_entry_block_static_alloca_align64(i32 addrspace(1)* %out, i32 %arg.cond, i32 %in) {
; MUBUF-LABEL: func_non_entry_block_static_alloca_align64:
; MUBUF:       ; %bb.0: ; %entry
; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; MUBUF-NEXT:    s_mov_b32 s7, s33
; MUBUF-NEXT:    s_add_i32 s33, s32, 0xfc0
; MUBUF-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
; MUBUF-NEXT:    s_and_b32 s33, s33, 0xfffff000
; MUBUF-NEXT:    s_addk_i32 s32, 0x2000
; MUBUF-NEXT:    s_and_saveexec_b64 s[4:5], vcc
; MUBUF-NEXT:    s_cbranch_execz .LBB3_2
; MUBUF-NEXT:  ; %bb.1: ; %bb.0
; MUBUF-NEXT:    s_add_i32 s6, s32, 0x1000
; MUBUF-NEXT:    s_and_b32 s6, s6, 0xfffff000
; MUBUF-NEXT:    v_mov_b32_e32 v2, 0
; MUBUF-NEXT:    v_mov_b32_e32 v4, s6
; MUBUF-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen
; MUBUF-NEXT:    v_mov_b32_e32 v2, 1
; MUBUF-NEXT:    buffer_store_dword v2, v4, s[0:3], 0 offen offset:4
; MUBUF-NEXT:    v_lshl_add_u32 v2, v3, 2, s6
; MUBUF-NEXT:    buffer_load_dword v2, v2, s[0:3], 0 offen
; MUBUF-NEXT:    v_and_b32_e32 v3, 0x3ff, v31
; MUBUF-NEXT:    s_mov_b32 s32, s6
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    v_add_u32_e32 v2, v2, v3
; MUBUF-NEXT:    global_store_dword v[0:1], v2, off
; MUBUF-NEXT:  .LBB3_2: ; %bb.1
; MUBUF-NEXT:    s_or_b64 exec, exec, s[4:5]
; MUBUF-NEXT:    v_mov_b32_e32 v0, 0
; MUBUF-NEXT:    global_store_dword v[0:1], v0, off
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    s_addk_i32 s32, 0xe000
; MUBUF-NEXT:    s_mov_b32 s33, s7
; MUBUF-NEXT:    s_setpc_b64 s[30:31]
;
; FLATSCR-LABEL: func_non_entry_block_static_alloca_align64:
; FLATSCR:       ; %bb.0: ; %entry
; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLATSCR-NEXT:    s_mov_b32 s3, s33
; FLATSCR-NEXT:    s_add_i32 s33, s32, 63
; FLATSCR-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
; FLATSCR-NEXT:    s_andn2_b32 s33, s33, 63
; FLATSCR-NEXT:    s_addk_i32 s32, 0x80
; FLATSCR-NEXT:    s_and_saveexec_b64 s[0:1], vcc
; FLATSCR-NEXT:    s_cbranch_execz .LBB3_2
; FLATSCR-NEXT:  ; %bb.1: ; %bb.0
; FLATSCR-NEXT:    s_add_i32 s2, s32, 0x1000
; FLATSCR-NEXT:    s_and_b32 s2, s2, 0xfffff000
; FLATSCR-NEXT:    v_mov_b32_e32 v4, 0
; FLATSCR-NEXT:    v_mov_b32_e32 v5, 1
; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[4:5], s2
; FLATSCR-NEXT:    v_lshl_add_u32 v2, v3, 2, s2
; FLATSCR-NEXT:    scratch_load_dword v2, v2, off
; FLATSCR-NEXT:    v_and_b32_e32 v3, 0x3ff, v31
; FLATSCR-NEXT:    s_mov_b32 s32, s2
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    v_add_u32_e32 v2, v2, v3
; FLATSCR-NEXT:    global_store_dword v[0:1], v2, off
; FLATSCR-NEXT:  .LBB3_2: ; %bb.1
; FLATSCR-NEXT:    s_or_b64 exec, exec, s[0:1]
; FLATSCR-NEXT:    v_mov_b32_e32 v0, 0
; FLATSCR-NEXT:    global_store_dword v[0:1], v0, off
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    s_addk_i32 s32, 0xff80
; FLATSCR-NEXT:    s_mov_b32 s33, s3
; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
entry:
  ; Non-kernel function with a divergent (VGPR) condition guarding the
  ; alloca block; see the exec-mask manipulation in the CHECK lines above.
  %cond = icmp eq i32 %arg.cond, 0
  br i1 %cond, label %bb.0, label %bb.1

bb.0:
  ; Over-aligned (align 64) static alloca in a non-entry block; the CHECK
  ; lines above show the stack pointer being rounded up and masked to
  ; satisfy the requested alignment.
  %alloca = alloca [16 x i32], align 64, addrspace(5)
  %gep0 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 0
  %gep1 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 1
  store i32 0, i32 addrspace(5)* %gep0
  store i32 1, i32 addrspace(5)* %gep1
  ; Index the alloca with the runtime argument %in.
  %gep2 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %in
  %load = load i32, i32 addrspace(5)* %gep2
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %add = add i32 %load, %tid
  store i32 %add, i32 addrspace(1)* %out
  br label %bb.1

bb.1:
  ; Join block; the volatile store gives it an observable side effect.
  store volatile i32 0, i32 addrspace(1)* undef
  ret void
}
396
397declare i32 @llvm.amdgcn.workitem.id.x() #0
398
399attributes #0 = { nounwind readnone speculatable }
400attributes #1 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
401