; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI %s
; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck --check-prefix=FLAT %s

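; The branch condition is computed once in %main_body, but the conditional
; branch sits inside the single-block %ENDIF loop. The autogenerated checks
; below show the break mask being accumulated with s_or_b64 and exec being
; masked with s_andn2_b64 on every iteration.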
define amdgpu_kernel void @break_inserted_outside_of_loop(i32 addrspace(1)* %out, i32 %a) {
; SI-LABEL: break_inserted_outside_of_loop:
; SI:       ; %bb.0: ; %main_body
; SI-NEXT:    s_load_dword s2, s[0:1], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, -1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_and_b32_e32 v0, s2, v0
; SI-NEXT:    v_and_b32_e32 v0, 1, v0
; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
; SI-NEXT:    s_mov_b64 s[2:3], 0
; SI-NEXT:  .LBB0_1: ; %ENDIF
; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
; SI-NEXT:    s_and_b64 s[4:5], exec, vcc
; SI-NEXT:    s_or_b64 s[2:3], s[4:5], s[2:3]
; SI-NEXT:    s_andn2_b64 exec, exec, s[2:3]
; SI-NEXT:    s_cbranch_execnz .LBB0_1
; SI-NEXT:  ; %bb.2: ; %ENDLOOP
; SI-NEXT:    s_or_b64 exec, exec, s[2:3]
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_mov_b32_e32 v0, 0
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; FLAT-LABEL: break_inserted_outside_of_loop:
; FLAT:       ; %bb.0: ; %main_body
; FLAT-NEXT:    s_load_dword s2, s[0:1], 0x2c
; FLAT-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
; FLAT-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
; FLAT-NEXT:    v_and_b32_e32 v0, s2, v0
; FLAT-NEXT:    v_and_b32_e32 v0, 1, v0
; FLAT-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
; FLAT-NEXT:    s_mov_b64 s[2:3], 0
; FLAT-NEXT:  .LBB0_1: ; %ENDIF
; FLAT-NEXT:    ; =>This Inner Loop Header: Depth=1
; FLAT-NEXT:    s_and_b64 s[4:5], exec, vcc
; FLAT-NEXT:    s_or_b64 s[2:3], s[4:5], s[2:3]
; FLAT-NEXT:    s_andn2_b64 exec, exec, s[2:3]
; FLAT-NEXT:    s_cbranch_execnz .LBB0_1
; FLAT-NEXT:  ; %bb.2: ; %ENDLOOP
; FLAT-NEXT:    s_or_b64 exec, exec, s[2:3]
; FLAT-NEXT:    s_mov_b32 s3, 0xf000
; FLAT-NEXT:    s_mov_b32 s2, -1
; FLAT-NEXT:    v_mov_b32_e32 v0, 0
; FLAT-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; FLAT-NEXT:    s_endpgm
main_body:
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %0 = and i32 %a, %tid
  %1 = trunc i32 %0 to i1
  br label %ENDIF

ENDLOOP:
  store i32 0, i32 addrspace(1)* %out
  ret void

ENDIF:
  br i1 %1, label %ENDLOOP, label %ENDIF
}

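; The loop condition is a phi of values defined outside the loop. The checks
; show it being materialized into an SGPR pair across the if/else diamond and
; then consumed by the usual s_or_b64/s_andn2_b64 loop-exit sequence.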
define amdgpu_kernel void @phi_cond_outside_loop(i32 %b) {
; SI-LABEL: phi_cond_outside_loop:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, -1, 0
; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT:    s_mov_b64 s[2:3], 0
; SI-NEXT:    s_mov_b64 s[4:5], 0
; SI-NEXT:    s_and_saveexec_b64 s[6:7], vcc
; SI-NEXT:    s_cbranch_execz .LBB1_2
; SI-NEXT:  ; %bb.1: ; %else
; SI-NEXT:    s_load_dword s0, s[0:1], 0x9
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_cmp_eq_u32 s0, 0
; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; SI-NEXT:    s_and_b64 s[4:5], s[0:1], exec
; SI-NEXT:  .LBB1_2: ; %endif
; SI-NEXT:    s_or_b64 exec, exec, s[6:7]
; SI-NEXT:  .LBB1_3: ; %loop
; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
; SI-NEXT:    s_and_b64 s[0:1], exec, s[4:5]
; SI-NEXT:    s_or_b64 s[2:3], s[0:1], s[2:3]
; SI-NEXT:    s_andn2_b64 exec, exec, s[2:3]
; SI-NEXT:    s_cbranch_execnz .LBB1_3
; SI-NEXT:  ; %bb.4: ; %exit
; SI-NEXT:    s_endpgm
;
; FLAT-LABEL: phi_cond_outside_loop:
; FLAT:       ; %bb.0: ; %entry
; FLAT-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
; FLAT-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
; FLAT-NEXT:    s_mov_b64 s[2:3], 0
; FLAT-NEXT:    s_mov_b64 s[4:5], 0
; FLAT-NEXT:    s_and_saveexec_b64 s[6:7], vcc
; FLAT-NEXT:    s_cbranch_execz .LBB1_2
; FLAT-NEXT:  ; %bb.1: ; %else
; FLAT-NEXT:    s_load_dword s0, s[0:1], 0x24
; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
; FLAT-NEXT:    s_cmp_eq_u32 s0, 0
; FLAT-NEXT:    s_cselect_b64 s[0:1], -1, 0
; FLAT-NEXT:    s_and_b64 s[4:5], s[0:1], exec
; FLAT-NEXT:  .LBB1_2: ; %endif
; FLAT-NEXT:    s_or_b64 exec, exec, s[6:7]
; FLAT-NEXT:  .LBB1_3: ; %loop
; FLAT-NEXT:    ; =>This Inner Loop Header: Depth=1
; FLAT-NEXT:    s_and_b64 s[0:1], exec, s[4:5]
; FLAT-NEXT:    s_or_b64 s[2:3], s[0:1], s[2:3]
; FLAT-NEXT:    s_andn2_b64 exec, exec, s[2:3]
; FLAT-NEXT:    s_cbranch_execnz .LBB1_3
; FLAT-NEXT:  ; %bb.4: ; %exit
; FLAT-NEXT:    s_endpgm
entry:
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %0 = icmp eq i32 %tid, 0
  br i1 %0, label %if, label %else

if:
  br label %endif

else:
  %1 = icmp eq i32 %b, 0
  br label %endif

endif:
  %2 = phi i1 [0, %if], [%1, %else]
  br label %loop

loop:
  br i1 %2, label %exit, label %loop

exit:
  ret void
}

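; Every switch successor is unreachable, so no instructions should be emitted;
; the checks only match the kernel label and the %centry block comment.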
define amdgpu_kernel void @switch_unreachable(i32 addrspace(1)* %g, i8 addrspace(3)* %l, i32 %x) nounwind {
; SI-LABEL: switch_unreachable:
; SI:       ; %bb.0: ; %centry
;
; FLAT-LABEL: switch_unreachable:
; FLAT:       ; %bb.0: ; %centry
centry:
  switch i32 %x, label %sw.default [
    i32 0, label %sw.bb
    i32 60, label %sw.bb
  ]

sw.bb:
  unreachable

sw.default:
  unreachable

sw.epilog:
  ret void
}

declare float @llvm.fabs.f32(float) nounwind readnone

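; A nest of loops and guarded exits that, judging by the function name, used to
; trip an assertion in loop info handling during control-flow annotation. The
; checks only pin down that a structured lowering is produced without crashing.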
define amdgpu_kernel void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32 %c3, i32 %x, i32 %y, i1 %arg) nounwind {
; SI-LABEL: loop_land_info_assert:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
; SI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x9
; SI-NEXT:    s_load_dword s14, s[0:1], 0xc
; SI-NEXT:    s_brev_b32 s8, 44
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_cmp_lt_i32 s2, 1
; SI-NEXT:    s_cselect_b64 s[4:5], -1, 0
; SI-NEXT:    s_cmp_lt_i32 s3, 4
; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; SI-NEXT:    s_cmp_gt_i32 s3, 3
; SI-NEXT:    s_cselect_b64 s[2:3], -1, 0
; SI-NEXT:    s_and_b64 s[2:3], s[4:5], s[2:3]
; SI-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
; SI-NEXT:    s_and_b64 s[2:3], exec, s[2:3]
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s8
; SI-NEXT:    s_and_b64 s[4:5], exec, s[4:5]
; SI-NEXT:    v_mov_b32_e32 v0, 3
; SI-NEXT:    s_branch .LBB3_4
; SI-NEXT:  .LBB3_1: ; %Flow6
; SI-NEXT:    ; in Loop: Header=BB3_4 Depth=1
; SI-NEXT:    s_mov_b64 s[8:9], 0
; SI-NEXT:  .LBB3_2: ; %Flow5
; SI-NEXT:    ; in Loop: Header=BB3_4 Depth=1
; SI-NEXT:    s_mov_b64 s[12:13], 0
; SI-NEXT:  .LBB3_3: ; %Flow
; SI-NEXT:    ; in Loop: Header=BB3_4 Depth=1
; SI-NEXT:    s_and_b64 vcc, exec, s[10:11]
; SI-NEXT:    s_cbranch_vccnz .LBB3_8
; SI-NEXT:  .LBB3_4: ; %while.cond
; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
; SI-NEXT:    s_mov_b64 s[12:13], -1
; SI-NEXT:    s_mov_b64 s[8:9], -1
; SI-NEXT:    s_mov_b64 s[10:11], -1
; SI-NEXT:    s_mov_b64 vcc, s[0:1]
; SI-NEXT:    s_cbranch_vccz .LBB3_3
; SI-NEXT:  ; %bb.5: ; %convex.exit
; SI-NEXT:    ; in Loop: Header=BB3_4 Depth=1
; SI-NEXT:    s_mov_b64 s[8:9], -1
; SI-NEXT:    s_mov_b64 s[10:11], -1
; SI-NEXT:    s_mov_b64 vcc, s[2:3]
; SI-NEXT:    s_cbranch_vccz .LBB3_2
; SI-NEXT:  ; %bb.6: ; %if.end
; SI-NEXT:    ; in Loop: Header=BB3_4 Depth=1
; SI-NEXT:    s_mov_b64 s[10:11], -1
; SI-NEXT:    s_mov_b64 vcc, s[4:5]
; SI-NEXT:    s_cbranch_vccz .LBB3_1
; SI-NEXT:  ; %bb.7: ; %if.else
; SI-NEXT:    ; in Loop: Header=BB3_4 Depth=1
; SI-NEXT:    s_mov_b64 s[10:11], 0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_branch .LBB3_1
; SI-NEXT:  .LBB3_8: ; %loop.exit.guard4
; SI-NEXT:    ; in Loop: Header=BB3_4 Depth=1
; SI-NEXT:    s_and_b64 vcc, exec, s[8:9]
; SI-NEXT:    s_cbranch_vccz .LBB3_4
; SI-NEXT:  ; %bb.9: ; %loop.exit.guard
; SI-NEXT:    s_and_b64 vcc, exec, s[12:13]
; SI-NEXT:    s_cbranch_vccz .LBB3_13
; SI-NEXT:  ; %bb.10: ; %for.cond.preheader
; SI-NEXT:    s_cmpk_lt_i32 s14, 0x3e8
; SI-NEXT:    s_cbranch_scc0 .LBB3_13
; SI-NEXT:  ; %bb.11: ; %for.body
; SI-NEXT:    s_and_b64 vcc, exec, 0
; SI-NEXT:  .LBB3_12: ; %self.loop
; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
; SI-NEXT:    s_mov_b64 vcc, vcc
; SI-NEXT:    s_cbranch_vccz .LBB3_12
; SI-NEXT:  .LBB3_13: ; %DummyReturnBlock
; SI-NEXT:    s_endpgm
;
; FLAT-LABEL: loop_land_info_assert:
; FLAT:       ; %bb.0: ; %entry
; FLAT-NEXT:    s_mov_b32 s7, 0xf000
; FLAT-NEXT:    s_mov_b32 s6, -1
; FLAT-NEXT:    buffer_load_dword v0, off, s[4:7], 0
; FLAT-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
; FLAT-NEXT:    s_load_dword s14, s[0:1], 0x30
; FLAT-NEXT:    s_brev_b32 s8, 44
; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
; FLAT-NEXT:    s_cmp_lt_i32 s2, 1
; FLAT-NEXT:    s_cselect_b64 s[4:5], -1, 0
; FLAT-NEXT:    s_cmp_lt_i32 s3, 4
; FLAT-NEXT:    s_cselect_b64 s[0:1], -1, 0
; FLAT-NEXT:    s_cmp_gt_i32 s3, 3
; FLAT-NEXT:    s_cselect_b64 s[2:3], -1, 0
; FLAT-NEXT:    s_and_b64 s[2:3], s[4:5], s[2:3]
; FLAT-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
; FLAT-NEXT:    s_and_b64 s[2:3], exec, s[2:3]
; FLAT-NEXT:    s_waitcnt vmcnt(0)
; FLAT-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s8
; FLAT-NEXT:    s_and_b64 s[4:5], exec, s[4:5]
; FLAT-NEXT:    v_mov_b32_e32 v0, 3
; FLAT-NEXT:    s_branch .LBB3_4
; FLAT-NEXT:  .LBB3_1: ; %Flow6
; FLAT-NEXT:    ; in Loop: Header=BB3_4 Depth=1
; FLAT-NEXT:    s_mov_b64 s[8:9], 0
; FLAT-NEXT:  .LBB3_2: ; %Flow5
; FLAT-NEXT:    ; in Loop: Header=BB3_4 Depth=1
; FLAT-NEXT:    s_mov_b64 s[12:13], 0
; FLAT-NEXT:  .LBB3_3: ; %Flow
; FLAT-NEXT:    ; in Loop: Header=BB3_4 Depth=1
; FLAT-NEXT:    s_and_b64 vcc, exec, s[10:11]
; FLAT-NEXT:    s_cbranch_vccnz .LBB3_8
; FLAT-NEXT:  .LBB3_4: ; %while.cond
; FLAT-NEXT:    ; =>This Inner Loop Header: Depth=1
; FLAT-NEXT:    s_mov_b64 s[12:13], -1
; FLAT-NEXT:    s_mov_b64 s[8:9], -1
; FLAT-NEXT:    s_mov_b64 s[10:11], -1
; FLAT-NEXT:    s_mov_b64 vcc, s[0:1]
; FLAT-NEXT:    s_cbranch_vccz .LBB3_3
; FLAT-NEXT:  ; %bb.5: ; %convex.exit
; FLAT-NEXT:    ; in Loop: Header=BB3_4 Depth=1
; FLAT-NEXT:    s_mov_b64 s[8:9], -1
; FLAT-NEXT:    s_mov_b64 s[10:11], -1
; FLAT-NEXT:    s_mov_b64 vcc, s[2:3]
; FLAT-NEXT:    s_cbranch_vccz .LBB3_2
; FLAT-NEXT:  ; %bb.6: ; %if.end
; FLAT-NEXT:    ; in Loop: Header=BB3_4 Depth=1
; FLAT-NEXT:    s_mov_b64 s[10:11], -1
; FLAT-NEXT:    s_mov_b64 vcc, s[4:5]
; FLAT-NEXT:    s_cbranch_vccz .LBB3_1
; FLAT-NEXT:  ; %bb.7: ; %if.else
; FLAT-NEXT:    ; in Loop: Header=BB3_4 Depth=1
; FLAT-NEXT:    s_mov_b64 s[10:11], 0
; FLAT-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; FLAT-NEXT:    s_waitcnt vmcnt(0)
; FLAT-NEXT:    s_branch .LBB3_1
; FLAT-NEXT:  .LBB3_8: ; %loop.exit.guard4
; FLAT-NEXT:    ; in Loop: Header=BB3_4 Depth=1
; FLAT-NEXT:    s_and_b64 vcc, exec, s[8:9]
; FLAT-NEXT:    s_cbranch_vccz .LBB3_4
; FLAT-NEXT:  ; %bb.9: ; %loop.exit.guard
; FLAT-NEXT:    s_and_b64 vcc, exec, s[12:13]
; FLAT-NEXT:    s_cbranch_vccz .LBB3_13
; FLAT-NEXT:  ; %bb.10: ; %for.cond.preheader
; FLAT-NEXT:    s_cmpk_lt_i32 s14, 0x3e8
; FLAT-NEXT:    s_cbranch_scc0 .LBB3_13
; FLAT-NEXT:  ; %bb.11: ; %for.body
; FLAT-NEXT:    s_and_b64 vcc, exec, 0
; FLAT-NEXT:  .LBB3_12: ; %self.loop
; FLAT-NEXT:    ; =>This Inner Loop Header: Depth=1
; FLAT-NEXT:    s_mov_b64 vcc, vcc
; FLAT-NEXT:    s_cbranch_vccz .LBB3_12
; FLAT-NEXT:  .LBB3_13: ; %DummyReturnBlock
; FLAT-NEXT:    s_endpgm
entry:
  %cmp = icmp sgt i32 %c0, 0
  br label %while.cond.outer

while.cond.outer:
  %tmp = load float, float addrspace(1)* undef
  br label %while.cond

while.cond:
  %cmp1 = icmp slt i32 %c1, 4
  br i1 %cmp1, label %convex.exit, label %for.cond

convex.exit:
  %or = or i1 %cmp, %cmp1
  br i1 %or, label %return, label %if.end

if.end:
  %tmp3 = call float @llvm.fabs.f32(float %tmp) nounwind readnone
  %cmp2 = fcmp olt float %tmp3, 0x3E80000000000000
  br i1 %cmp2, label %if.else, label %while.cond.outer

if.else:
  store volatile i32 3, i32 addrspace(1)* undef, align 4
  br label %while.cond

for.cond:
  %cmp3 = icmp slt i32 %c3, 1000
  br i1 %cmp3, label %for.body, label %return

for.body:
  br i1 %cmp3, label %self.loop, label %if.end.2

if.end.2:
  %or.cond2 = or i1 %cmp3, %arg
  br i1 %or.cond2, label %return, label %for.cond

self.loop:
  br label %self.loop

return:
  ret void
}

declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0

attributes #0 = { nounwind readnone }
