1; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
2
3; CHECK-LABEL: {{^}}test_kill_depth_0_imm_pos:
4; CHECK-NEXT: ; BB#0:
5; CHECK-NEXT: s_endpgm
; Kill with a non-negative immediate (+0.0) is a no-op: per the expected
; output above, no exec manipulation is emitted and the program just ends.
define amdgpu_ps void @test_kill_depth_0_imm_pos() #0 {
  call void @llvm.AMDGPU.kill(float 0.0)
  ret void
}
10
11; CHECK-LABEL: {{^}}test_kill_depth_0_imm_neg:
12; CHECK-NEXT: ; BB#0:
13; CHECK-NEXT: s_mov_b64 exec, 0
14; CHECK-NEXT: ; BB#1:
15; CHECK-NEXT: s_endpgm
; Kill with a negative immediate (-0.0) unconditionally kills every lane:
; per the expected output above it lowers to clearing the exec mask
; (s_mov_b64 exec, 0).
define amdgpu_ps void @test_kill_depth_0_imm_neg() #0 {
  call void @llvm.AMDGPU.kill(float -0.0)
  ret void
}
20
21; FIXME: Ideally only one would be emitted
22; CHECK-LABEL: {{^}}test_kill_depth_0_imm_neg_x2:
23; CHECK-NEXT: ; BB#0:
24; CHECK-NEXT: s_mov_b64 exec, 0
25; CHECK-NEXT: ; BB#1:
26; CHECK-NEXT: s_mov_b64 exec, 0
27; CHECK-NEXT: ; BB#2:
28; CHECK-NEXT: s_endpgm
; Two back-to-back unconditional kills. Currently both lower to an exec
; clear (see the FIXME above: ideally the second would be folded away).
define amdgpu_ps void @test_kill_depth_0_imm_neg_x2() #0 {
  call void @llvm.AMDGPU.kill(float -0.0)
  call void @llvm.AMDGPU.kill(float -1.0)
  ret void
}
34
35; CHECK-LABEL: {{^}}test_kill_depth_var:
36; CHECK-NEXT: ; BB#0:
37; CHECK-NEXT: v_cmpx_le_f32_e32 vcc, 0, v0
38; CHECK-NEXT: ; BB#1:
39; CHECK-NEXT: s_endpgm
; Kill on a variable operand: per the expected output above it lowers to
; v_cmpx_le_f32 (exec &= (0 <= %x)), killing lanes where %x is negative.
define amdgpu_ps void @test_kill_depth_var(float %x) #0 {
  call void @llvm.AMDGPU.kill(float %x)
  ret void
}
44
45; FIXME: Ideally only one would be emitted
46; CHECK-LABEL: {{^}}test_kill_depth_var_x2_same:
47; CHECK-NEXT: ; BB#0:
48; CHECK-NEXT: v_cmpx_le_f32_e32 vcc, 0, v0
49; CHECK-NEXT: ; BB#1:
50; CHECK-NEXT: v_cmpx_le_f32_e32 vcc, 0, v0
51; CHECK-NEXT: ; BB#2:
52; CHECK-NEXT: s_endpgm
; Two kills on the same variable value. Both v_cmpx compares are currently
; emitted (see the FIXME above: the second is redundant and ideally folded).
define amdgpu_ps void @test_kill_depth_var_x2_same(float %x) #0 {
  call void @llvm.AMDGPU.kill(float %x)
  call void @llvm.AMDGPU.kill(float %x)
  ret void
}
58
59; CHECK-LABEL: {{^}}test_kill_depth_var_x2:
60; CHECK-NEXT: ; BB#0:
61; CHECK-NEXT: v_cmpx_le_f32_e32 vcc, 0, v0
62; CHECK-NEXT: ; BB#1:
63; CHECK-NEXT: v_cmpx_le_f32_e32 vcc, 0, v1
64; CHECK-NEXT: ; BB#2:
65; CHECK-NEXT: s_endpgm
; Two kills on distinct variable values: one v_cmpx per operand (v0, v1)
; is expected, with no skip branch in between.
define amdgpu_ps void @test_kill_depth_var_x2(float %x, float %y) #0 {
  call void @llvm.AMDGPU.kill(float %x)
  call void @llvm.AMDGPU.kill(float %y)
  ret void
}
71
72; CHECK-LABEL: {{^}}test_kill_depth_var_x2_instructions:
73; CHECK-NEXT: ; BB#0:
74; CHECK-NEXT: v_cmpx_le_f32_e32 vcc, 0, v0
75; CHECK-NEXT: ; BB#1:
76; CHECK: v_mov_b32_e64 v7, -1
77; CHECK: v_cmpx_le_f32_e32 vcc, 0, v7
78; CHECK-NEXT: ; BB#2:
79; CHECK-NEXT: s_endpgm
; Two variable kills separated by a real instruction. The inline asm pins
; the second kill's operand to a known register (v7) so the expected-output
; lines can name it.
define amdgpu_ps void @test_kill_depth_var_x2_instructions(float %x) #0 {
  call void @llvm.AMDGPU.kill(float %x)
  %y = call float asm sideeffect "v_mov_b32_e64 v7, -1", "={VGPR7}"()
  call void @llvm.AMDGPU.kill(float %y)
  ret void
}
86
87; FIXME: why does the skip depend on the asm length in the same block?
88
89; CHECK-LABEL: {{^}}test_kill_control_flow:
90; CHECK: s_cmp_lg_u32 s{{[0-9]+}}, 0
91; CHECK: s_cbranch_scc1 [[RETURN_BB:BB[0-9]+_[0-9]+]]
92
93; CHECK-NEXT: ; BB#1:
94; CHECK: v_mov_b32_e64 v7, -1
95; CHECK: v_nop_e64
96; CHECK: v_nop_e64
97; CHECK: v_nop_e64
98; CHECK: v_nop_e64
99; CHECK: v_nop_e64
100; CHECK: v_nop_e64
101; CHECK: v_nop_e64
102; CHECK: v_nop_e64
103; CHECK: v_nop_e64
104; CHECK: v_nop_e64
105
106; CHECK: v_cmpx_le_f32_e32 vcc, 0, v7
107; CHECK-NEXT: s_cbranch_execnz [[SPLIT_BB:BB[0-9]+_[0-9]+]]
108; CHECK-NEXT: ; BB#2:
109; CHECK-NEXT: exp null off, off, off, off done vm
110; CHECK-NEXT: s_endpgm
111
112; CHECK-NEXT: {{^}}[[SPLIT_BB]]:
113; CHECK-NEXT: s_endpgm
; Kill inside a conditionally-executed block. Per the expected output above,
; the kill is followed by s_cbranch_execnz: when every lane is dead the wave
; falls through to a null export + s_endpgm instead of continuing to %exit.
define amdgpu_ps void @test_kill_control_flow(i32 inreg %arg) #0 {
entry:
  %cmp = icmp eq i32 %arg, 0
  br i1 %cmp, label %bb, label %exit

bb:
  ; Long asm blob: makes the killed block non-trivially sized (see the
  ; FIXME above about the skip depending on asm length in the same block).
  %var = call float asm sideeffect "
    v_mov_b32_e64 v7, -1
    v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64", "={VGPR7}"()
  call void @llvm.AMDGPU.kill(float %var)
  br label %exit

exit:
  ret void
}
138
139; CHECK-LABEL: {{^}}test_kill_control_flow_remainder:
140; CHECK: s_cmp_lg_u32 s{{[0-9]+}}, 0
141; CHECK-NEXT: v_mov_b32_e32 v{{[0-9]+}}, 0
142; CHECK-NEXT: s_cbranch_scc1 [[RETURN_BB:BB[0-9]+_[0-9]+]]
143
144; CHECK-NEXT: ; BB#1: ; %bb
145; CHECK: v_mov_b32_e64 v7, -1
146; CHECK: v_nop_e64
147; CHECK: v_nop_e64
148; CHECK: v_nop_e64
149; CHECK: v_nop_e64
150; CHECK: v_nop_e64
151; CHECK: v_nop_e64
152; CHECK: v_nop_e64
153; CHECK: v_nop_e64
154; CHECK: ;;#ASMEND
155; CHECK: v_mov_b32_e64 v8, -1
156; CHECK: ;;#ASMEND
157; CHECK: v_cmpx_le_f32_e32 vcc, 0, v7
158; CHECK-NEXT: s_cbranch_execnz [[SPLIT_BB:BB[0-9]+_[0-9]+]]
159
160; CHECK-NEXT: ; BB#2:
161; CHECK-NEXT: exp null off, off, off, off done vm
162; CHECK-NEXT: s_endpgm
163
164; CHECK-NEXT: {{^}}[[SPLIT_BB]]:
165; CHECK: buffer_store_dword v8
166; CHECK: v_mov_b32_e64 v9, -2
167
168; CHECK: {{^}}BB{{[0-9]+_[0-9]+}}:
169; CHECK: buffer_store_dword v9
170; CHECK-NEXT: s_endpgm
; Like test_kill_control_flow, but the killed block has live values that
; must survive the split: %live.across is used after the kill in the same
; block, and %live.out reaches %exit through a phi. The expected output
; above checks both stores land after the split point.
define amdgpu_ps void @test_kill_control_flow_remainder(i32 inreg %arg) #0 {
entry:
  %cmp = icmp eq i32 %arg, 0
  br i1 %cmp, label %bb, label %exit

bb:
  %var = call float asm sideeffect "
    v_mov_b32_e64 v7, -1
    v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64", "={VGPR7}"()
  ; Defined before the kill, used after it (store below).
  %live.across = call float asm sideeffect "v_mov_b32_e64 v8, -1", "={VGPR8}"()
  call void @llvm.AMDGPU.kill(float %var)
  store volatile float %live.across, float addrspace(1)* undef
  ; Defined after the kill, live out of the block via the phi in %exit.
  %live.out = call float asm sideeffect "v_mov_b32_e64 v9, -2", "={VGPR9}"()
  br label %exit

exit:
  %phi = phi float [ 0.0, %entry ], [ %live.out, %bb ]
  store float %phi, float addrspace(1)* undef
  ret void
}
201
202; CHECK-LABEL: {{^}}test_kill_divergent_loop:
203; CHECK: v_cmp_eq_u32_e32 vcc, 0, v0
204; CHECK-NEXT: s_and_saveexec_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], vcc
205; CHECK-NEXT: s_xor_b64 [[SAVEEXEC]], exec, [[SAVEEXEC]]
206; CHECK-NEXT: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]]
207; CHECK-NEXT: s_cbranch_execz [[EXIT]]
208
209; CHECK: {{BB[0-9]+_[0-9]+}}: ; %bb.preheader
210; CHECK: s_mov_b32
211
212; CHECK: [[LOOP_BB:BB[0-9]+_[0-9]+]]:
213
214; CHECK: v_mov_b32_e64 v7, -1
215; CHECK: v_nop_e64
216; CHECK: v_cmpx_le_f32_e32 vcc, 0, v7
217
218; CHECK-NEXT: ; BB#3:
219; CHECK: buffer_load_dword [[LOAD:v[0-9]+]]
220; CHECK: v_cmp_eq_u32_e32 vcc, 0, [[LOAD]]
221; CHECK-NEXT: s_and_b64 vcc, exec, vcc
222; CHECK-NEXT: s_cbranch_vccnz [[LOOP_BB]]
223
224; CHECK-NEXT: {{^}}[[EXIT]]:
225; CHECK: s_or_b64 exec, exec, [[SAVEEXEC]]
226; CHECK: buffer_store_dword
227; CHECK: s_endpgm
; Kill inside the body of a divergent loop. Per the expected output above,
; entering the loop goes through s_and_saveexec/s_xor, the kill's v_cmpx sits
; inside the loop, and %exit restores exec with s_or_b64 before the store.
define amdgpu_ps void @test_kill_divergent_loop(i32 %arg) #0 {
entry:
  %cmp = icmp eq i32 %arg, 0
  br i1 %cmp, label %bb, label %exit

bb:
  %var = call float asm sideeffect "
    v_mov_b32_e64 v7, -1
    v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64
    v_nop_e64", "={VGPR7}"()
  call void @llvm.AMDGPU.kill(float %var)
  ; Divergent backedge: loop while the loaded value is 0.
  %vgpr = load volatile i32, i32 addrspace(1)* undef
  %loop.cond = icmp eq i32 %vgpr, 0
  br i1 %loop.cond, label %bb, label %exit

exit:
  store volatile i32 8, i32 addrspace(1)* undef
  ret void
}
255
256; bug 28550
257; CHECK-LABEL: {{^}}phi_use_def_before_kill:
258; CHECK: v_cndmask_b32_e64 [[PHIREG:v[0-9]+]], 0, -1.0,
259; CHECK: v_cmpx_le_f32_e32 vcc, 0,
260; CHECK-NEXT: s_cbranch_execnz [[BB4:BB[0-9]+_[0-9]+]]
261
262; CHECK: exp
263; CHECK-NEXT: s_endpgm
264
265; CHECK: [[KILLBB:BB[0-9]+_[0-9]+]]:
266; CHECK-NEXT: s_cbranch_scc0 [[PHIBB:BB[0-9]+_[0-9]+]]
267
268; CHECK: [[PHIBB]]:
269; CHECK: v_cmp_eq_f32_e32 vcc, 0, [[PHIREG]]
270; CHECK-NEXT: s_cbranch_vccz [[ENDBB:BB[0-9]+_[0-9]+]]
271
272; CHECK: ; %bb10
273; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 9
274; CHECK: buffer_store_dword
275
276; CHECK: [[ENDBB]]:
277; CHECK-NEXT: s_endpgm
; Regression test for bug 28550 (noted above): %tmp2 both feeds the kill in
; %bb and is a phi input in %phibb, so its def must stay valid across the
; block split that the kill introduces.
define amdgpu_ps void @phi_use_def_before_kill() #0 {
bb:
  %tmp = fadd float undef, 1.000000e+00
  %tmp1 = fcmp olt float 0.000000e+00, %tmp
  ; Select result used by the kill below AND by the phi in %phibb.
  %tmp2 = select i1 %tmp1, float -1.000000e+00, float 0.000000e+00
  call void @llvm.AMDGPU.kill(float %tmp2)
  br i1 undef, label %phibb, label %bb8

phibb:
  %tmp5 = phi float [ %tmp2, %bb ], [ 4.0, %bb8 ]
  %tmp6 = fcmp oeq float %tmp5, 0.000000e+00
  br i1 %tmp6, label %bb10, label %end

bb8:
  store volatile i32 8, i32 addrspace(1)* undef
  br label %phibb

bb10:
  store volatile i32 9, i32 addrspace(1)* undef
  br label %end

end:
  ret void
}
302
303; CHECK-LABEL: {{^}}no_skip_no_successors:
304; CHECK: v_cmp_nge_f32
305; CHECK-NEXT: s_cbranch_vccz [[SKIPKILL:BB[0-9]+_[0-9]+]]
306
307; CHECK: ; %bb6
308; CHECK: s_mov_b64 exec, 0
309
310; CHECK: [[SKIPKILL]]:
311; CHECK: v_cmp_nge_f32_e32 vcc
312; CHECK-NEXT: BB#3: ; %bb5
313; CHECK-NEXT: .Lfunc_end{{[0-9]+}}
; The killing block (%bb6) ends in unreachable, so it has no successors.
; Per the expected output above, no skip branch should be emitted past it —
; the function must simply end after %bb5 (checked via .Lfunc_end).
define amdgpu_ps void @no_skip_no_successors(float inreg %arg, float inreg %arg1) #0 {
bb:
  %tmp = fcmp ult float %arg1, 0.000000e+00
  %tmp2 = fcmp ult float %arg, 0x3FCF5C2900000000
  br i1 %tmp, label %bb6, label %bb3

bb3:                                              ; preds = %bb
  br i1 %tmp2, label %bb5, label %bb4

bb4:                                              ; preds = %bb3
  br i1 true, label %bb5, label %bb7

bb5:                                              ; preds = %bb4, %bb3
  unreachable

bb6:                                              ; preds = %bb
  ; Unconditional kill followed by unreachable: no successor to skip to.
  call void @llvm.AMDGPU.kill(float -1.000000e+00)
  unreachable

bb7:                                              ; preds = %bb4
  ret void
}
336
337; CHECK-LABEL: {{^}}if_after_kill_block:
338; CHECK: ; BB#0:
339; CHECK: s_and_saveexec_b64
340; CHECK: s_xor_b64
341; CHECK-NEXT: mask branch [[BB4:BB[0-9]+_[0-9]+]]
342
343; CHECK: v_cmpx_le_f32_e32 vcc, 0,
344; CHECK: [[BB4]]:
345; CHECK: s_or_b64 exec, exec
346; CHECK: image_sample_c
347
348; CHECK: v_cmp_neq_f32_e32 vcc, 0,
349; CHECK: s_and_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, vcc
350; CHECK: s_xor_b64 s{{\[[0-9]+:[0-9]+\]}}, exec
351; CHECK: mask branch [[END:BB[0-9]+_[0-9]+]]
352; CHECK-NOT: branch
353
354; CHECK: BB{{[0-9]+_[0-9]+}}: ; %bb8
355; CHECK: buffer_store_dword
356
357; CHECK: [[END]]:
358; CHECK: s_or_b64 exec, exec
359; CHECK: s_endpgm
; Structured if after a kill block: exec is restored (s_or_b64) at the join
; in %bb4 before the image sample, and the later if over %bb8 must not get a
; stray skip branch (checked with the "NOT: branch" line above).
define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, <4 x i32> %arg2) #0 {
bb:
  %tmp = fcmp ult float %arg1, 0.000000e+00
  br i1 %tmp, label %bb3, label %bb4

bb3:                                              ; preds = %bb
  call void @llvm.AMDGPU.kill(float %arg)
  br label %bb4

bb4:                                              ; preds = %bb3, %bb
  %tmp5 = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> %arg2, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  %tmp6 = extractelement <4 x float> %tmp5, i32 0
  %tmp7 = fcmp une float %tmp6, 0.000000e+00
  br i1 %tmp7, label %bb8, label %bb9

bb8:                                              ; preds = %bb4
  store volatile i32 9, i32 addrspace(1)* undef
  ret void

bb9:                                              ; preds = %bb4
  ret void
}
382
; Legacy AMDGPU kill intrinsic used throughout this file: per the expected
; code above, a negative float operand clears the lane's exec bit.
declare void @llvm.AMDGPU.kill(float) #0
declare <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) nounwind

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
389